dsdatawiz aims to make it easier to work with datasets in R by
providing a set of functions to help with data wrangling and
aggregation. This package is also designed to provide tooltips which
might be useful in other packages or applications in Datasketch’s
ecosystem for data visualization and analysis.
This package provides a single function, wrap_sort_data(), with which
you can perform a variety of data wrangling operations. It
lets you sort the numeric columns of a data frame and wrap the text
columns so that long text is displayed in a more readable way. There are
also options to sort the data frame by categorical columns and even
to select a custom order for them. Finally, you can also specify the
number of rows to display.
The aggregate_data() function lets you aggregate a data frame by
grouping it by the levels of its categorical columns and summarizing the
numeric columns with a variety of summary functions, such as mean,
median, and sum, among others.
When aggregating the data using the aggregate_data() function, the
resulting data frame will contain an additional column called ..labels
which contains tooltips based on a template specified by the user (or a
default one if none is provided). These tooltips can be used in other
packages or applications to provide additional information about the
data.
You can install dsdatawiz from GitHub with:
# install.packages("devtools")
devtools::install_github("datasketch/dsdatawiz")

# Aggregating the dataset iris by the species column
# and counting the number of rows for each species
aggregate_data(
data = iris,
group_vars = "species",
var_num_to_agg = NULL,
agg = "count"
)
#> List of 1
#> $ agg: chr "count"
#> NULL
#> NULL
#> Joining with `by = join_by(species)`
#> # A tibble: 3 × 3
#> species Conteo ..labels
#> <chr> <int> <html>
#> 1 setosa 50 <b>Species:</b> setosa<br/><b>Conteo:</b> 50
#> 2 versicolor 50 <b>Species:</b> versicolor<br/><b>Conteo:</b> 50
#> 3 virginica 50 <b>Species:</b> virginica<br/><b>Conteo:</b> 50
# Same as above but using hdtable and aggregating by max
hd <- hdtable(iris)
aggregate_data(
data = hd$data,
dic = hd$dic,
group_vars = "species",
var_num_to_agg = NULL,
agg = "max"
)
#> List of 1
#> $ agg: chr "max"
#> NULL
#> NULL
#> Joining with `by = join_by(species)`
#> # A tibble: 3 × 3
#> species Conteo ..labels
#> <chr> <int> <html>
#> 1 setosa 50 <b>Species:</b> setosa<br/><b>Conteo:</b> 50
#> 2 versicolor 50 <b>Species:</b> versicolor<br/><b>Conteo:</b> 50
#> 3 virginica 50 <b>Species:</b> virginica<br/><b>Conteo:</b> 50
# Aggregating the dataset starwars by the sex and gender columns
# and by the mean of the height column for each group, also
# providing custom format options and a tooltip template
hd2 <- hdtable(starwars)
aggregate_data(
data = hd2$data,
dic = hd2$dic,
group_vars = c("sex", "gender"),
var_num_to_agg = c("height"),
agg = "mean",
agg_na_rm = TRUE,
agg_text = "mean_height",
format_sample_cat = "UPPER",
format_sample_num = "1234",
na_label = "unknown",
tooltip_template = "<b>Sex:<b> {sex}<br/><b>Gender:<b> {gender}<br/><b>Mean Height:<b> {mean_height}"
)
#> List of 7
#> $ agg : chr "mean"
#> $ agg_na_rm : logi TRUE
#> $ agg_text : chr "mean_height"
#> $ format_sample_cat: chr "UPPER"
#> $ format_sample_num: chr "1234"
#> $ na_label : chr "unknown"
#> $ tooltip_template : chr "<b>Sex:<b> {sex}<br/><b>Gender:<b> {gender}<br/><b>Mean Height:<b> {mean_height}"
#> NULL
#> chr "<b>Sex:<b> {sex}<br/><b>Gender:<b> {gender}<br/><b>Mean Height:<b> {mean_height}"
#> Joining with `by = join_by(sex, gender)`
#> # A tibble: 6 × 4
#> # Groups: sex [5]
#> sex gender mean_height ..labels
#> <chr> <chr> <dbl> <html>
#> 1 female FEMININE 172. <b>Sex:<b> female<br/><b>Gender:<b> FEMI…
#> 2 hermaphroditic MASCULINE 175 <b>Sex:<b> hermaphroditic<br/><b>Gender:…
#> 3 male MASCULINE 179. <b>Sex:<b> male<br/><b>Gender:<b> MASCUL…
#> 4 none FEMININE 96 <b>Sex:<b> none<br/><b>Gender:<b> FEMINI…
#> 5 none MASCULINE 140 <b>Sex:<b> none<br/><b>Gender:<b> MASCUL…
#> 6 unknown UNKNOWN 175 <b>Sex:<b> unknown<br/><b>Gender:<b> UNK…

# Group iris dataset by species and sort it by sepal_length
# in ascending and descending order
wrap_sort_data(data = iris,
var_cat_order = "Species",
var_num_sort = "Sepal.Length",
sort = "asc")
#> List of 1
#> $ sort: chr "asc"
#> NULL
#> NULL
#> # A tibble: 150 × 5
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 4.3 3 1.1 0.1 setosa
#> 2 4.4 2.9 1.4 0.2 setosa
#> 3 4.4 3 1.3 0.2 setosa
#> 4 4.4 3.2 1.3 0.2 setosa
#> 5 4.5 2.3 1.3 0.3 setosa
#> 6 4.6 3.1 1.5 0.2 setosa
#> 7 4.6 3.4 1.4 0.3 setosa
#> 8 4.6 3.6 1 0.2 setosa
#> 9 4.6 3.2 1.4 0.2 setosa
#> 10 4.7 3.2 1.3 0.2 setosa
#> # ℹ 140 more rows
hd <- hdtable(iris)
wrap_sort_data(data = hd$data,
var_cat_order = "species",
var_num_sort = "sepal_length",
sort = "desc")
#> List of 1
#> $ sort: chr "desc"
#> NULL
#> NULL
#> # A tibble: 150 × 5
#> sepal_length sepal_width petal_length petal_width species
#> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 5.8 4 1.2 0.2 setosa
#> 2 5.7 4.4 1.5 0.4 setosa
#> 3 5.7 3.8 1.7 0.3 setosa
#> 4 5.5 4.2 1.4 0.2 setosa
#> 5 5.5 3.5 1.3 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
#> 7 5.4 3.7 1.5 0.2 setosa
#> 8 5.4 3.9 1.3 0.4 setosa
#> 9 5.4 3.4 1.7 0.2 setosa
#> 10 5.4 3.4 1.5 0.4 setosa
#> # ℹ 140 more rows
# Group the starwars dataset by sex and sort it by height, slice the data
# to 10 rows per group, order sex with a custom order that puts
# "hermaphroditic" first, and finally wrap the column name to one word per line.
hd2 <- hdtable(starwars)
wrap_sort_data(data = hd2$data,
var_cat_order = "sex",
var_num_sort = "height",
order_var1 = "hermaphroditic",
sort = "asc",
slice_n = 10,
axis_text_wrap = 1)
#> List of 4
#> $ order_var1 : chr "hermaphroditic"
#> $ sort : chr "asc"
#> $ slice_n : num 10
#> $ axis_text_wrap: num 1
#> NULL
#> NULL
#> # A tibble: 31 × 14
#> name height mass hair_color skin_color eye_color birth_year sex gender
#> <chr> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr>
#> 1 Jabba … 175 1358 <NA> green-tan… orange 600 herm… mascu…
#> 2 Leia O… 150 49 brown light brown 19 fema… femin…
#> 3 Mon Mo… 150 NA auburn fair blue 48 fema… femin…
#> 4 Shmi S… 163 NA black fair brown 72 fema… femin…
#> 5 Beru W… 165 75 brown light blue 47 fema… femin…
#> 6 Dormé 165 NA brown light brown NA fema… femin…
#> 7 Barris… 166 50 black yellow blue 40 fema… femin…
#> 8 Jocast… 167 NA white fair blue NA fema… femin…
#> 9 Zam We… 168 55 blonde fair, gre… yellow NA fema… femin…
#> 10 Lumina… 170 56.2 black yellow blue 58 fema… femin…
#> # ℹ 21 more rows
#> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
#> # vehicles <list>, starships <list>