% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/collapse_groups_by.R
\name{collapse_groups_by}
\alias{collapse_groups_by}
\alias{collapse_groups_by_size}
\alias{collapse_groups_by_numeric}
\alias{collapse_groups_by_levels}
\alias{collapse_groups_by_ids}
\title{Collapse groups balanced by a single attribute}
\usage{
collapse_groups_by_size(
  data,
  n,
  group_cols,
  auto_tune = TRUE,
  method = "balance",
  col_name = ".coll_groups",
  parallel = FALSE,
  verbose = FALSE
)

collapse_groups_by_numeric(
  data,
  n,
  group_cols,
  num_cols,
  balance_size = FALSE,
  auto_tune = TRUE,
  method = "balance",
  group_aggregation_fn = mean,
  col_name = ".coll_groups",
  parallel = FALSE,
  verbose = FALSE
)

collapse_groups_by_levels(
  data,
  n,
  group_cols,
  cat_cols,
  cat_levels = NULL,
  balance_size = FALSE,
  auto_tune = TRUE,
  method = "balance",
  col_name = ".coll_groups",
  parallel = FALSE,
  verbose = FALSE
)

collapse_groups_by_ids(
  data,
  n,
  group_cols,
  id_cols,
  balance_size = FALSE,
  auto_tune = TRUE,
  method = "balance",
  col_name = ".coll_groups",
  parallel = FALSE,
  verbose = FALSE
)
}
\arguments{
\item{data}{\code{data.frame}. Can be \emph{grouped}, in which case
the function is applied group-wise.}

\item{n}{Number of new groups.}

\item{group_cols}{Names of factors in \code{`data`} for identifying the \emph{existing} groups
that should be collapsed.

Multiple names are treated as in \code{\link[dplyr:group_by]{dplyr::group_by()}}
(i.e., a hierarchy of groups), where each leaf group within each parent group is
considered a unique group to be collapsed.
Parent groups are not considered during collapsing, which is why leaf groups from different
parent groups can be collapsed together.

\strong{Note}: Do not confuse these group columns with potential columns that \code{`data`} is grouped by.
\code{`group_cols`} identifies the groups to be collapsed. When \code{`data`} is
grouped with \code{\link[dplyr:group_by]{dplyr::group_by()}}, the function is
applied separately to each of those subsets.}

\item{auto_tune}{Whether to create a larger set of collapsed group columns
from all combinations of the balancing dimensions and select the
overall most balanced group column(s).

This tends to create much more balanced collapsed group columns.

Can be slow, so we recommend enabling parallelization (see \code{`parallel`}).}

\item{method}{\code{"balance"}, \code{"ascending"}, or \code{"descending"}.
\itemize{
\item \code{"balance"} balances the attribute between the groups.
\item \code{"ascending"} orders by the attribute and groups from the lowest to highest value.
\item \code{"descending"} orders by the attribute and groups from the highest to lowest value.
}}

\item{col_name}{Name of the new group column. When creating multiple new group columns
(\code{`num_new_group_cols`>1}, only available in \code{collapse_groups()}), this is the prefix for the names, which will
be suffixed with an underscore and a number (_1, _2, _3, etc.).}

\item{parallel}{Whether to parallelize the group column comparisons
when \code{`auto_tune`} is enabled.

Requires a registered parallel backend,
e.g. via \code{doParallel::registerDoParallel()}.}

\item{verbose}{Whether to print information about the process.
May make the function slightly slower.

N.B. Currently only used during auto-tuning.}

\item{num_cols}{Names of numerical columns to balance between groups.}

\item{balance_size}{Whether to balance the size of the collapsed groups. (logical)}

\item{group_aggregation_fn}{Function for aggregating values in the \code{`num_cols`} columns
for each group in \code{`group_cols`}.

Default is \code{mean()}, where the average value(s) are balanced across the new groups.

When using \code{sum()}, the groups will have similar sums across the new groups.

\strong{N.B.} Only used when \code{`num_cols`} is specified.}

\item{cat_cols}{Names of categorical columns in which the average frequency
of one or more levels should be balanced.}

\item{cat_levels}{Names of the levels in the \code{`cat_cols`} columns to balance the average frequencies
of. When \code{`NULL`} (default), all levels are balanced.
Can be weights indicating the balancing importance of each level (within each column).

The weights are automatically scaled to sum to \code{1}.

Can be \code{".minority"} or \code{".majority"}, in which case the minority/majority level
is found and used.

\subsection{When \code{`cat_cols`} has a single column name:}{

Either a \code{vector} with level names or a named \code{numeric vector} with weights:

E.g. \code{c("dog", "pidgeon", "mouse")} or \code{c("dog" = 5, "pidgeon" = 1, "mouse" = 3)}
}

\subsection{When \code{`cat_cols`} has multiple column names:}{

A named \code{list} with \code{vector}s for each column name in \code{`cat_cols`}.
When not providing a \code{vector} for a \code{`cat_cols`}
column, all levels are balanced in that column.

E.g. \code{list("col1" = c("dog" = 5, "pidgeon" = 1, "mouse" = 3),
 "col2" = c("hydrated", "dehydrated"))}.
}}

\item{id_cols}{Names of factor columns with IDs whose counts should be balanced between groups.

E.g. useful to get a similar number of participants in each group.}
}
\value{
\code{`data`} with a new grouping factor column.
}
\description{
\Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}

Collapses a set of groups into a smaller set of groups.

Balance the new groups by:
\itemize{
\item The \strong{number of rows} with \code{collapse_groups_by_size()}
\item \strong{Numerical columns} with \code{collapse_groups_by_numeric()}
\item One or more levels of \strong{categorical columns} with \code{collapse_groups_by_levels()}
\item Level counts in \strong{ID columns} with \code{collapse_groups_by_ids()}
\item \strong{Any combination} of these with \code{collapse_groups()}
}

These functions wrap \code{\link[groupdata2:collapse_groups]{collapse_groups()}}
to provide a simpler interface. To balance more than one of the attributes at a time
and/or create multiple new unique grouping columns at once, use
\code{\link[groupdata2:collapse_groups]{collapse_groups()}} directly.

While, \emph{on average}, the balancing works better than no balancing, this is
\strong{not guaranteed on every run}. \code{`auto_tune`} (enabled by default) can yield
a much better overall balance than without in most contexts. This generates a larger set
of group columns using all combinations of the balancing columns and selects the
most balanced group column(s). This is slower and can be sped up by enabling
parallelization (see \code{`parallel`}).

\strong{Tip}: When speed is more important than balancing, disable \code{`auto_tune`}.

\strong{Tip}: Check the balances of the new groups with
\code{\link[groupdata2:summarize_balances]{summarize_balances()}} and
\code{\link[groupdata2:ranked_balances]{ranked_balances()}}.

\strong{Note}: The categorical and ID balancing algorithms are different to those
in \code{\link[groupdata2:fold]{fold()}} and
\code{\link[groupdata2:partition]{partition()}}.
}
\details{
See details in \code{\link[groupdata2:collapse_groups]{collapse_groups()}}.
}
\examples{
# Attach packages
library(groupdata2)
library(dplyr)

# Set seed for reproducibility
# Only done when the xpectr package is installed
if (requireNamespace("xpectr", quietly = TRUE)){
  xpectr::set_test_seed(42)
}

# Create data frame
# 20 participants with 3 rows each (60 rows in total)
# `age` is sampled once per participant and repeated for each of its rows
df <- data.frame(
  "participant" = factor(rep(1:20, 3)),
  "age" = rep(sample(c(1:100), 20), 3),
  "answer" = factor(sample(c("a", "b", "c", "d"), 60, replace = TRUE)),
  "score" = sample(c(1:100), 20 * 3)
)
# Order by participant and label the 3 rows of each participant
# as sessions 1 to 3
df <- df \%>\% dplyr::arrange(participant)
df$session <- rep(c("1", "2", "3"), 20)

# Sample rows to get unequal sizes per participant
# Keeps 53 of the 60 rows
df <- dplyr::sample_n(df, size = 53)

# Create the initial groups (to be collapsed)
# The examples below refer to these groups via the `.folds` column
df <- fold(
  data = df,
  k = 8,
  method = "n_dist",
  id_col = "participant"
)

# Ungroup the data frame
# Otherwise `collapse_groups*()` would be
# applied to each fold separately!
df <- dplyr::ungroup(df)

# When `auto_tune` is enabled for larger datasets
# we recommend enabling parallelization
# This can be done with:
# library(doParallel)
# doParallel::registerDoParallel(7) # use 7 cores

\dontrun{

# Collapse to 3 groups with size balancing
# Creates new `.coll_groups` column
df_coll <- collapse_groups_by_size(
  data = df,
  n = 3,
  group_cols = ".folds"
)

# Check balances
# The outer parentheses print the summary while assigning it
(coll_summary <- summarize_balances(
  data = df_coll,
  group_cols = ".coll_groups"
))

# Get ranked balances
# This is most useful when having created multiple
# new group columns with `collapse_groups()`
# The scores are standard deviations across groups
ranked_balances(coll_summary)

# Collapse to 3 groups with *categorical* balancing
# Balances the levels of the `answer` column
df_coll <- collapse_groups_by_levels(
  data = df,
  n = 3,
  group_cols = ".folds",
  cat_cols = "answer"
)

# Check balances
(coll_summary <- summarize_balances(
  data = df_coll,
  group_cols = ".coll_groups",
  cat_cols = 'answer'
))

# Collapse to 3 groups with *numerical* balancing
# Also balance size to get similar sums
# as well as means
df_coll <- collapse_groups_by_numeric(
  data = df,
  n = 3,
  group_cols = ".folds",
  num_cols = "score",
  balance_size = TRUE
)

# Check balances
(coll_summary <- summarize_balances(
  data = df_coll,
  group_cols = ".coll_groups",
  num_cols = 'score'
))

# Collapse to 3 groups with *ID* balancing
# This should give us a similar number of IDs per group
df_coll <- collapse_groups_by_ids(
  data = df,
  n = 3,
  group_cols = ".folds",
  id_cols = "participant"
)

# Check balances
(coll_summary <- summarize_balances(
  data = df_coll,
  group_cols = ".coll_groups",
  id_cols = 'participant'
))

# Collapse to 3 groups with balancing of ALL attributes
# We create 5 new grouping factors and compare them
# The latter is in general a good strategy even if you
# only need a single collapsed grouping factor
# as you can choose your preferred balances
# based on the summary
# NOTE: This is slow (up to a few minutes)
# consider enabling parallelization
df_coll <- collapse_groups(
  data = df,
  n = 3,
  num_new_group_cols = 5,
  group_cols = ".folds",
  cat_cols = "answer",
  num_cols = 'score',
  id_cols = "participant",
  auto_tune = TRUE   # Disabled by default in `collapse_groups()`
  # parallel = TRUE  # Add comma above and uncomment
)

# Check balances
# The 5 new columns are named `.coll_groups_1` to `.coll_groups_5`
(coll_summary <- summarize_balances(
  data = df_coll,
  group_cols = paste0(".coll_groups_", 1:5),
  cat_cols = "answer",
  num_cols = 'score',
  id_cols = 'participant'
))

# Compare the new grouping columns
# The lowest across-group standard deviation
# is the most balanced
ranked_balances(coll_summary)

}

}
\seealso{
Other grouping functions: 
\code{\link{all_groups_identical}()},
\code{\link{collapse_groups}()},
\code{\link{fold}()},
\code{\link{group}()},
\code{\link{group_factor}()},
\code{\link{partition}()},
\code{\link{splt}()}
}
\author{
Ludvig Renbo Olsen, \email{r-pkgs@ludvigolsen.dk}
}
\concept{grouping functions}
