% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/estimate_means.R
\name{estimate_means}
\alias{estimate_means}
\title{Estimate Marginal Means (Model-based average at each factor level)}
\usage{
estimate_means(
  model,
  by = "auto",
  predict = NULL,
  ci = 0.95,
  estimate = "average",
  transform = NULL,
  backend = getOption("modelbased_backend", "marginaleffects"),
  verbose = TRUE,
  ...
)
}
\arguments{
\item{model}{A statistical model.}

\item{by}{The (focal) predictor variable(s) at which to evaluate the desired
effect / mean / contrasts. Other predictors of the model that are not
included here will be collapsed and "averaged" over (the effect will be
estimated across them). The \code{by} argument is used to create a "reference grid"
or "data grid" with representative values for the focal predictors. \code{by}
can be a character (vector) naming the focal predictors (and optionally,
representative values or levels), or a list of named elements. See details
in \code{\link[insight:get_datagrid]{insight::get_datagrid()}} to learn more about how to create data grids
for predictors of interest.}

\item{predict}{Is passed to the \code{type} argument in \code{emmeans::emmeans()} (when
\code{backend = "emmeans"}) or in \code{marginaleffects::avg_predictions()} (when
\code{backend = "marginaleffects"}). For emmeans, see also
\href{https://CRAN.R-project.org/package=emmeans/vignettes/transformations.html}{this vignette}.
Valid options for `predict`` are:
\itemize{
\item \code{backend = "emmeans"}: \code{predict} can be \code{"response"}, \code{"link"}, \code{"mu"},
\code{"unlink"}, or \code{"log"}. If \code{predict = NULL} (default), the most appropriate
transformation is selected (which usually is \code{"response"}).
\item \code{backend = "marginaleffects"}: \code{predict} can be \code{"response"}, \code{"link"} or
any valid \code{type} option supported by model's class \code{predict()} method (e.g.,
for zero-inflation models from package \strong{glmmTMB}, you can choose
\code{predict = "zprob"} or \code{predict = "conditional"} etc., see
\link[glmmTMB:predict.glmmTMB]{glmmTMB::predict.glmmTMB}). By default, when \code{predict = NULL}, the most
appropriate transformation is selected, which usually returns predictions
or contrasts on the response-scale.
}

\code{"link"} will leave the values on scale of the linear predictors.
\code{"response"} (or \code{NULL}) will transform them on scale of the response
variable. Thus for a logistic model, \code{"link"} will give estimations expressed
in log-odds (probabilities on logit scale) and \code{"response"} in terms of
probabilities. To predict distributional parameters (called "dpar" in other
packages), for instance when using complex formulae in \code{brms} models, the
\code{predict} argument can take the value of the parameter you want to estimate,
for instance \code{"sigma"}, \code{"kappa"}, etc.}

\item{ci}{Confidence Interval (CI) level. Default to \code{0.95} (\verb{95\%}).}

\item{estimate}{Character string, indicating the type of target population
predictions refer to. This dictates how the predictions are "averaged" over
the non-focal predictors, i.e. those variables that are not specified in
\code{by} or \code{contrast}.
\itemize{
\item \code{"average"} (default): Takes the mean value for non-focal numeric
predictors and marginalizes over the factor levels of non-focal terms,
which computes a kind of "weighted average" for the values at which these
terms are hold constant. These predictions are a good representation of the
sample, because all possible values and levels of the non-focal predictors
are considered. It answers the question, "What is the predicted value for
an 'average' observation in \emph{my data}?". Cum grano salis, it refers to
randomly picking a subject of your sample and the result you get on
average. This approach is the one taken by default in the \code{emmeans}
package.
\item \code{"population"}: Non-focal predictors are marginalized over the observations
in the sample, where the sample is replicated multiple times to produce
"counterfactuals" and then takes the average of these predicted values
(aggregated/grouped by the focal terms). It can be considered as
extrapolation to a hypothetical target population. Counterfactual
predictions are useful, insofar as the results can also be transferred to
other contexts (Dickerman and Hernan, 2020). It answers the question, "What
is the predicted response value for the 'average' observation in \emph{the
broader target population}?". It does not only refer to the actual data in
your observed sample, but also "what would be if" we had more data, or if
we had data from a different sample.
}

In other words, the distinction between estimate types resides in whether
the prediction are made for:
\itemize{
\item A specific "individual" from the sample (i.e., a specific combination of
predictor values): this is what is obtained when using \code{\link[=estimate_relation]{estimate_relation()}}
and the other prediction functions.
\item An average individual from the sample: obtained with
\code{estimate_means(..., estimate = "average")}
\item The broader, hypothetical target population: obtained with
\code{estimate_means(..., estimate = "population")}
}}

\item{transform}{A function applied to predictions and confidence intervals
to (back-) transform results, which can be useful in case the regression
model has a transformed response variable (e.g., \code{lm(log(y) ~ x)}). For
Bayesian models, this function is applied to individual draws from the
posterior distribution, before computing summaries. Can also be \code{TRUE}, in
which case \code{insight::get_transformation()} is called to determine the
appropriate transformation-function.}

\item{backend}{Whether to use \code{"emmeans"} or \code{"marginaleffects"} as a backend.
Results are usually very similar. The major difference will be found for mixed
models, where \code{backend = "marginaleffects"} will also average across random
effects levels, producing "marginal predictions" (instead of "conditional
predictions", see Heiss 2022).

You can set a default backend via \code{options()}, e.g. use
\code{options(modelbased_backend = "emmeans")} to use the \strong{emmeans} package or
\code{options(modelbased_backend = "marginaleffects")} to set \strong{marginaleffects}
as default backend.}

\item{verbose}{Use \code{FALSE} to silence messages and warnings.}

\item{...}{Other arguments passed, for instance, to \code{\link[insight:get_datagrid]{insight::get_datagrid()}},
to functions from the \strong{emmeans} or \strong{marginaleffects} package, or to process
Bayesian models via \code{\link[bayestestR:describe_posterior]{bayestestR::describe_posterior()}}. Examples:
\itemize{
\item \code{insight::get_datagrid()}: Argument such as \code{length} or \code{range} can be used
to control the (number of) representative values.
\item \strong{marginaleffects}: Internally used functions are \code{avg_predictions()} for
means and contrasts, and \code{avg_slope()} for slopes. Therefore, arguments
for instance like \code{vcov}, \code{transform}, \code{equivalence}, \code{slope} or even
\code{newdata} can be passed to those functions.
\item \strong{emmeans}: Internally used functions are \code{emmeans()} and \code{emtrends()}.
Additional arguments can be passed to these functions.
\item Bayesian models: For Bayesian models, parameters are cleaned using
\code{describe_posterior()}, thus, arguments like, for example, \code{centrality},
\code{rope_range}, or \code{test} are passed to that function.
}}
}
\value{
A data frame of estimated marginal means.
}
\description{
Estimate average value of response variable at each factor level or
representative value, respectively at values defined in a "data grid" or
"reference grid". For plotting, check the examples in
\code{\link[=visualisation_recipe]{visualisation_recipe()}}. See also other related functions such as
\code{\link[=estimate_contrasts]{estimate_contrasts()}} and \code{\link[=estimate_slopes]{estimate_slopes()}}.
}
\details{
The \code{\link[=estimate_slopes]{estimate_slopes()}}, \code{\link[=estimate_means]{estimate_means()}} and \code{\link[=estimate_contrasts]{estimate_contrasts()}}
functions are forming a group, as they are all based on \emph{marginal}
estimations (estimations based on a model). All three are built on the
\strong{emmeans} or \strong{marginaleffects} package (depending on the \code{backend}
argument), so reading its documentation (for instance \code{\link[emmeans:emmeans]{emmeans::emmeans()}},
\code{\link[emmeans:emtrends]{emmeans::emtrends()}} or this \href{https://marginaleffects.com/}{website}) is
recommended to understand the idea behind these types of procedures.
\itemize{
\item Model-based \strong{predictions} is the basis for all that follows. Indeed,
the first thing to understand is how models can be used to make predictions
(see \code{\link[=estimate_link]{estimate_link()}}). This corresponds to the predicted response (or
"outcome variable") given specific predictor values of the predictors (i.e.,
given a specific data configuration). This is why the concept of \code{\link[insight:get_datagrid]{reference grid()}} is so important for direct predictions.
\item \strong{Marginal "means"}, obtained via \code{\link[=estimate_means]{estimate_means()}}, are an extension
of such predictions, allowing to "average" (collapse) some of the predictors,
to obtain the average response value at a specific predictors configuration.
This is typically used when some of the predictors of interest are factors.
Indeed, the parameters of the model will usually give you the intercept value
and then the "effect" of each factor level (how different it is from the
intercept). Marginal means can be used to directly give you the mean value of
the response variable at all the levels of a factor. Moreover, it can also be
used to control, or average over predictors, which is useful in the case of
multiple predictors with or without interactions.
\item \strong{Marginal contrasts}, obtained via \code{\link[=estimate_contrasts]{estimate_contrasts()}}, are
themselves at extension of marginal means, in that they allow to investigate
the difference (i.e., the contrast) between the marginal means. This is,
again, often used to get all pairwise differences between all levels of a
factor. It works also for continuous predictors, for instance one could also
be interested in whether the difference at two extremes of a continuous
predictor is significant.
\item Finally, \strong{marginal effects}, obtained via \code{\link[=estimate_slopes]{estimate_slopes()}}, are
different in that their focus is not values on the response variable, but the
model's parameters. The idea is to assess the effect of a predictor at a
specific configuration of the other predictors. This is relevant in the case
of interactions or non-linear relationships, when the effect of a predictor
variable changes depending on the other predictors. Moreover, these effects
can also be "averaged" over other predictors, to get for instance the
"general trend" of a predictor over different factor levels.
}

\strong{Example:} Let's imagine the following model \code{lm(y ~ condition * x)} where
\code{condition} is a factor with 3 levels A, B and C and \code{x} a continuous
variable (like age for example). One idea is to see how this model performs,
and compare the actual response y to the one predicted by the model (using
\code{\link[=estimate_expectation]{estimate_expectation()}}). Another idea is evaluate the average mean at each of
the condition's levels (using \code{\link[=estimate_means]{estimate_means()}}), which can be useful to
visualize them. Another possibility is to evaluate the difference between
these levels (using \code{\link[=estimate_contrasts]{estimate_contrasts()}}). Finally, one could also estimate
the effect of x averaged over all conditions, or instead within each
condition (\code{using [estimate_slopes]}).
}
\examples{
\dontshow{if (all(insight::check_if_installed(c("marginaleffects", "see", "lme4"), quietly = TRUE))) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
library(modelbased)

# Frequentist models
# -------------------
model <- lm(Petal.Length ~ Sepal.Width * Species, data = iris)

estimate_means(model)

# the `length` argument is passed to `insight::get_datagrid()` and modulates
# the number of representative values to return for numeric predictors
estimate_means(model, by = c("Species", "Sepal.Width"), length = 2)

# an alternative way to setup your data grid is specify the values directly
estimate_means(model, by = c("Species", "Sepal.Width = c(2, 4)"))

# or use one of the many predefined "tokens" that help you creating a useful
# data grid - to learn more about creating data grids, see help in
# `?insight::get_datagrid`.
estimate_means(model, by = c("Species", "Sepal.Width = [fivenum]"))

\dontrun{
# same for factors: filter by specific levels
estimate_means(model, by = "Species=c('versicolor', 'setosa')")
estimate_means(model, by = c("Species", "Sepal.Width=0"))

# estimate marginal average of response at values for numeric predictor
estimate_means(model, by = "Sepal.Width", length = 5)
estimate_means(model, by = "Sepal.Width=c(2, 4)")

# or provide the definition of the data grid as list
estimate_means(
  model,
  by = list(Sepal.Width = c(2, 4), Species = c("versicolor", "setosa"))
)

# Methods that can be applied to it:
means <- estimate_means(model, by = c("Species", "Sepal.Width=0"))

plot(means) # which runs visualisation_recipe()
standardize(means)

data <- iris
data$Petal.Length_factor <- ifelse(data$Petal.Length < 4.2, "A", "B")

model <- lme4::lmer(
  Petal.Length ~ Sepal.Width + Species + (1 | Petal.Length_factor),
  data = data
)
estimate_means(model)
estimate_means(model, by = "Sepal.Width", length = 3)
}
\dontshow{\}) # examplesIf}
}
\references{
Dickerman, Barbra A., and Miguel A. Hernán. 2020. Counterfactual Prediction
Is Not Only for Causal Inference. European Journal of Epidemiology 35 (7):
615–17. \doi{10.1007/s10654-020-00659-8}

Heiss, A. (2022). Marginal and conditional effects for GLMMs with
{marginaleffects}. Andrew Heiss. \doi{10.59350/xwnfm-x1827}
}
