% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/vtree.R
\name{vtree}
\alias{vtree}
\title{Draw a variable tree}
\usage{
vtree(data = NULL, vars, showuniform = TRUE, hideconstant = NULL,
  words = NULL, horiz = TRUE, title = "", sameline = FALSE,
  vp = TRUE, prune = list(), tprune = list(), keep = list(),
  tkeep = list(), prunebelow = list(), tprunebelow = list(),
  follow = list(), tfollow = list(), prunesmaller = NULL,
  prunebigger = NULL, summary = NULL, tsummary = NULL,
  shownodelabels = TRUE, showvarnames = TRUE, showpct = TRUE,
  showlpct = TRUE, showcount = TRUE, prefixcount = "",
  showrootcount = TRUE, showlegend = FALSE, showroot = TRUE,
  showvarinnode = FALSE, showlegendsum = FALSE, labelvar = NULL,
  labelnode = list(), tlabelnode = NULL, digits = 0, cdigits = 1,
  fillcolor = NULL, specfill = NULL, fillnodes = TRUE,
  NAfillcolor = "white", rootfillcolor = "#EFF3FF", palette = NULL,
  gradient = TRUE, revgradient = FALSE, sortfill = FALSE,
  singlecolor = 2, colorvarlabels = TRUE, color = c("blue",
  "forestgreen", "red", "orange", "pink"), colornodes = FALSE,
  plain = FALSE, Venn = FALSE, check.is.na = FALSE, seq = FALSE,
  pattern = FALSE, ptable = FALSE, text = list(), ttext = list(),
  varlabelloc = NULL, font = "Arial", varnamepointsize = 24,
  varnamebold = FALSE, legendpointsize = 14, HTMLtext = FALSE,
  splitwidth = 20, vsplitwidth = 8, splitspaces = TRUE,
  getscript = FALSE, mincount = 1, maxcount, showempty = FALSE,
  choicechecklist = TRUE, just = "c", justtext = NULL, thousands = "",
  folder = NULL, format = "", imageFileOnly = FALSE, pngknit = TRUE,
  pxwidth = NULL, pxheight = NULL, imagewidth = "", imageheight = "",
  width = NULL, height = NULL, maxNodes = 1000, unchecked = c("0",
  "FALSE", "No", "no"), checked = c("1", "TRUE", "Yes", "yes"),
  trim = NULL, rounded = TRUE, varminwidth = NULL, varminheight = NULL,
  squeeze = 1, arrowhead = "normal", nodesep = 0.5, ranksep = 0.5,
  margin = 0.2, graphattr = "", nodeattr = "", edgeattr = "",
  nodefunc = NULL, nodeargs = NULL, verbose = FALSE, runsummary = NULL,
  retain = NULL, auto = FALSE, parent = 1, last = 1, root = TRUE,
  subset = 1:nrow(z), numsmallernodes = 0, sumsmallernodes = 0,
  numbiggernodes = 0, sumbiggernodes = 0, as.if.knit = FALSE,
  prunelone = NULL, pruneNA = FALSE, lsplitwidth = 15,
  showlevels = TRUE, z = NULL)
}
\arguments{
\item{data}{Required: Data frame, or a single vector.}

\item{vars}{Required (unless \code{data} is a single vector):
Variables to use for the tree. Can be 
(1) a character string of whitespace-separated variable names,
(2) a vector of variable names, or
(3) a formula without a left-hand side,
e.g. \code{~ Age + Sex},
but note that extended variable specifications cannot be used in this case.}

\item{showuniform}{Show a variable even when it only has one value?}

\item{hideconstant}{Hide a variable if its only value is one of the specified strings.}

\item{words}{A list of named vectors of values.
Used to build a variable tree 
representing all permutations of these values.
No counts will be shown.}

\item{horiz}{Should the tree be drawn horizontally?
(i.e. root node on the left, with the tree growing to the right)}

\item{title}{Label for the root node of the tree.}

\item{sameline}{Display node label on the same line as the count and percentage?
A single value (with no names) specifies the setting for all variables.
A logical vector of \code{TRUE} for named variables is interpreted as
\code{TRUE} for those variables and \code{FALSE} for all others.
A logical vector of \code{FALSE} for named variables is interpreted as
\code{FALSE} for those variables and \code{TRUE} for all others.}

\item{vp}{Use \emph{valid percentages}?
Valid percentages are computed by first excluding any missing values,
i.e. restricting attention to the set of "valid" observations.
The denominator is thus the number of non-missing observations.
When \code{vp=TRUE}, nodes for missing values show the number of missing values
but do not show a percentage;
all the other nodes show valid percentages.
When \code{vp=FALSE}, all nodes (including nodes for missing values)
show percentages of the total number of observations.}

\item{prune, keep, prunebelow, follow}{List of named vectors that specify pruning.
(see \strong{Pruning} below)}

\item{tprune, tkeep, tprunebelow, tfollow}{List of lists of named vectors that specify "targeted" pruning.
(see \strong{Pruning} below)}

\item{prunesmaller}{Prune any nodes with count less than specified number.}

\item{prunebigger}{Prune any nodes with count greater than specified number.}

\item{summary}{A character string used to specify summary statistics to display in the nodes.
See \strong{Displaying summary information} below for details.}

\item{tsummary}{A list of character-string vectors.
The initial elements of each character string vector point to a specific node.
The final element of each character string vector is a summary string,
with the same structure as \code{summary}.}

\item{shownodelabels}{Show node labels?
A single value (with no names) specifies the setting for all variables.
Otherwise, a named logical vector indicates which variables should have their
node labels shown.
If the vector consists of only \code{TRUE} values,
it is interpreted as \code{TRUE} for those variables and \code{FALSE} for all others.
Similarly, if the vector consists of only \code{FALSE} values, 
it is interpreted as \code{FALSE} for those variables and \code{TRUE} for all others.}

\item{showvarnames}{Show the name of the variable next to each layer of the tree?}

\item{showpct, showlpct}{Show percentage? \code{showpct} is for nodes, \code{showlpct} is for legends.
A single value (with no names) specifies the setting for all variables.
A logical vector of \code{TRUE} for named variables is interpreted as
\code{TRUE} for those variables and \code{FALSE} for all others.
A logical vector of \code{FALSE} for named variables is interpreted as
\code{FALSE} for those variables and \code{TRUE} for all others.}

\item{showcount}{Show count in each node?
A single value (with no names) specifies the setting for all variables.
A logical vector of \code{TRUE} for named variables is interpreted as
\code{TRUE} for those variables and \code{FALSE} for all others.
A logical vector of \code{FALSE} for named variables is interpreted as
\code{FALSE} for those variables and \code{TRUE} for all others.}

\item{prefixcount}{Text that will precede each count.}

\item{showrootcount}{Show count in root node?}

\item{showlegend}{Show legend (including marginal frequencies) for each variable?}

\item{showroot}{Show the root node?
When \code{seq=TRUE}, it may be useful to set \code{showroot=FALSE}.}

\item{showvarinnode}{Show the variable name in each node?}

\item{showlegendsum}{Show summary information in the legend?
(Provided \code{summary} has been specified).}

\item{labelvar}{A named vector of labels for variables.}

\item{labelnode}{List of vectors used to change how values of variables are displayed.
The name of each element of the
list is one of the variable names in \code{vars}.
Each element of the list is a vector of character strings,
representing the values of the variable.
The names of the vector represent the labels to be used in place of the values.}

\item{tlabelnode}{A list of vectors, each of which specifies a particular node,
as well as a label for that node (a "targeted" label).
The names of each vector specify variable names,
except for an element named \code{label}, which specifies the label to use.}

\item{digits, cdigits}{Number of decimal digits to show in percentages (\code{digits})
and in continuous values displayed via the summary parameter (\code{cdigits}).}

\item{fillcolor}{[Color] A named vector of colors for filling the nodes of each variable.
If an unnamed, scalar color is specified,
all nodes will have this color.}

\item{specfill}{[Color] A list with specified color values for specified variables.}

\item{fillnodes}{[Color] Fill the nodes with color?}

\item{NAfillcolor}{[Color] Fill-color for missing-value nodes.
If \code{NULL}, fill colors of missing value nodes will be consistent
with the fill colors in the rest of the tree.}

\item{rootfillcolor}{[Color] Fill-color for the root node.}

\item{palette}{[Color] A vector of palette numbers (which can range between 1 and 14).
The names of the vector indicate the corresponding variable.
See \strong{Palettes} below for more information.}

\item{gradient}{[Color] Use gradients of fill color across the values of each variable?
A single value (with no names) specifies the setting for all variables.
A logical vector of \code{TRUE} values for named variables is interpreted as
\code{TRUE} for those variables and \code{FALSE} for all others.
A logical vector of \code{FALSE} values for named variables is interpreted as
\code{FALSE} for those variables and \code{TRUE} for all others.}

\item{revgradient}{[Color] Should the gradient be reversed (i.e. dark to light instead of light to dark)?
A single value (with no names) specifies the setting for all variables.
A logical vector of \code{TRUE} values for named variables is interpreted as
\code{TRUE} for those variables and \code{FALSE} for all others.
A logical vector of \code{FALSE} values for named variables is interpreted as
\code{FALSE} for those variables and \code{TRUE} for all others.}

\item{sortfill}{[Color] Sort colors in order of node count?
When a \code{gradient} fill is used, this results in
the nodes with the smallest counts having the lightest shades
and the nodes with the largest counts having the darkest shades.}

\item{singlecolor}{[Color] When a variable has a single value,
this parameter is used to specify whether nodes should have
(1) a light shade, (2) a medium shade, or (3) a dark shade.
For example, specify \code{singlecolor=1} to assign a light shade.}

\item{colorvarlabels}{[Color] Color the variable labels?}

\item{color}{[Color] A vector of color names for the \emph{outline} of the nodes in each layer.}

\item{colornodes}{[Color] Color the node outlines?}

\item{plain}{[Color] Use "plain" settings?
These settings are as follows: for each variable all nodes are the same color,
namely a shade of blue (with each successive variable using a darker shade);
all variable labels are black; and the \code{squeeze} parameter is set to 0.6.}

\item{Venn}{Display multi-way set membership information?
This provides an alternative to a Venn diagram.
This sets \code{showpct=FALSE} and \code{shownodelabels=FALSE}.
Assumption: all of the specified variables are logicals or 0/1 numeric variables.}

\item{check.is.na}{Replace each variable named in \code{vars} with a logical vector indicating
whether or not each of its values is missing?}

\item{seq}{Display the variable tree using \emph{sequences}?
Each unique sequence (i.e. pattern) of values will be shown separately.
The sequences are sorted from least frequent to most frequent.}

\item{pattern}{Display the variable tree using \emph{patterns}?
These are the same as \code{seq}, but lines without arrows are drawn,
and instead of a sequence variable, a pattern variable is shown.}

\item{ptable}{Generate a pattern table instead of a variable tree? 
Only applies when \code{pattern=TRUE}.}

\item{text}{A list of vectors containing extra text to add to
nodes corresponding to specified values of a specified variable.
The name of each element of the list
must be one of the variable names in \code{vars}.
Each element is a vector of character strings.
The names of the vector identify the nodes to which the text should be added.}

\item{ttext}{A list of vectors, each of which specifies a particular node,
as well as text to add to that node ("targeted" text).
The names of each vector specify variable names,
except for an element named \code{text}, which specifies the text to add.}

\item{varlabelloc}{A named vector of vertical label locations
("t", "c", or "b" for top, center, or bottom, respectively)
for nodes of each variable.
(Sets the Graphviz \code{labelloc} attribute.)}

\item{font}{Font.}

\item{varnamepointsize}{Font size (in points) to use when displaying variable names.}

\item{varnamebold}{Show the variable name in bold?}

\item{legendpointsize}{Font size (in points) to use when displaying legend.}

\item{HTMLtext}{Is the text formatted in HTML?}

\item{splitwidth, vsplitwidth}{The minimum number of characters before an automatic
linebreak is inserted.
\code{splitwidth} is for node labels, \code{vsplitwidth} is for variable names.}

\item{splitspaces}{When \code{vars} is a character string,
split it by spaces to get variable names?
It is only rarely necessary to use this parameter.
This should only be \code{FALSE} when a single variable name
that contains spaces is specified.}

\item{getscript}{Instead of displaying the variable tree,
return the DOT script as a character string?}

\item{mincount, maxcount}{Minimum or maximum count to include in a pattern tree or pattern table.
(\code{maxcount} overrides \code{mincount}.)}

\item{showempty}{Show nodes that do not contain any observations?}

\item{choicechecklist}{When REDCap checklists are specified using the \code{stem:} syntax,
automatically extract the names of choices and use them as variable names?}

\item{just}{Text justification ("l"=left, "c"=center, "r"=right).}

\item{justtext}{Like \code{just}, but only for extra text, like summaries.}

\item{thousands}{Thousands separator for big numbers.}

\item{folder, format, imageFileOnly, pngknit}{Control image file generation.
\code{folder}: a path to a folder where image file will be stored.
\code{format}: "png" or "pdf" format.
\code{imageFileOnly}: should an image file be produced but not displayed?
\code{pngknit}: generate a PNG file when called during knit?
(See \strong{Knitr, R Markdown, Sweave} below for more information.)}

\item{pxwidth, pxheight}{Width and height of the PNG bitmap to be rendered
when \code{vtree} is called from R Markdown.
If neither \code{pxwidth} nor \code{pxheight} is specified,
\code{pxwidth} is automatically set to 2000 pixels.}

\item{imagewidth, imageheight}{Character strings representing width and height of the PNG image
to be rendered when \code{vtree} is called from R Markdown,
e.g. \code{"4in"}.
If neither \code{imageheight} nor \code{imagewidth} is specified,
\code{imageheight} is set to 3 inches.}

\item{width, height}{Width and height (in pixels) to be passed to \code{DiagrammeR::grViz}.}

\item{maxNodes}{An error occurs if the number of nodes exceeds \code{maxNodes}.}

\item{unchecked, checked}{Vector of character strings interpreted as "unchecked" and "checked" respectively.}

\item{trim}{(LaTeX Sweave only.) Crop the image using a feature
of \code{\\includegraphics}.
Vector of bp (big points) to trim in the order
left, lower, right, upper.}

\item{rounded}{[Graphviz] Use rounded boxes for nodes?}

\item{varminwidth, varminheight}{[Graphviz] Named vector of minimum initial widths or heights for nodes of each variable.

\code{varminwidth} sets the Graphviz \code{width} attribute.
\code{varminheight} sets the Graphviz \code{height} attribute.}

\item{squeeze}{[Graphviz] The degree (between 0 and 1) to which the tree will be "squeezed".
This controls two Graphviz parameters: \code{margin} and \code{nodesep}.}

\item{arrowhead}{[Graphviz] arrowhead style. Defaults to \code{"normal"}.
Other choices include \code{"none"}, \code{"vee"}.}

\item{nodesep, ranksep, margin}{[Graphviz] attributes for node separation amount,
rank separation amount, and node margin.}

\item{graphattr, nodeattr, edgeattr}{[Graphviz] Character string: Graphviz attributes for the graph, node, and edge respectively.}

\item{nodefunc, nodeargs}{Node function and node arguments (see \strong{Node functions} below).}

\item{verbose}{Report additional details?}

\item{runsummary}{A list of functions, with the same length as \code{summary}.
Each function must take a data frame as its sole argument,
and return a logical value.
Each string in \code{summary} will only be evaluated if
the corresponding function returns \code{TRUE}.}

\item{retain}{Vector of names of additional variables in the data frame that need to be
available to execute the functions in \code{runsummary}.}

\item{auto}{Automatically choose variables? (\code{vars} should not be specified)}

\item{parent, last}{[Internal use only.] Node number of parent and last node.}

\item{root}{[Internal use only.] Is this the root node of the tree?}

\item{subset}{[Internal use only.] A vector representing the subset of observations.}

\item{numsmallernodes}{[Internal use only.] Counting nodes that were suppressed by prunesmaller.}

\item{sumsmallernodes}{[Internal use only.] Summing nodes that were suppressed by prunesmaller.}

\item{numbiggernodes}{[Internal use only.] Counting nodes that were suppressed by prunebigger.}

\item{sumbiggernodes}{[Internal use only.] Summing nodes that were suppressed by prunebigger.}

\item{as.if.knit}{(Deprecated) Behave as if called while knitting?}

\item{prunelone}{(Deprecated) A vector of values specifying "lone nodes" (of \emph{any} variable) to prune.
A lone node is a node that has no siblings (an "only child").}

\item{pruneNA}{(Deprecated) Prune all missing values?
This is problematic because "valid" percentages
are hard to interpret when NAs are pruned.}

\item{lsplitwidth}{(Deprecated) In legends, the minimum number of characters before an automatic
linebreak is inserted.}

\item{showlevels}{(Deprecated) Same as showvarnames.}

\item{z}{(Deprecated) This was replaced by the \code{data} parameter}
}
\value{
The value returned by \code{vtree} varies
depending on both the parameter values specified
and the context in which \code{vtree} is called.

First, there are two special cases where \code{vtree} does not show a variable tree:
 
\itemize{
  \item If \code{ptable=TRUE}, the return value is a data frame representing a pattern table.
  \item Otherwise, if \code{getscript=TRUE}, the return value is a character string,
        consisting of a DOT script that describes the variable tree.
}

If neither of the above cases applies, the return value is as follows.
If knitting is \emph{not} taking place
(such as when \code{vtree} is used \strong{interactively}):
\itemize{
  \item the return value is an object of class \code{htmlwidget} (see \link[DiagrammeR]{DiagrammeR}).
        It will intelligently print itself into HTML in a variety of contexts
        including the R console, within R Markdown documents,
        and within Shiny output bindings.
        
        The \code{info} attribute of the return object is a list whose top
        level represents the root node of the tree.
        Within this list is a list named after the first variable in the tree.
        In turn, within this list are lists named after the observed
        values of that variable.
        In turn, within each of these lists is an element named after
        the next variable in the tree.
        And so on.
        The root element as well as each list element named after a value of a variable also 
        contains elements \code{.n} (representing the number of observations),
        \code{.pct} (representing the percentage), and
        \code{.txt} (representing additional text such as summaries).
        
}

If knitting \emph{is} taking place:
\itemize{
  \item If \code{pngknit=TRUE} (the default),
        the return value is a character string of
        pandoc markdown code to embed a PNG file with fully-specified path.
        The character string will have class \code{knit_asis} so that
        knitr will treat it as is
        (the effect is the same as the chunk option results = 'asis')
        when it is written to the output. (See \code{?knitr::asis_output})
  \item If \code{pngknit=FALSE}, the return value is the same as when knitting is not
        taking place, i.e. an object of class \code{htmlwidget}.
}
}
\description{
Variable trees display information about nested subsets of a data frame,
in which the subsetting is defined by the values of categorical variables.
}
\section{Knitr, R Markdown, Sweave}{

If \code{folder} is not specified and knitting to LaTeX,
the folder will be set to the value of \code{knitr::opts_chunk$get("fig.path")}.
(If this folder does not exist, it will be created.)
If \code{folder} is not specified and knitting to markdown,
a temporary folder will be used.

If \code{format} is not specified and knitting is taking place,
then a PNG file is generated, unless a LaTeX document is 
being generated (e.g. via Sweave), in which case a PDF file is generated.   
PNG image files will end in \code{.png}.
PDF image files will end in \code{.pdf}.
                                                
As noted in the \strong{Value} section above,
\code{vtree} has special support for R Markdown.

By default, when knitting an R Markdown file,
\code{vtree} generates PNG files and embeds them automatically in the output document.
This feature is needed when knitting to a \code{.docx} file.
When knitting to HTML, it is not necessary to generate PNG files
because HTML browsers can directly display htmlwidgets.

To generate htmlwidgets instead of PNG files, specify \code{pngknit=FALSE}.
(Note, however, that there are some advantages to embedding PNG files in an HTML file.
For example,
some browsers perform poorly when numerous htmlwidgets are included in an HTML file.)

When PNG files are generated, they are stored by default in a temporary folder.
The folder can also be specified using the \code{folder} parameter.
(Using the base R function \code{options}, 
a custom option \code{vtree_folder} is used to automatically keep track of this.)
Successive PNG files generated by an R Markdown file
are named \code{vtree001.png}, \code{vtree002.png}, etc.
(A custom option \code{vtree_count} is used to automatically keep track of the number of PNG files.)
}

\section{Pruning}{

Each of the parameters \code{prune}, \code{keep}, \code{prunebelow}, \code{follow}
takes a named list of vectors as its argument.
Each vector specifies nodes of a variable.
\itemize{
  \item \code{prune}: which nodes should be pruned.
  \item \code{keep}: which nodes should \emph{not} be pruned.
  \item \code{prunebelow}: which nodes should have their descendants pruned.
  \item \code{follow}: which nodes should \emph{not} have their descendants pruned.
}
The \code{tprune} parameter specifies "targeted" pruning.
Standard pruning removes all nodes with the specified value of the specified variable.
The \code{tprune} parameter specifies one or more particular paths from the root of the tree
down to a node to be pruned.
}

\section{Displaying summary information}{

The \code{summary} parameter allows you to specify information to display
in each node. The parameter can be specified as a vector of character strings,
where each element represents a different variable to summarize.
When an element of \code{summary} is specified as a single variable name, 
the following default set of summary statistics is shown:
the variable name, number of missing values, mean and standard deviation,
median and interquartile range, and range.
A customized summary is shown when an element of \code{summary}
is specified as a character string with the following structure:
\itemize{
  \item{First, the name of the variable for which a summary is desired.}
  \item{Next a space.}
  \item{The remainder of the string specifies what to display, with text as well as special codes (such as \code{\%mean\%}) to indicate the type of summary desired and to control which nodes display the summary, etc. See the vignette for more details.}
}
}

\section{Palettes}{

The following palettes
(obtained from \code{RColorBrewer}) are used in the order indicated:

\tabular{rlcrlcrlcrlcrl}{
 1 \tab Reds     \tab \tab 4 \tab Oranges  \tab \tab 7  \tab PuBu   \tab \tab 10 \tab PuBuGn \tab \tab 13 \tab RdYlGn \cr
 2 \tab Blues    \tab \tab 5 \tab Purples  \tab \tab 8  \tab PuRd   \tab \tab 11 \tab BuPu   \tab \tab 14 \tab Set1   \cr 
 3 \tab Greens   \tab \tab 6 \tab YlGn     \tab \tab 9  \tab YlOrBr \tab \tab 12 \tab YlOrRd \tab \tab    \tab        \cr
}
}

\examples{

# Call vtree and give the root node a title
vtree(FakeData,"Sex Severity",title="People")

# R Markdown inline call to vtree
# `r vtree(FakeData,"Sex Severity")`

# Rename some nodes
vtree(FakeData,"Severity Sex",labelnode=list(Sex=(c("Male"="M","Female"="F"))))

# Rename a variable
vtree(FakeData,"Severity Sex",labelvar=c(Severity="How bad?"))

# Show legend. Put labels on the same line as counts and percentages
vtree(FakeData,"Severity Sex Viral",sameline=TRUE,showlegend=TRUE)

# Use the summary parameter to list ID numbers (truncated to 40 characters) in specified nodes
vtree(FakeData,"Severity Sex",summary="id \nid = \%list\% \%var=Severity\% \%trunc=40\%")

# Add text to specified nodes of a tree ("targeted text")
vtree(FakeData,"Severity Sex",ttext=list(
  c(Severity="Severe",Sex="M",text="\nMales with Severe disease"),
  c(Severity="NA",text="\nUnknown severity")))

}
\references{
Barrowman N, Webster RJ (2025). “Exploring Data Subsets with vtree.” \emph{Journal of Statistical Software}, \strong{114}(4), 1-28. <doi:10.18637/jss.v114.i04>.
}
\seealso{
\href{../doc/vtree.html}{\code{vignette("vtree")}}
}
\author{
Nick Barrowman <nbarrowman@cheo.on.ca>
}
