% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/stats.R
\name{chisquare}
\alias{chisquare}
\alias{chisquare,features-method}
\alias{chisquare,context-method}
\alias{chisquare,cooccurrences-method}
\title{Perform chisquare-test.}
\usage{
chisquare(.Object)

\S4method{chisquare}{features}(.Object)

\S4method{chisquare}{context}(.Object)

\S4method{chisquare}{cooccurrences}(.Object)
}
\arguments{
\item{.Object}{A \code{features} object, or an object inheriting from it
(\code{context}, \code{cooccurrences}).}
}
\value{
Same class as input object, with enriched table in the
  \code{stat}-slot.
}
\description{
Perform Chisquare-Test based on a table with counts
}
\details{
The basis for computing the chi-square test is a contingency table of
observations, which is prepared for every single token in the corpus. It
reports counts for a token to inspect and all other tokens in a corpus of
interest (coi) and a reference corpus (ref):
\tabular{rccc}{
\tab coi   \tab ref \tab TOTAL\cr
count token \tab \eqn{o_{11}}{o11}  \tab \eqn{o_{12}}{o12} \tab \eqn{r_{1}}{r1}\cr
other tokens \tab \eqn{o_{21}}{o21}    \tab \eqn{o_{22}}{o22} \tab \eqn{r_{2}}{r2}\cr
TOTAL \tab \eqn{c_{1}}{c1}    \tab \eqn{c_{2}}{c2} \tab N\cr
}
Based on the contingency table, expected values are calculated for each cell,
as the product of the row and column margin sums, divided by the overall
number of tokens (see example). The standard formula for calculating the
chi-square test statistic is as follows. \deqn{X^{2} = \sum{\frac{(O_{ij} -
E_{ij})^2}{E_{ij}}}}{X2 = (o11 - e11)^2/e11 + (o12 - e12)^2/e12 + (o21 -
e21)^2/e21 + (o22 - e22)^2/e22}
Results from the chisquare test are only robust for at least 5 observed
counts in the corpus of interest. Usually, results need to be filtered
accordingly (see examples).
}
\examples{
# Set up the session; presumably use() makes the sample corpus GERMAPARLMINI
# shipped with polmineR available - confirm against the polmineR documentation.
use("polmineR")
library(data.table)
# Corpus of interest (coi): speech by speaker "Merkel", with counts for the
# p-attribute "word".
m <- partition(
  "GERMAPARLMINI", speaker = "Merkel", interjection = "speech",
  regex = TRUE, p_attribute = "word"
)
# Compare the partition against the corpus GERMAPARLMINI as reference.
# NOTE(review): included = TRUE presumably declares that the coi is part of
# the reference corpus - confirm against the features() documentation.
f <- features(m, "GERMAPARLMINI", included = TRUE)
# Keep only tokens with at least 5 observed counts in the corpus of interest,
# the robustness threshold named in the details section.
f_min <- subset(f, count_coi >= 5)
summary(f_min)

\dontrun{

# A sample do-it-yourself calculation of the chisquare statistic for the
# token "Weg", following the contingency table in the details section.

# (a) prepare matrix with observed values
#     row 1: the token "Weg", row 2: all other tokens
#     column 1: corpus of interest, column 2: reference corpus
o <- matrix(data = rep(NA, 4), ncol = 2) 
o[1,1] <- as.data.table(m)[word == "Weg"][["count"]]
# count in the whole corpus minus the count in the coi
o[1,2] <- count("GERMAPARLMINI", query = "Weg")[["count"]] - o[1,1]
# size of the coi / ref as reported by size(f), minus the counts for "Weg"
o[2,1] <- size(f)[["coi"]] - o[1,1]
o[2,2] <- size(f)[["ref"]] - o[1,2]


# (b) prepare matrix with expected values, calculate margin sums first

r <- rowSums(o)
c <- colSums(o)
N <- sum(o)

# expected value of a cell: row sum times column sum, divided by N
e <- matrix(data = rep(NA, 4), ncol = 2) 
e[1,1] <- r[1] * (c[1] / N)
e[1,2] <- r[1] * (c[2] / N)
e[2,1] <- r[2] * (c[1] / N)
e[2,2] <- r[2] * (c[2] / N)


# (c) compute chisquare statistic: sum of (observed - expected)^2 / expected
#     over all four cells

y <- matrix(rep(NA, 4), ncol = 2)
for (i in 1:2) for (j in 1:2) y[i,j] <- (o[i,j] - e[i,j])^2 / e[i,j]
chisquare_value <- sum(y)

# value stored in the features object for "Weg", to compare with the
# hand-computed chisquare_value
as(f, "data.table")[word == "Weg"][["chisquare"]]
}
}
\references{
Manning, Christopher D.; Schuetze, Hinrich (1999):
  \emph{Foundations of Statistical Natural Language Processing}. MIT Press:
  Cambridge, Mass., pp. 169-172.

Kilgarriff, A. and Rose, T. (1998): Measures for corpus
  similarity and homogeneity. \emph{Proc. 3rd Conf. on Empirical Methods in
  Natural Language Processing}. Granada, Spain, pp. 46-52.
}
\seealso{
Other statistical methods: 
\code{\link{ll}()},
\code{\link{pmi}()},
\code{\link{t_test}()}
}
\author{
Andreas Blaette
}
\concept{statistical methods}
\keyword{textstatistics}
