% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lsh_compare.R
\name{lsh_compare}
\alias{lsh_compare}
\title{Compare candidates identified by LSH}
\usage{
lsh_compare(candidates, corpus, f, progress = interactive())
}
\arguments{
\item{candidates}{A data frame returned by \code{\link{lsh_candidates}}.}

\item{corpus}{The same \code{\link{TextReuseCorpus}} that was used to generate the candidates.}

\item{f}{A comparison function such as \code{\link{jaccard_similarity}}.}

\item{progress}{Display a progress bar while comparing documents.}
}
\value{
A data frame with values calculated for \code{score}.
}
\description{
The \code{\link{lsh_candidates}} function only identifies potential matches;
it cannot estimate the actual similarity of the documents. This function takes
a data frame returned by \code{\link{lsh_candidates}} and applies a comparison
function to each pair of candidate documents in the corpus, thereby calculating
the document similarity score. Note that since your corpus will have minhash
signatures rather than hashes for the tokens themselves, you will probably wish
to use \code{\link{tokenize}} to calculate new hashes. This can be done for
just the potentially similar documents. See the package vignettes for
details.
}
\examples{
# Path to the extdata/legal directory installed with the textreuse package.
legal_dir <- system.file("extdata/legal", package = "textreuse")

# Minhash function created with 200 as its first argument and a fixed seed.
minhash_fn <- minhash_generator(200, seed = 234)

# Build the corpus from that directory, tokenizing with tokenize_ngrams
# (n = 5) and supplying the minhash function created above.
legal_corpus <- TextReuseCorpus(
  dir = legal_dir,
  tokenizer = tokenize_ngrams,
  n = 5,
  minhash_func = minhash_fn
)

# Run LSH over the corpus with 50 bands, then pull out the candidates.
lsh_buckets <- lsh(legal_corpus, bands = 50)
candidate_df <- lsh_candidates(lsh_buckets)

# Apply jaccard_similarity to the candidates to fill in their scores.
lsh_compare(candidate_df, legal_corpus, jaccard_similarity)
}

