\name{mark_ngrams}
\alias{mark_ngrams}
\title{Mark N-gram Combinations in Cuneiform Text}
\description{
Takes a character vector of Sumerian text and marks all n-gram
combinations (from \code{\link{ngram_frequencies}}) with curly braces.
Longer combinations are marked first, shorter ones afterwards
(including inside already-marked regions).
}
\usage{
mark_ngrams(x, ngram, mapping = NULL)
}
\arguments{
\item{x}{A character vector of Sumerian text (transliteration,
sign names, or cuneiform). The input is converted to cuneiform
internally.}

\item{ngram}{A data frame as returned by \code{\link{ngram_frequencies}},
with at least columns \code{combination} and \code{length}.}

\item{mapping}{A data frame containing the sign mapping table with columns \code{syllables}, \code{name}, and \code{cuneiform}. If \code{NULL} (the default), the package's internal mapping file \file{etcsl_mapping.txt} is loaded.}
}
\value{
A character vector of cuneiform text with n-gram combinations
enclosed in curly braces and surrounded by spaces.
}
\details{
The function first converts \code{x} to cuneiform (if it is not
already cuneiform) and removes spaces and brackets \code{()[]{}}.

It then sorts \code{ngram} in descending order of \code{length} and
replaces each occurrence of a combination with \code{ \{combination\} }
(space, open brace, combination, close brace, space).

Shorter n-grams may be marked inside already-marked longer n-grams
(nesting is allowed).
}
\examples{

# Load the example text of "Enki and the World Order"
path  <- system.file("extdata", "enki_and_the_world_order.txt", package = "sumer")
text <- readLines(path, encoding="UTF-8")
cat(text[1:10],sep="\n")

# Find combinations that appear at least 6 times in the text
freq <- ngram_frequencies(text, min_freq = 6)
freq[1:10,]

# Mark these combinations in the text
text_marked <- mark_ngrams(text, freq)
cat(text_marked[1:10], sep="\n")

# You can enter transliterated text
x <- "kij2-sig unu2 gal d-re-e-ne-ka me-te-ac im-mi-ib-jal2"
mark_ngrams(x, freq)

# Find all occurrences of a pattern in the annotated text
term     <- "IGI.DIB.TU"
(pattern <- mark_ngrams(term, freq))
result   <- text_marked[grepl(pattern, text_marked, fixed=TRUE)]
cat(result, sep="\n")


}
\seealso{
\code{\link{ngram_frequencies}}
}
