\name{make_dictionary}
\alias{make_dictionary}
\title{Create a Sumerian Dictionary from Annotated Text Files}
\description{
Parses Word documents (.docx) or plain text files containing annotated
Sumerian translations and creates a structured dictionary data frame.
The function extracts sign names, their cuneiform representations,
possible readings, and translations with grammatical types.
}
\usage{
make_dictionary(file, mapping = NULL)
}
\arguments{
\item{file}{A character vector of file paths to .docx or text files.
Files must contain translation lines that are formatted as described below.}
\item{mapping}{A data frame containing sign-to-reading mappings with columns
\code{name}, \code{cuneiform} and \code{syllables}. If \code{NULL} (default), the package's built-in
mapping file \code{etcsl_mapping.txt} is used.}
}
\details{
\subsection{Input Format}{
The input files must contain lines starting with \code{|} in the following format:

\code{|sign_name: TYPE: meaning}

or

\code{|equation for sign_name: TYPE: meaning}

For example:
\preformatted{
|a2-tab: S: the double amount of work performance
|me=ME: S: divine force
|AN: S: god of heaven
|na=NA: Sx->A: whose existence is bound to S
}

Lines not starting with \code{|} are ignored. If a line contains an equation of sign names (e.g., \code{me=ME}), only the first entry of the equation (here \code{me}) is used for the dictionary.

The following notation is suggested for grammatical types:

\itemize{
\item \code{S} for substantives and noun phrases (e.g., "the old man in the temple").
\item \code{V} for verbs and decorated verbs (e.g., "to go", "to bring the delivery into the temple").
\item \code{A} for adjectives, attributes and subordinate clauses that further define the subject (e.g., "who/which is weak", "whose resource for sustaining life is grain").
\item \code{Sx->A} for a symbol that transforms the preceding noun phrase into an attribute (e.g., "whose resource for sustaining life is \code{S}"). Other transformations are denoted accordingly.
\item \code{N} for numbers.
\item \code{D} for everything else.
}
}

\subsection{Processing Steps}{
\enumerate{
\item Extracts text from .docx files or reads plain text
\item Filters lines starting with \code{|}
\item Normalizes sign names and looks up possible readings from the mapping table
\item Aggregates translations and counts occurrences
}
}

\subsection{Output Structure}{
For each unique sign, the output contains:
\itemize{
\item One \code{cunei.} row with the cuneiform character(s)
\item One \code{reading} row with possible phonetic readings
\item One or more \code{trans.} rows with translations, sorted by frequency
}
}
}
\value{
A data frame with the following columns:
\describe{
\item{sign_name}{The normalized Sumerian sign name (e.g., "A", "AN", "ME")}
\item{row_type}{Type of entry: \code{"cunei."} (cuneiform), \code{"reading"} (phonetic readings), or \code{"trans."} (translation)}
\item{count}{Number of occurrences for translations; \code{NA} for cuneiform and reading entries}
\item{type}{Grammatical type (e.g., "S", "V", "Sx->A") for translations; empty for other row types}
\item{meaning}{The cuneiform character(s), reading(s), or translated meaning, depending on \code{row_type}}
}
}
\seealso{
\code{\link{as.cuneiform}}, \code{\link{split_sumerian}}
}
\examples{

# Create a dictionary from a single text document
filename  <- system.file("extdata", "text_with_translations.txt", package = "sumer")
dict <- make_dictionary(filename)

# Use the dictionary
look_up("an", dict)
}
