\name{init_substr_info}
\alias{init_substr_info}
\title{Initialize a Data Frame of All Substrings}
\description{
Creates a data frame containing all contiguous substrings of a token vector, including the full token sequence itself. Each row represents one substring, with its starting position, length in tokens, the concatenated expression, and empty columns for type and translation.

The rows are ordered by \code{n_tokens} descending and \code{start} ascending, so that the row number can be computed from \code{start} and \code{n_tokens} using \code{\link{substr_position}}.

This is an internal helper function.
}
\usage{
init_substr_info(token)
}
\arguments{
  \item{token}{A character vector of Sumerian tokens (e.g. cuneiform signs).}
}
\details{
For a token vector of length \eqn{N}, the function generates all \eqn{N(N+1)/2} contiguous substrings. The substrings are ordered by \code{n_tokens} descending (longest first) and within each group by \code{start} ascending. This ordering ensures that the row index of any substring can be computed with the formula

\deqn{\mathrm{row} = \frac{(N - k)(N - k + 1)}{2} + s}{row = (N - k)(N - k + 1) / 2 + s}

where \eqn{k} is the number of tokens (\code{n_tokens}) and \eqn{s} is the starting position (\code{start}).

The \code{expr} column contains the tokens concatenated without separators. The \code{type} and \code{translation} columns are initialized as empty strings and are intended to be filled in later.
}
\value{
A data frame with \eqn{N(N+1)/2} rows, where \eqn{N} is the length of \code{token}, and the following columns:
  \item{start}{Integer. The position of the first token in the substring (1-based).}
  \item{n_tokens}{Integer. The number of tokens in the substring.}
  \item{expr}{Character. The concatenated token sequence (without separators).}
  \item{type}{Character. Initialized as empty string \code{""}.}
  \item{translation}{Character. Initialized as empty string \code{""}.}
}
\seealso{
\code{\link{substr_position}} for computing the row index from \code{start} and \code{n_tokens},
\code{\link{skeleton}} for creating translation templates, and
\code{\link{make_dictionary}} for creating dictionaries from filled-in templates.
}
\examples{
# Sample text; convert it and split it into sign tokens
sample_text <- "<d-nu-dim2-mud> ki a. jal2 (e2{kur}) ra. gaba jal2. an ki a"
cuneiform_text <- as.cuneiform(sample_text)
token <- split_sumerian(cuneiform_text)$signs

# One row per contiguous substring of the token vector
info <- sumer:::init_substr_info(token)
info

# Verify that substr_position recovers the row indices
N <- length(token)
row_from_formula <- sumer:::substr_position(info$start, info$n_tokens, N)
all(seq_len(nrow(info)) == row_from_formula)
}
\keyword{internal}
