% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/SeqId.R, R/matchSeqIds.R
\name{SeqId}
\alias{SeqId}
\alias{getSeqId}
\alias{regexSeqId}
\alias{locateSeqId}
\alias{seqid2apt}
\alias{apt2seqid}
\alias{is.apt}
\alias{is.SeqId}
\alias{is.AptName}
\alias{matchSeqIds}
\alias{getSeqIdMatches}
\title{Working with SomaLogic SeqIds}
\usage{
getSeqId(x, trim.version = FALSE)

regexSeqId()

locateSeqId(x, trailing = TRUE)

seqid2apt(x)

apt2seqid(x)

is.apt(x)

is.SeqId(x)

is.AptName(x)

matchSeqIds(x, y, order.by.x = TRUE)

getSeqIdMatches(x, y, show = FALSE)
}
\arguments{
\item{x}{Character. A vector of strings, usually analyte/feature column
names, \code{AptNames}, or \code{SeqIds}. For \code{\link[=seqid2apt]{seqid2apt()}}, a vector \emph{of} \code{SeqIds}.
For \code{\link[=apt2seqid]{apt2seqid()}}, a character vector \emph{containing} \code{SeqIds}.
For \code{\link[=matchSeqIds]{matchSeqIds()}}, a vector of pattern matches containing \code{SeqIds}.
Can be \code{AptNames} with \code{GeneIDs}, the \code{seq.XXXX} format,
or even "naked" \code{SeqIds}.}

\item{trim.version}{Logical. Whether to remove the version number,
i.e. "1234-56_7" -> "1234-56". Primarily for legacy ADATs.}

\item{trailing}{Logical. Should the regular expression explicitly specify
\emph{trailing} \code{SeqId} pattern match, i.e. \code{"regex$"}?
This is the most common case and the default.}

\item{y}{Character. A second vector of \code{AptNames} containing \code{SeqIds}
to match against those contained in \code{x}.
For \code{\link[=matchSeqIds]{matchSeqIds()}} these values are returned if there are matching elements.}

\item{order.by.x}{Logical. Order the returned character string by
the \code{x} (first) argument?}

\item{show}{Logical. Return the data frame visibly?}
}
\value{
\code{\link[=getSeqId]{getSeqId()}}: a character vector of \code{SeqIds} captured from a string.

\code{\link[=regexSeqId]{regexSeqId()}}: a regular expression (\code{regex}) string
pre-defined to match the SomaLogic \code{SeqId} pattern.

\code{\link[=locateSeqId]{locateSeqId()}}: a data frame containing the \code{start} and \code{stop}
integer positions for \code{SeqId} matches at each value of \code{x}.

\code{\link[=seqid2apt]{seqid2apt()}}: a character vector with the \verb{seq.*} prefix, i.e.
the inverse of \code{\link[=getSeqId]{getSeqId()}}.

\code{\link[=apt2seqid]{apt2seqid()}}: a character vector of \code{SeqIds}. \code{\link[=is.SeqId]{is.SeqId()}} will
return \code{TRUE} for all elements.

\code{\link[=is.apt]{is.apt()}}, \code{\link[=is.SeqId]{is.SeqId()}}, \code{\link[=is.AptName]{is.AptName()}}: Logical. \code{TRUE} or \code{FALSE}.

\code{\link[=matchSeqIds]{matchSeqIds()}}: a character string corresponding to values
in \code{y} of the intersect of \code{x} and \code{y}. If no matches are
found, \code{character(0)}.

\code{\link[=getSeqIdMatches]{getSeqIdMatches()}}: a \eqn{n \times 2}{n x 2} data frame, where \code{n} is the
length of the intersect of the matching \code{SeqIds}.
The data frame is named by the passed arguments, \code{x} and \code{y}.
}
\description{
The \code{SeqId} is the cornerstone used to uniquely identify
SomaLogic analytes.
\code{SeqIds} follow the format \strong{\verb{<Pool>-<Clone>_<Version>}}, for example
\code{"1234-56_7"} can be represented as:
\tabular{ccc}{
\strong{Pool} \tab \strong{Clone} \tab \strong{Version} \cr
\code{1234}   \tab \code{56}      \tab \code{7}
}
See \strong{Details} below for the definition of each sub-unit.
The \strong{\verb{<Pool>-<Clone>}} combination is sufficient to uniquely identify a
specific analyte and therefore versions are no longer provided (though
they may be present in legacy ADATs).
The tools below enable users to extract, test, identify, compare,
and manipulate \code{SeqIds} across assay runs and/or versions.
}
\details{
\tabular{ll}{
\strong{Pool:}    \tab ties back to the original well during \strong{SELEX} \cr
\strong{Clone:}   \tab ties to the specific sequence within a pool \cr
\strong{Version:} \tab refers to custom modifications (optional/defunct)
}
\describe{
\item{\code{AptName}}{a \code{SeqId} combined with a string, usually a \code{GeneId}- or
\code{seq.}-prefix, for convenient, human-readable
manipulation from within \code{R}.}
}
}
\section{Functions}{
\itemize{
\item \code{getSeqId()}: extracts/captures the \code{SeqId} match from an analyte column identifier,
i.e. column name of an ADAT loaded with \code{\link[=read_adat]{read_adat()}}. Assumes the
\code{SeqId} pattern occurs at the end of the string, which for
the vast majority of cases will be true. For edge cases, see the
\code{trailing} argument to \code{\link[=locateSeqId]{locateSeqId()}}.

\item \code{regexSeqId()}: generates a pre-formatted regular expression for
matching of \code{SeqIds}. Note the \emph{trailing} match, which is most
commonly required, but \code{\link[=locateSeqId]{locateSeqId()}} offers
an alternative to match \emph{anywhere} in a string.
Used internally in \emph{many} utility functions.

\item \code{locateSeqId()}: generates a data frame of the positional \code{SeqId} matches. Specifically
designed to facilitate \code{SeqId} extraction via \code{\link[=substr]{substr()}}.
Similar to \code{\link[stringr:str_locate]{stringr::str_locate()}}.

\item \code{seqid2apt()}: converts a \code{SeqId} into anonymous-AptName format, i.e.
\code{1234-56} -> \code{seq.1234.56}. Version numbers (\verb{1234-56_ver})
are always trimmed when present.

\item \code{apt2seqid()}: converts an anonymous-AptName into \code{SeqId} format, i.e.
\code{seq.1234.56} -> \code{1234-56}. Version numbers (\code{seq.1234.56.ver})
are always trimmed when present.

\item \code{is.apt()}: regular expression match to determine if a string \emph{contains}
a \code{SeqId}, and thus is probably an \code{AptName} format string. Both
legacy \code{EntrezGeneSymbol-SeqId} combinations or newer
so-called \code{"anonymous-AptNames"} formats (\code{seq.1234.45}) are matched.

\item \code{is.SeqId()}: tests for \code{SeqId} format, i.e. values returned from \code{\link[=getSeqId]{getSeqId()}}
will always return \code{TRUE}.

\item \code{is.AptName()}: tests for \code{AptName} format, i.e. values returned from \code{\link[=seqid2apt]{seqid2apt()}}
will always return \code{TRUE}. This function will only match \code{AptNames}, not
\code{SeqIds}, and is therefore more strict than \code{is.apt()}.

\item \code{matchSeqIds()}: matches two character vectors on the basis of their
intersecting \code{SeqIds}. Note that elements in \code{y} not
containing a \code{SeqId} regular expression are silently dropped.

\item \code{getSeqIdMatches()}: matches two character vectors on the basis of their intersecting \emph{SeqIds}
only (irrespective of the \code{GeneID}-prefix). This produces a two-column
data frame which can then be used to map between the two sets.

The final order of the matches/rows is by the input
corresponding to the \emph{first} argument (\code{x}).

By default the data frame is invisibly returned to
avoid dumping excess output to the console (see the \verb{show =} argument).

}}
\examples{
x <- c("ABDC.3948.48.2", "3948.88",
       "3948.48.2", "3948-48_2", "3948.48.2",
       "3948-48_2", "3948-88",
       "My.Favorite.Apt.3948.88.9")

tibble::tibble(orig       = x,
               SeqId      = getSeqId(x),
               SeqId_trim = getSeqId(x, TRUE),
               AptName    = seqid2apt(SeqId))

# Logical Matching
is.apt("AGR2.4959.2") # TRUE
is.apt("seq.4959.2")  # TRUE
is.apt("4959-2")      # TRUE
is.apt("AGR2")        # FALSE


# SeqId Matching
x <- c("seq.4554.56", "seq.3714.49", "PlateId")
y <- c("Group", "3714-49", "Assay", "4554-56")
matchSeqIds(x, y)
matchSeqIds(x, y, order.by.x = FALSE)

# vector of features
feats <- getAnalytes(example_data)

match_df <- getSeqIdMatches(feats[1:100], feats[90:500])  # 11 overlapping
match_df

a <- utils::head(feats, 15)
b <- withr::with_seed(99, sample(getSeqId(a)))   # => SeqId & shuffle
(getSeqIdMatches(a, b))                          # sorted by first vector "a"
}
\seealso{
\code{\link[generics:setops]{generics::intersect()}}
}
\author{
Stu Field
}
