% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/gc_cal.R
\name{gc_cal}
\alias{gc_cal}
\title{Identify and Extract Gene Clusters from Scaled BLAST Data}
\usage{
gc_cal(
  Data = bin_genes,
  in_gene_list = photosynthesis_gene_list,
  AllGeneNum = 30,
  MinConSeq = 15
)
}
\arguments{
\item{Data}{A data frame produced by \code{\link{orf_extract}} (i.e., a scaled
BLAST table).  Must include the columns \code{genome_contig},
\code{gene}, and \code{orf_position}.}

\item{in_gene_list}{A character vector of \dQuote{reference} gene symbols (e.g.,
\code{photosynthesis_gene_list}) that are expected
to appear in the target cluster(s).}

\item{AllGeneNum}{Integer.  Maximum total ORF count (annotated plus hypothetical)
that the algorithm is allowed to span when defining a cluster
(default: 30).}

\item{MinConSeq}{Integer.  Minimum number of \strong{reference genes} that must be
present \strong{and consecutive} within the candidate cluster
(default: 15).  Must satisfy \code{1 <= MinConSeq <= AllGeneNum}.}
}
\value{
A data frame with the same columns as \code{Data}, filtered to
contain only those rows that belong to valid clusters, plus an extra
column \code{gene_cluster} (format: \code{genome_contig---N}) that
uniquely labels every cluster.
}
\description{
This function screens contigs for regions that contain a
pre-defined set of \dQuote{reference} genes (e.g., photosynthetic genes, viral genes)
arranged in a continuous block.  Contigs are
first coarsely filtered by the minimum number of reference genes
they carry, then finely scanned for clusters that satisfy
user-defined density and contiguity criteria.  Each detected cluster
is returned with a unique \code{gene_cluster} identifier.
}
\details{
\enumerate{
\item \strong{Coarse filter}:  Contigs with fewer than \code{MinConSeq} reference
genes are discarded.
\item \strong{Fine scan}:  For each remaining contig, the algorithm slides a
window that can encompass up to \code{AllGeneNum} consecutive ORFs
and retains windows that contain at least \code{MinConSeq} reference
genes in uninterrupted order.
\item \strong{Cluster labelling}:  Each valid cluster receives a unique ID
(\code{genome_contig---1}, \code{genome_contig---2}, \dots).
}
}
