% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/vs_alignment_classification.R
\name{vs_alignment_classification}
\alias{vs_alignment_classification}
\alias{alignment_classification}
\alias{lca}
\alias{lca_classification}
\title{Taxonomic classification with LCA}
\usage{
vs_alignment_classification(
  fastx_input,
  database,
  lcaout = NULL,
  lca_cutoff = 1,
  top_hits_only = FALSE,
  gapopen = "20I/2E",
  gapext = "2I/1E",
  id = 0.7,
  strand = "plus",
  maxaccepts = 2,
  maxrejects = 32,
  threads = 1,
  vsearch_options = NULL,
  tmpdir = NULL
)
}
\arguments{
\item{fastx_input}{(Required). A FASTA/FASTQ file path or FASTA/FASTQ object.
See \emph{Details}.}

\item{database}{(Required). A FASTA/FASTQ file path or FASTA/FASTQ tibble
object containing the target sequences.}

\item{lcaout}{(Optional). A character string specifying the name of the
output file. If \code{NULL} (default), no output is
written to a file and the results are returned as a tibble with the columns
\code{query_id} and \code{taxonomy}.}

\item{lca_cutoff}{(Optional). Adjust the fraction of matching hits required
for the last common ancestor (LCA). Defaults to \code{1.0}, which requires
all hits to match at each taxonomic rank for that rank to be included. If a
lower cutoff value is used, e.g. \code{0.95}, a small fraction of
non-matching hits is tolerated and the rank is still reported. The value
must be between \code{0.5} and \code{1.0}.}

\item{top_hits_only}{(Optional). If \code{TRUE}, only the top hits with an
equally high percentage of identity between the query and database sequence
sets are written to the output. Defaults to \code{FALSE}.}

\item{gapopen}{(Optional). Penalties for gap opening. Defaults to
\code{"20I/2E"}. See \emph{Details}.}

\item{gapext}{(Optional). Penalties for gap extension. Defaults to
\code{"2I/1E"}. See \emph{Details}.}

\item{id}{(Optional). Pairwise identity threshold. Defines the minimum
identity required for matches. Defaults to \code{0.7}.}

\item{strand}{(Optional). Specifies which strand to consider when comparing
sequences. Can be either \code{"plus"} (default) or \code{"both"}.}

\item{maxaccepts}{(Optional). Maximum number of matching target sequences to
accept before stopping the search for a given query. Defaults to \code{2}.
Should be larger than \code{1}; with a single accepted hit there is nothing
to form a consensus from.}

\item{maxrejects}{(Optional). Maximum number of non-matching target sequences
to consider before stopping the search for a given query. Defaults to
\code{32}. If \code{maxaccepts} and \code{maxrejects} are both set to
\code{0}, the complete database is searched.}

\item{threads}{(Optional). Number of computational threads to be used by
\code{VSEARCH}. Defaults to \code{1}.}

\item{vsearch_options}{(Optional). Additional arguments to pass to
\code{VSEARCH}. Defaults to \code{NULL}. See \emph{Details}.}

\item{tmpdir}{(Optional). Path to the directory where temporary files should
be written when tables are used as input or output. Defaults to
\code{NULL}, which resolves to the session-specific temporary directory
(\code{tempdir()}).}
}
\value{
A tibble or \code{NULL}.

If \code{lcaout} is specified, the results are written to the specified file
and \code{NULL} is returned. If \code{lcaout} is \code{NULL}, a tibble is
returned.

The tibble contains the classification results for each query sequence.
Both the \code{Header} and \code{Sequence} columns of \code{fastx_input} are
copied into this table, along with one column for each taxonomic rank.
The ranks depend on the database file used, but are typically domain, phylum,
class, order, family, genus and species.
}
\description{
\code{vs_alignment_classification} assigns taxonomy by global
alignment and Last Common Ancestor (LCA) consensus of database hits using
\code{VSEARCH}.
}
\details{
Performs global sequence alignment against a reference database and assigns
taxonomy using the Last Common Ancestor (LCA) approach, reporting the deepest
taxonomic level supported by at least the fraction of hits given by
\code{lca_cutoff}.

\code{fastx_input} and \code{database} can either be file paths to
FASTA/FASTQ files or FASTA/FASTQ objects. FASTA objects are tibbles that
contain the columns \code{Header} and \code{Sequence}, see
\code{\link[microseq]{readFasta}}. FASTQ objects are tibbles that contain the
columns \code{Header}, \code{Sequence}, and \code{Quality}, see
\code{\link[microseq]{readFastq}}.

Pairwise identity (\code{id}) is calculated as the number of matching columns
divided by the alignment length excluding terminal gaps.

\code{vsearch_options} allows users to pass additional command-line arguments
to \code{VSEARCH} that are not directly supported by this function. Refer to
the \code{VSEARCH} manual for more details.

Visit the \code{VSEARCH}
\href{https://github.com/torognes/vsearch?tab=readme-ov-file#getting-help}{documentation}
for information about defining \code{gapopen} and \code{gapext}.
}
\examples{
\dontrun{
# Locate the example files shipped in the package's "extdata" directory.
# system.file() resolves the installed location without requiring the
# package to be attached.
db.file <- system.file("extdata", "sintax_db.fasta", package = "Rsearch")
fasta.file <- system.file("extdata", "small.fasta", package = "Rsearch")

# Classify the query sequences against the reference database. With the
# default lcaout = NULL the result is returned as a table.
tax.tbl <- vs_alignment_classification(fastx_input = fasta.file,
                                       database = db.file)
View(tax.tbl)
}

}
\references{
\url{https://github.com/torognes/vsearch}
}
