#' Calculate the distance matrices using the BAM files
#' @description This function generates an average correlation/distance coefficient
#' for every exon present in the BAM files. This is done by calculating
#' the point-to-point correlation/distance of the distribution of reads
#' across the transcript of each exon and comparing it across samples.
#' @param path.bams,bams either a path to the directory where the BAM files are
#' or a vector of paths to each individual file; if a path is specified,
#' it extracts all files that end in .bam; looks in the working directory by default
#' @param path.gtf the path to the gtf/gff annotation file;
#' if unspecified, looks for one in the working directory
#' @param genes a tibble of the genes extracted from the gtf file;
#' this is meant for speed if the output of cast_gtf_to_genes() is already generated
#' @param expression.matrix expression matrix; not necessary but is used to filter the
#' gtf to fewer entries and for subsampling if subsample.genes=TRUE;
#' if not provided, raw read counts are extracted from the BAM files
#' @param subsample.genes logical, whether to subsample low abundance genes to decrease
#' computational time; the first minimum of the distribution of abundances is calculated,
#' and genes lower than it are subsampled to match the number of genes higher than it;
#' a plot is generated to show that minimum
#' @param make.index whether a BAM index should be generated; if this is FALSE (the default)
#' and no index exists, the function will exit with an error; the index needs to have
#' the same name as each BAM file, but ending with .bam.bai
#' @param uniqueOnly whether only uniquely mapped reads should contribute to the profile;
#' default is TRUE
#' @param mapq.unique The values of the mapping quality field in the BAM file that corresponds
#' to uniquely mapped reads; by default, values of 255 are used as these correspond to
#' the most popular aligners, but an adjustment might be needed;
#' the mapq scores should be as follows: 255 for STAR, 60 for hisat2,
#' 255 for bowtie in -k mode, 40 for bowtie2 default, 50 for tophat
#' @param slack slack needs to be >=readLength, adjust for efficiency; the default is 200,
#' as it is higher than most modern sequencing experiments
#' @param method one of the distance metrics to be used, defaults to pearson correlation
#'        CURRENTLY ONLY CORRELATIONS!
#' @param save.image.every.1000 whether to save a workspace image after every 1000 exons
#' are processed; default is FALSE
#' @param ncores Number of cores to use for parallel computation; needs doParallel installed;
#' defaults to sequential computation
#' @return A list with three elements: the first element is the expression matrix,
#'         as supplied or calculated; the other two are the abundance and distance matrix;
#'         they have the same # of columns as the expression matrix,
#'         and as many rows as genes processed.
#' @export
#' @examples
#' bams <- rep(system.file("extdata", "ex1.bam", package="Rsamtools", mustWork=TRUE), 2)
#' genes <- data.frame("id" = 1:2,
#'                     "gene_id" = c("gene1", "gene2"),
#'                     "seqid" = c("seq1", "seq2"),
#'                     "start" = 1,
#'                     "end" = 1600)
#' obj <- calculate_distance_matrices_transcript(
#'   bams = bams,
#'   genes = genes,
#'   mapq.unique = 99
#' )

calculate_distance_matrices_transcript <- function(
  path.bams=".",
  bams=NULL,
  path.gtf=list.files(".", pattern="\\.g[tf]f$"),
  genes=NULL,
  expression.matrix=NULL,
  subsample.genes=FALSE,
  make.index=FALSE,
  uniqueOnly=TRUE,
  mapq.unique=255,
  slack=200,
  method = "correlation_pearson",
  save.image.every.1000=FALSE,
  ncores=1){
  # Overview: for every gene, build the per-sample read profile from the BAM
  # files, correlate the profiles across samples, and store the average
  # correlation of each sample against the others. Returns a list with the
  # expression matrix ("exp") and, per sample, the abundances ("abn") and
  # average correlations ("dist") sorted by increasing abundance.

  # ---- Locate BAM files and make sure each one has an index ----
  if(base::is.null(bams)){
    # The dot is escaped so that only a literal ".bam" extension matches.
    bams <- base::list.files(path.bams, pattern="\\.bam$", full.names=TRUE)
  }
  if(base::length(bams)<2) base::stop("Please provide at least 2 BAM files")
  for(j in base::seq_along(bams)){
    bamIndex <- base::paste0(bams[j], ".bai")
    if(!base::file.exists(bamIndex)){
      if(make.index){
        # Sort first, then index the sorted copy; the sorted file replaces
        # the original entry in `bams` for the rest of the function.
        bams[j] <- Rsamtools::sortBam(bams[j], destination=base::paste0(bams[j], ".sorted"))
        Rsamtools::indexBam(bams[j])
      }else{
        stop("BAM index not found. It can be generated by setting make.index=TRUE")
      }
    }
  }

  # ---- Gene table ----
  if(base::is.null(genes)){
    base::message("Creating gene table from gtf file...")
    genes <- noisyr::cast_gtf_to_genes(path.gtf)
    base::message("Done")
  }
  if(base::nrow(genes)<2) base::stop("Please provide at least 2 genes")

  # ---- Restrict (and optionally subsample) genes via the expression matrix ----
  if(!base::is.null(expression.matrix)){
    # NOTE(review): row names of expression.matrix that are absent from
    # genes$gene_id produce NA rows here; confirm whether callers rely on that.
    genes.subset <- genes[base::match(base::rownames(expression.matrix),
                                      genes$gene_id),]
    if(subsample.genes){
      rSum <- rowSums(expression.matrix)
      densSumMin <- noisyr::calculate_threshold_fixed_density(rSum)
      high.ids <- base::names(rSum)[rSum>=densSumMin]
      low.ids <- base::names(rSum)[rSum<densSumMin]
      # Sample as many low-abundance genes as there are high-abundance ones,
      # capped at the number available so sample() cannot fail when the
      # low-abundance group is the smaller one.
      n.sample <- base::min(base::sum(rSum>=densSumMin), base::length(low.ids))
      subsampled.gene.ids <- c(high.ids, base::sample(low.ids, size=n.sample))
      genes.subset <- genes.subset[genes.subset$gene_id %in% subsampled.gene.ids,]
    }
  }else{
    genes.subset <- genes
  }

  ngenes <- base::nrow(genes.subset)
  # Without a supplied expression matrix, it is filled in from the values
  # returned by calculate_profile() while looping over the genes.
  if(base::is.null(expression.matrix)){
    calculate.matrix <- TRUE
    expression.matrix <- base::matrix(nrow=ngenes, ncol=base::length(bams))
    base::rownames(expression.matrix) <- genes.subset$gene_id
  }else{
    calculate.matrix <- FALSE
  }

  # ---- Parse the method: "<family>_<name>", only correlations supported ----
  use.corr.dist <- base::strsplit(method, "_")[[1]][1]
  if(use.corr.dist!="correlation")
    stop(paste("Distance measures are currently not supported for transcript approach.",
               "Please use a correlation measure instead."))
  base.method <- base::sub(paste0(use.corr.dist,"_"), "", method)

  dist.mat <- base::matrix(nrow=ngenes,
                           ncol=base::length(bams))
  base::message("Calculating distances...")
  start_time <- base::Sys.time()
  # The parallel path needs at least two genes: with a single iteration the
  # rbind-combined result is a plain vector rather than a matrix.
  if(ncores>1 && ngenes>1){
    doParallel::registerDoParallel(cores=ncores)
    # Release any implicitly created cluster when the function exits.
    base::on.exit(doParallel::stopImplicitCluster(), add=TRUE)
    # Bind the operator locally so the call works without foreach attached.
    `%dopar%` <- foreach::`%dopar%`
    # Each iteration returns one row: average correlations followed by the
    # expression values of that gene; rows are combined in gene order.
    dist.exp.mats <- foreach::foreach(
      n=base::seq_len(ngenes),
      .combine=rbind,
      .inorder=TRUE) %dopar% {
        obj <- noisyr::calculate_profile(gene=genes.subset[n,],
                                         bams=bams,
                                         uniqueOnly=uniqueOnly,
                                         mapq.unique=mapq.unique,
                                         slack=slack)
        profile <- obj$profile
        cors <- stats::cor(base::unname(profile), method=base.method)
        # Mask correlations equal to 1 (self-correlations on the diagonal).
        # NOTE(review): this also masks exact 1s between different samples.
        cors[cors==1] <- NA
        avCors <- base::colMeans(cors, na.rm=TRUE)
        # Columns with no usable correlation average to NaN; report NA instead.
        avCors[base::is.nan(avCors)] <- NA
        if(n%%1000==0){
          if(save.image.every.1000){
            base::save.image()
          }
        }
        c(avCors, obj$exp)
      }
    nbams <- base::length(bams)
    dist.mat[] <- dist.exp.mats[, base::seq_len(nbams)]
    if(calculate.matrix) expression.matrix[] <-
      dist.exp.mats[, (nbams+1):(2*nbams)]

  }else{
    for(n in base::seq_len(ngenes)){
      obj <- noisyr::calculate_profile(gene=genes.subset[n,],
                                       bams=bams,
                                       uniqueOnly=uniqueOnly,
                                       mapq.unique=mapq.unique,
                                       slack=slack)
      profile <- obj$profile
      if(calculate.matrix) expression.matrix[n,] <- obj$exp

      cors <- stats::cor(base::unname(profile), method=base.method)
      # Mask correlations equal to 1 (self-correlations on the diagonal).
      # NOTE(review): this also masks exact 1s between different samples.
      cors[cors==1] <- NA
      avCors <- base::colMeans(cors, na.rm=TRUE)
      # Columns with no usable correlation average to NaN; report NA instead.
      avCors[base::is.nan(avCors)] <- NA
      dist.mat[n,] <- avCors
      if(n%%1000==0){
        if(save.image.every.1000){
          base::save.image()
        }
        part_time <- base::Sys.time()
        base::message("Done ", n, " genes out of ", ngenes)
        time_elapsed <- part_time - start_time
        base::message("Time elapsed: ", base::round(time_elapsed, 2),
                      base::units(time_elapsed))
      }
    }
  }
  if(save.image.every.1000){
    base::save.image()
  }
  end_time <- base::Sys.time()
  base::message("Finished ", ngenes, " genes")
  time_elapsed <- end_time - start_time
  base::message("Time elapsed: ", base::round(time_elapsed, 2),
                base::units(time_elapsed))

  # ---- Sort abundance and correlation per sample by increasing abundance ----
  # drop=FALSE keeps a matrix even when a single gene was processed.
  abn.mat <- expression.matrix[base::match(genes.subset$gene_id,
                                           base::rownames(expression.matrix)),
                               , drop=FALSE]
  abn.mat <- base::unname(abn.mat)
  abn.mat.sort <- abn.mat
  dist.mat.sort <- dist.mat
  for(j in base::seq_len(base::ncol(abn.mat))){
    ordering <- base::order(abn.mat[,j])
    abn.mat.sort[,j] <- abn.mat[ordering,j]
    dist.mat.sort[,j] <- dist.mat[ordering,j]
  }

  # ---- Invert the values if the chosen method is flagged as a dissimilarity ----
  method.type <- noisyr::get_methods_correlation_distance(names=FALSE)[
    base::match(method, noisyr::get_methods_correlation_distance())]
  # isTRUE() guards against an NA lookup when the method is not listed.
  if(base::isTRUE(method.type == "d")){
    base::message("Chosen method ", method, " is a dissimilarity measure, outputting inverse...")
    dist.mat.sort <- 1/dist.mat.sort
  }

  base::message("Finished")
  returnObject <- base::list("exp" = expression.matrix,
                             "abn" = abn.mat.sort,
                             "dist" = dist.mat.sort)
  return(returnObject)
}
