#' @export fixed.differences
#' 
#' @title Fixed Differences
#' @description Summarize fixed base pair differences between strata.
#' 
#' @param g a \code{\link{gtypes}} object.
#' @param count.indels logical. Count indels when evaluating sites for fixed differences?
#' @param consec.indels.as.one logical. If \code{count.indels} is TRUE, count consecutive indels as a single indel?
#' @param bases a character vector of valid bases to consider.
#' @param num.cores number of CPU cores to use. Value is passed to \code{\link[parallel]{mclapply}}.
#' 
#' @return a list with components:
#' \tabular{ll}{
#'   \code{sites} \tab list of sites with fixed differences for each pair of strata \cr
#'   \code{num.fixed} \tab data.frame of number of sites fixed between each pair of strata \cr
#' }
#' 
#' @author Eric Archer <eric.archer@@noaa.gov>

fixed.differences <- function(g, count.indels = TRUE, consec.indels.as.one = TRUE, bases = c("a", "c", "g", "t", "-"), num.cores = 1) {
  # Summarize, for every pair of strata in 'g', the sites at which each
  # stratum is fixed and the two strata differ.
  #
  # Returns a list with:
  #   sites     - named list ("<strata.1> v. <strata.2>") of 2-row character
  #               matrices holding the fixed states at the differing sites
  #   num.fixed - data.frame with columns strata.1, strata.2, num.fixed
  stopifnot.gtypes(g, "haploid")
  stopifnot.aligned(g$sequences)

  if (length(unique(decode.strata(g))) == 1) stop("Only one strata in 'gtypes'")
  # the gap character has to be in 'bases' if indels are to be counted
  if (count.indels) bases <- sort(unique(c(bases, "-")))

  # Set the 'mc.cores' option for the duration of this call only. Registering
  # the restore with on.exit() guarantees the caller's setting is put back
  # even if an error is raised below.
  opt <- options(mc.cores = num.cores)
  on.exit(options(opt), add = TRUE)

  # get fixed sites for each strata
  strata.gtypes <- strata.split(g, remove.sequences = TRUE)
  strata.fixed.sites <- lapply(strata.gtypes, function(strata) {
    fixed.sites(strata$sequences, bases)
  })

  # for each pair of strata, return matrix of sites with fixed differences
  # fixed differences are sites which aren't variable in a strata and
  # different between strata
  strata.pairs <- combn(names(strata.gtypes), 2)
  pair.names <- paste(strata.pairs[1, ], strata.pairs[2, ], sep = " v. ")
  pair.fixed.diff <- mclapply(seq_len(ncol(strata.pairs)), function(p) {
    fixed.1 <- strata.fixed.sites[[strata.pairs[1, p]]]
    fixed.2 <- strata.fixed.sites[[strata.pairs[2, p]]]
    # only sites fixed in both strata can be fixed differences
    shared.sites <- intersect(names(fixed.1), names(fixed.2))
    seq.mat <- rbind(fixed.1[shared.sites], fixed.2[shared.sites])
    rownames(seq.mat) <- strata.pairs[, p]
    if (ncol(seq.mat) == 0) return(list(sites = seq.mat, num.fixed = 0))

    # count fixed nucleotides (don't count sites with indels)
    nucs.diff <- apply(seq.mat, 2, function(x) {
      x[1] != x[2] && !any(x %in% c("-", "."))
    })
    sites <- which(nucs.diff)
    num.fixed <- sum(nucs.diff)

    # count fixed indels
    if (count.indels) {
      # sites where exactly one of the two strata is fixed for a gap
      is.diff.indel <- apply(seq.mat, 2, function(x) {
        any(x == "-") && x[1] != x[2]
      })
      indels.fixed <- if (consec.indels.as.one) {
        # mark non-shared indels as '*'
        indel.diff <- apply(seq.mat, 2, function(i) {
          has.indel <- i == "-"
          if (any(has.indel) && i[1] != i[2]) i[has.indel] <- "*"
          i
        })
        # count runs of non-shared indels in either sequence
        # NOTE(review): runs are taken over the columns of 'seq.mat', i.e. the
        # shared fixed sites only; indels separated by a site that was dropped
        # as not fixed would be merged into one run - confirm this is intended.
        f1.rle <- rle(indel.diff[1, ])
        f2.rle <- rle(indel.diff[2, ])
        sum(f1.rle$values == "*") + sum(f2.rle$values == "*")
      } else {
        sum(is.diff.indel)
      }
      sites <- sort(c(sites, which(is.diff.indel)))
      num.fixed <- sum(num.fixed, indels.fixed)
    }
    list(sites = seq.mat[, sites, drop = FALSE], num.fixed = num.fixed)
  })

  # mclapply() hands back a 'try-error' (or NULL) instead of raising when a
  # forked job fails; report that explicitly rather than failing later on '$'
  failed <- !vapply(pair.fixed.diff, is.list, logical(1))
  if (any(failed)) {
    stop(
      "fixed differences could not be computed for: ",
      paste(pair.names[failed], collapse = ", ")
    )
  }

  # compile sites with fixed differences
  sites <- lapply(pair.fixed.diff, function(x) x$sites)
  names(sites) <- pair.names

  # count number of fixed differences between pairs
  num.fixed <- vapply(pair.fixed.diff, function(x) x$num.fixed, numeric(1))
  strata.pairs <- data.frame(t(strata.pairs), num.fixed)
  colnames(strata.pairs) <- c("strata.1", "strata.2", "num.fixed")

  list(sites = sites, num.fixed = strata.pairs)
}