

#' Nested cross-validation for caret
#'
#' This function applies nested cross-validation (CV) to training of models
#' using the `caret` package. The function also allows the option of embedded
#' filtering of predictors for feature selection nested within the outer loop of
#' CV. Predictions on the outer test folds are brought back together and error
#' estimation/ accuracy determined. The default is 10x10 nested CV.
#'
#' @param y Response vector. For classification this should be a factor.
#' @param x Matrix or dataframe of predictors
#' @param filterFUN Filter function, e.g. [ttest_filter] or [relieff_filter].
#'   Any function can be provided and is passed `y` and `x`. Must return a
#'   character vector with names of filtered predictors.
#' @param filter_options List of additional arguments passed to the filter
#'   function specified by `filterFUN`.
#' @param weights Weights applied to each sample for models which can use
#'   weights. Note `weights` and `balance` cannot be used at the same time.
#'   Weights are not applied in filters.
#' @param balance Specifies method for dealing with imbalanced class data.
#'   Current options are `"randomsample"` or `"smote"`. See [randomsample()] and
#'   [smote()]
#' @param balance_options List of additional arguments passed to the balancing
#'   function
#' @param outer_method String of either `"cv"` or `"LOOCV"` specifying whether
#'   to do k-fold CV or leave one out CV (LOOCV) for the outer folds
#' @param n_outer_folds Number of outer CV folds
#' @param outer_folds Optional list containing indices of test folds for outer
#'   CV. If supplied, `n_outer_folds` is ignored.
#' @param metric A string that specifies what summary metric will be used to
#'   select the optimal model. By default, "logLoss" is used for classification
#'   and "RMSE" is used for regression. Note this differs from the default
#'   setting in caret which uses "Accuracy" for classification. See details.
#' @param trControl A list of values generated by the `caret` function
#'   [trainControl]. This defines how inner CV training through `caret` is
#'   performed. Default for the inner loop is 10-fold CV. See
#'   http://topepo.github.io/caret/using-your-own-model-in-train.html.
#' @param tuneGrid Data frame of tuning values, see [caret::train].
#' @param savePredictions Indicates whether hold-out predictions for each inner
#'   CV fold should be saved for ROC curves, accuracy etc see
#'   [caret::trainControl]. Default is `"final"` to capture predictions for
#'   inner CV ROC.
#' @param cv.cores Number of cores for parallel processing of the outer loops.
#'   NOTE: this uses `parallel::mclapply` on unix/mac and `parallel::parLapply`
#'   on windows.
#' @param finalCV Logical whether to perform one last round of CV on the whole
#'   dataset to determine the final model parameters. If set to `FALSE`, the
#'   median of the best hyperparameters from outer CV folds for continuous/
#'   ordinal hyperparameters, or highest voted for categorical hyperparameters,
#'   are used to fit the final model. Performance metrics are independent of
#'   this last step.
#' @param na.option Character value specifying how `NA`s are dealt with.
#'   `"omit"` is equivalent to `na.action = na.omit`. `"omitcol"` removes cases
#'   if there are `NA` in 'y', but columns (predictors) containing `NA` are
#'   removed from 'x' to preserve cases. Any other value means that `NA` are
#'   ignored (a message is given).
#' @param ... Arguments passed to [caret::train] including `method`
#' @return An object with S3 class "nestcv.train"
#'   \item{call}{the matched call}
#'   \item{output}{Predictions on the left-out outer folds}
#'   \item{outer_result}{List object of results from each outer fold containing
#'   predictions on left-out outer folds, caret result and number of filtered
#'   predictors at each fold.}
#'   \item{outer_folds}{List of indices of outer test folds}
#'   \item{dimx}{dimensions of `x`}
#'   \item{y}{original response vector}
#'   \item{yfinal}{final response vector (post-balancing)}
#'   \item{final_fit}{Final fitted caret model using best tune parameters}
#'   \item{final_vars}{Column names of filtered predictors entering final model}
#'   \item{summary_vars}{Summary statistics of filtered predictors}
#'   \item{roc}{ROC AUC for binary classification where available.}
#'   \item{trControl}{`caret::trainControl` object used for inner CV}
#'   \item{bestTunes}{best tuned parameters from each outer fold}
#'   \item{finalTune}{final parameters used for final model}
#'   \item{summary}{Overall performance summary. Accuracy and balanced accuracy
#'   for classification. ROC AUC for binary classification. RMSE for
#'   regression.}
#' @details
#' Parallelisation is performed on the outer folds using `parallel::mclapply` on
#' unix/mac and `parallel::parLapply` on windows.
#'
#' We strongly recommend that you try calls to `nestcv.train` with `cv.cores=1`
#' first. With `caret` this may flag up that specific packages are not installed
#' or that there are problems with input variables `y` and `x` which may have to
#' be corrected for the call to run in multicore mode.
#'   
#'   If the outer folds are run using parallelisation, then parallelisation in
#'   caret must be off, otherwise an error will be generated. Alternatively if
#'   you wish to use parallelisation in caret, then parallelisation in
#'   `nestcv.train` can be fully disabled by leaving `cv.cores = 1`.
#'   
#'   For classification, `metric` defaults to using 'logLoss' with the
#'   `trControl` arguments `classProbs = TRUE, summaryFunction = mnLogLoss`,
#'   rather than 'Accuracy' which is the default classification metric in
#'   `caret`. See [trainControl]. LogLoss is arguably more consistent than
#'   Accuracy for tuning parameters in datasets with small sample size.
#'
#'   Models can be fitted with a single set of fixed parameters, in which case
#'   `trControl` defaults to `trainControl(method = "none")` which disables
#'   inner CV as it is unnecessary. See
#'   https://topepo.github.io/caret/model-training-and-tuning.html#fitting-models-without-parameter-tuning
#'
#' @author Myles Lewis
#' @examples
#' \donttest{
#' ## sigmoid function
#' sigmoid <- function(x) {1 / (1 + exp(-x))}
#' 
#' ## load iris dataset and simulate a binary outcome
#' data(iris)
#' x <- iris[, 1:4]
#' colnames(x) <- c("marker1", "marker2", "marker3", "marker4")
#' x <- as.data.frame(apply(x, 2, scale))
#' y2 <- sigmoid(0.5 * x$marker1 + 2 * x$marker2) > runif(nrow(x))
#' y2 <- factor(y2, labels = c("class1", "class2"))
#' 
#' ## Example using random forest with caret
#' cvrf <- nestcv.train(y2, x, method = "rf",
#'                      n_outer_folds = 3,
#'                      cv.cores = 2)
#' summary(cvrf)
#' 
#' ## Example of glmnet tuned using caret
#' ## set up small tuning grid for quick execution
#' ## length.out of 20-100 is usually recommended for lambda
#' ## and more alpha values ranging from 0-1
#' tg <- expand.grid(lambda = exp(seq(log(2e-3), log(1e0), length.out = 5)),
#'                   alpha = 1)
#' 
#' ncv <- nestcv.train(y = y2, x = x,
#'                     method = "glmnet",
#'                     n_outer_folds = 3,
#'                     tuneGrid = tg, cv.cores = 2)
#' summary(ncv)
#' 
#' ## plot tuning for outer fold #1
#' plot(ncv$outer_result[[1]]$fit, xTrans = log)
#' 
#' ## plot final ROC curve
#' plot(ncv$roc)
#' 
#' ## plot ROC for left-out inner folds
#' inroc <- innercv_roc(ncv)
#' plot(inroc)
#' }
#' @importFrom caret createFolds train trainControl mnLogLoss confusionMatrix
#'   defaultSummary
#' @importFrom data.table rbindlist
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach registerDoSEQ
#' @importFrom parallel mclapply
#' @importFrom pROC roc
#' @importFrom stats predict setNames
#' @export
#' 
nestcv.train <- function(y, x,
                         filterFUN = NULL,
                         filter_options = NULL,
                         weights = NULL,
                         balance = NULL,
                         balance_options = NULL,
                         outer_method = c("cv", "LOOCV"),
                         n_outer_folds = 10,
                         outer_folds = NULL,
                         cv.cores = 1,
                         metric = ifelse(is.factor(y), "logLoss", "RMSE"),
                         trControl = NULL,
                         tuneGrid = NULL,
                         savePredictions = "final",
                         finalCV = TRUE,
                         na.option = "pass",
                         ...) {
  nestcv.call <- match.call(expand.dots = TRUE)
  outer_method <- match.arg(outer_method)
  if (!is.null(balance) & is.numeric(y)) {
    stop("`balance` can only be used for classification")}
  ok <- checkxy(y, x, na.option, weights)
  y <- y[ok$r]
  x <- x[ok$r, ok$c]
  weights <- weights[ok$r]
  if (!is.null(balance) & !is.null(weights)) {
    stop("`balance` and `weights` cannot be used at the same time")}
  if (is.null(trControl)) {
    trControl <- if (is.factor(y)) {
      trainControl(method = "cv", 
                   number = 10,
                   classProbs = TRUE,
                   savePredictions = savePredictions,
                   summaryFunction = mnLogLoss)
    } else trainControl(method = "cv", 
                        number = 10,
                        savePredictions = savePredictions)
  }
  # switch off inner CV if tuneGrid is single row
  if (!is.null(tuneGrid)) {
    if (nrow(tuneGrid) == 1) trControl <- trainControl(method = "none", classProbs = TRUE)
  }
  if (is.null(outer_folds)) {
    outer_folds <- switch(outer_method,
                          cv = createFolds(y, k = n_outer_folds),
                          LOOCV = 1:length(y))
  }
  
  if (Sys.info()["sysname"] == "Windows" & cv.cores >= 2) {
    cl <- makeCluster(cv.cores)
    dots <- list(...)
    varlist <- c("outer_folds", "y", "x", "filterFUN", "filter_options",
                 "weights", "balance", "balance_options",
                 "metric", "trControl", "tuneGrid", "nestcv.trainCore", "dots")
    clusterExport(cl, varlist = varlist, envir = environment())
    outer_res <- parLapply(cl = cl, outer_folds, function(test) {
      args <- c(list(test=test, y=y, x=x,
                     filterFUN=filterFUN, filter_options=filter_options,
                     weights=weights, balance=balance,
                     balance_options=balance_options, metric=metric,
                     trControl=trControl, tuneGrid=tuneGrid), dots)
      do.call(nestcv.trainCore, args)
    })
    stopCluster(cl)
  } else {
    outer_res <- mclapply(outer_folds, function(test) {
      nestcv.trainCore(test, y, x,
                       filterFUN, filter_options,
                       weights, balance, balance_options,
                       metric, trControl, tuneGrid, ...)
    }, mc.cores = cv.cores)
  }
  
  predslist <- lapply(outer_res, '[[', 'preds')
  output <- data.table::rbindlist(predslist)
  output <- as.data.frame(output)
  if (!is.null(rownames(x))) {
    rownames(output) <- unlist(lapply(predslist, rownames))}
  summary <- predSummary(output)
  caret.roc <- NULL
  if (is.factor(y) & nlevels(y) == 2) {
    caret.roc <- pROC::roc(output$testy, output$predyp, direction = "<", 
                           quiet = TRUE)
  }
  bestTunes <- lapply(outer_res, function(i) i$fit$bestTune)
  bestTunes <- as.data.frame(data.table::rbindlist(bestTunes))
  rownames(bestTunes) <- paste('Fold', seq_len(nrow(bestTunes)))
  
  dat <- nest_filt_bal(NULL, y, x, filterFUN, filter_options,
                       balance, balance_options)
  yfinal <- dat$ytrain
  filtx <- dat$filt_xtrain
  
  if (finalCV) {
    # use CV on whole data to finalise parameters
    if (cv.cores >= 2) {
      if (Sys.info()["sysname"] == "Windows") {
        cl <- makeCluster(cv.cores)
        registerDoParallel(cl)
        on.exit({stopCluster(cl)
          foreach::registerDoSEQ()})
      } else {
        # unix
        registerDoParallel(cores = cv.cores)
        on.exit(foreach::registerDoSEQ())
      }
    }
    final_fit <- caret::train(x = filtx, y = yfinal,
                              weights = weights,
                              metric = metric,
                              trControl = trControl,
                              tuneGrid = tuneGrid, ...)
    finalTune <- final_fit$bestTune
  } else {
    # use outer folds for final parameters, fit single final model
    finalTune <- finaliseTune(bestTunes)
    fitControl <- trainControl(method = "none", classProbs = is.factor(y))
    final_fit <- caret::train(x = filtx, y = yfinal,
                              weights = weights,
                              trControl = fitControl,
                              tuneGrid = finalTune, ...)
  }
  
  out <- list(call = nestcv.call,
              output = output,
              outer_result = outer_res,
              outer_method = outer_method,
              outer_folds = outer_folds,
              dimx = dim(x),
              y = y,
              yfinal = yfinal,
              final_fit = final_fit,
              final_vars = colnames(filtx),
              roc = caret.roc,
              trControl = trControl,
              bestTunes = bestTunes,
              finalTune = finalTune,
              summary = summary)
  class(out) <- "nestcv.train"
  out
}


nestcv.trainCore <- function(test, y, x,
                             filterFUN, filter_options,
                             weights, balance, balance_options,
                             metric, trControl, tuneGrid, ...) {
  dat <- nest_filt_bal(test, y, x, filterFUN, filter_options,
                       balance, balance_options)
  ytrain <- dat$ytrain
  ytest <- dat$ytest
  filt_xtrain <- dat$filt_xtrain
  filt_xtest <- dat$filt_xtest
  
  fit <- caret::train(x = filt_xtrain, y = ytrain,
                      weights = weights[-test],
                      metric = metric,
                      trControl = trControl,
                      tuneGrid = tuneGrid, ...)
  predy <- predict(fit, newdata = filt_xtest)
  preds <- data.frame(predy=predy, testy=ytest)
  if (is.factor(y)) {
    predyp <- predict(fit, newdata = filt_xtest, type = "prob")
    # note predyp has 2 columns
    preds$predyp <- predyp[,2]
  }
  rownames(preds) <- rownames(filt_xtest)
  ret <- list(preds = preds,
              fit = fit,
              nfilter = ncol(filt_xtrain))
  ret
}


# finalise the caret model tuning from bestTune dataframe
#' @importFrom stats median
finaliseTune <- function(x) {
  fintune <- lapply(colnames(x), function(i) {
    if (is.numeric(x[, i])) return(median(x[, i]))
    tab <- table(x[, i])
    names(tab)[which.max(tab)]  # majority vote for factors
  })
  names(fintune) <- colnames(x)
  data.frame(fintune, check.names = FALSE)
}


#' @export
summary.nestcv.train <- function(object, 
                                 digits = max(3L, getOption("digits") - 3L), 
                                 ...) {
  cat("Nested cross-validation with caret\n")
  if (!is.null(object$call$filterFUN)) 
    cat("Filter: ", object$call$filterFUN, "\n") else cat("No filter\n")
  cat("Outer loop: ", switch(object$outer_method,
                             cv = paste0(length(object$outer_folds), "-fold cv\n"),
                             LOOCV = "leave-one-out CV\n"))
  cat("Inner loop: ", paste0(object$trControl$number, "-fold ",
                             object$trControl$method, "\n"))
  balance <- object$call$balance
  if (!is.null(balance)) {
    cat("Balancing: ", balance, "\n")
  }
  cat(object$dimx[1], "observations,", object$dimx[2], "predictors\n")
  if (!is.numeric(object$y)) print(c(table(object$y)))
  cat("\n")
  nfilter <- unlist(lapply(object$outer_result, '[[', 'nfilter'))
  foldres <- object$bestTunes
  foldres$n.filter <- nfilter
  print(foldres, digits = digits, print.gap = 2L)
  cat("\nFinal parameters:\n")
  print(object$finalTune, digits = digits, print.gap = 2L, row.names = FALSE)
  cat("\nResult:\n")
  print(object$summary, digits = digits, print.gap = 2L)
  out <- list(dimx = object$dimx, folds = foldres,
              final_param = object$finalTune, result = object$summary)
  invisible(out)
}


#' @method predict nestcv.train
#' @export
predict.nestcv.train <- function(object, newdata, ...) {
  if (any(!object$final_vars %in% colnames(newdata))) 
    stop("newdata is missing some predictors", call. = FALSE)
  predict(object$final_fit, newdata = newdata[, object$final_vars], ...)
}


#' Summarise variables
#' 
#' @param x Matrix or dataframe with variables in columns
#' @return A matrix with variables in rows and mean, median and SD for each
#'   variable or number of levels if the variable is a factor. If `NA` are
#'   detected, an extra column `n.NA` is added with the numbers of `NA` for each
#'   variable.
#' @importFrom Rfast colMedians colVars
#' @export
#' 
summary_vars <- function(x) {
  if (is.matrix(x)) {
    if (!is.numeric(x)) return("Not numeric")
    mat <- x
    selCols <- TRUE
  } else {
    selCols <- unlist(lapply(x, is.numeric))
    mat <- as.matrix(x[, selCols])
  }
  summary_mat <- cbind(colMeans(mat, na.rm = TRUE), colMedians(mat, na.rm = TRUE),
                       colVars(mat, std = TRUE, na.rm = TRUE))
  colnames(summary_mat) <- c("mean", "median", "sd")
  if (any(!selCols)) {
    nlevels_mat <- unlist(lapply(x[, !selCols], function(i) nlevels(as.factor(i))))
    out <- matrix(NA, nrow = ncol(x), ncol = 4,
                  dimnames = list(colnames(x), c("mean", "median", "sd", "nlevels")))
    out[selCols, 1:3] <- summary_mat
    out[!selCols, 4] <- nlevels_mat
  } else out <- summary_mat
  n.NA <- apply(x, 2, function(i) sum(is.na(i)))
  if (any(n.NA > 0)) out <- cbind(out, n.NA)
  out
}

