#' Feature Transformation -- VectorAssembler
#'
#' Assemble several input columns into one vector-valued column. Each row
#' element of the newly generated column is the vector obtained by
#' concatenating that row's elements from the specified input columns.
#'
#' @template roxlate-ml-transformation
#'
#' @export
ft_vector_assembler <- function(x,
                                input.col = NULL,
                                output.col = NULL,
                                ...) {
  # Runs before any argument is used; presumably reconciles legacy argument
  # spellings -- verify against the helper's definition.
  ml_backwards_compatibility_api()

  # Unlike the single-column transformers in this file, the assembler is
  # handed a list of input column names (one list element per name).
  invoke_simple_transformer(
    x,
    "org.apache.spark.ml.feature.VectorAssembler",
    list(
      setInputCols = as.list(as.character(input.col)),
      setOutputCol = ensure_scalar_character(output.col)
    )
  )
}

#' Feature Transformation -- StringIndexer
#'
#' Encode a column of labels into a column of label indices.
#' The indices are in [0, numLabels), ordered by label frequencies, with
#' the most frequent label assigned index 0. The transformation
#' can be reversed with \code{\link{ft_index_to_string}}.
#'
#' Both \code{input.col} and \code{output.col} must be scalar character
#' values; they are validated before the indexer is fitted.
#'
#' @template roxlate-ml-transformation
#'
#' @param params An (optional) \R environment -- when available,
#'   the index <-> label mapping generated by the string indexer
#'   will be injected into this environment under the \code{labels}
#'   key.
#'
#' @export
ft_string_indexer <- function(x,
                              input.col = NULL,
                              output.col = NULL,
                              params = NULL,
                              ...)
{
  # Runs before any argument is used; presumably reconciles legacy argument
  # spellings -- verify against the helper's definition.
  ml_backwards_compatibility_api()

  # Validate the column names the same way the other transformers in this
  # file do, so a bad value is reported from R instead of surfacing as a
  # failure while invoking the setters.
  input.col <- ensure_scalar_character(input.col)
  output.col <- ensure_scalar_character(output.col)

  df <- spark_dataframe(x)
  sc <- spark_connection(df)

  indexer <- invoke_new(
    sc,
    "org.apache.spark.ml.feature.StringIndexer"
  )

  # Configure the indexer, then fit it to the data; the fitted object is
  # what exposes the labels and performs the transformation below.
  sim <- indexer %>%
    invoke("setInputCol", input.col) %>%
    invoke("setOutputCol", output.col) %>%
    invoke("fit", df)

  # Report labels to caller if requested -- these map
  # the discovered labels in the data set to an associated
  # index.
  if (is.environment(params))
    params$labels <- as.character(invoke(sim, "labels"))

  transformed <- invoke(sim, "transform", df)

  sdf_register(transformed)
}

#' Feature Transformation -- Binarizer
#'
#' Threshold a column: values less than or equal to the \code{threshold}
#' are assigned the value 0.0, and values greater than the threshold are
#' assigned the value 1.0.
#'
#' @template roxlate-ml-transformation
#'
#' @param threshold The numeric threshold.
#'
#' @export
ft_binarizer <- function(x,
                         input.col = NULL,
                         output.col = NULL,
                         threshold = 0.5,
                         ...) {
  # Runs before any argument is used; presumably reconciles legacy argument
  # spellings -- verify against the helper's definition.
  ml_backwards_compatibility_api()

  jvm_class <- "org.apache.spark.ml.feature.Binarizer"

  # Each list name is the setter to call; each value is its checked argument.
  invoke_simple_transformer(
    x,
    jvm_class,
    list(
      setInputCol  = ensure_scalar_character(input.col),
      setOutputCol = ensure_scalar_character(output.col),
      setThreshold = ensure_scalar_double(threshold)
    )
  )
}

#' Feature Transformation -- Discrete Cosine Transform (DCT)
#'
#' Convert a column expressed in the time domain into another column
#' expressed in the frequency domain.
#'
#' @template roxlate-ml-transformation
#'
#' @param inverse Perform inverse DCT?
#'
#' @export
ft_discrete_cosine_transform <- function(x,
                                         input.col = NULL,
                                         output.col = NULL,
                                         inverse = FALSE,
                                         ...) {
  # Runs before any argument is used; presumably reconciles legacy argument
  # spellings -- verify against the helper's definition.
  ml_backwards_compatibility_api()

  # Each list name is the setter to call; each value is its checked argument.
  invoke_simple_transformer(
    x,
    "org.apache.spark.ml.feature.DCT",
    list(
      setInputCol  = ensure_scalar_character(input.col),
      setOutputCol = ensure_scalar_character(output.col),
      setInverse   = ensure_scalar_boolean(inverse)
    )
  )
}

#' Feature Transformation -- IndexToString
#'
#' The counterpart of \code{\link{ft_string_indexer}}:
#' \code{ft_index_to_string} maps a column of label indices back to a
#' column containing the original labels as strings.
#'
#' @template roxlate-ml-transformation
#'
#' @export
ft_index_to_string <- function(x,
                               input.col = NULL,
                               output.col = NULL,
                               ...) {
  # Runs before any argument is used; presumably reconciles legacy argument
  # spellings -- verify against the helper's definition.
  ml_backwards_compatibility_api()

  jvm_class <- "org.apache.spark.ml.feature.IndexToString"

  # Only the input and output column names are configured here.
  invoke_simple_transformer(
    x,
    jvm_class,
    list(
      setInputCol  = ensure_scalar_character(input.col),
      setOutputCol = ensure_scalar_character(output.col)
    )
  )
}

## TODO: These routines work with so-called 'row vector' features by
## default, but it would be much nicer to implement routines to
## scale whole columns instead.
# ft_standard_scaler <- function(df, input.col, output.col,
#                                      with.mean, with.std)
# {
#   sc <- spark_connection(df)
#
#   scaler <- invoke_new(
#     sc,
#     "org.apache.spark.ml.feature.StandardScaler"
#   )
#
#   scaler %>%
#     invoke("setInputCol", input.col) %>%
#     invoke("setOutputCol", output.col) %>%
#     invoke("setWithMean", as.logical(with.mean)) %>%
#     invoke("setWithStd", as.logical(with.std)) %>%
#     invoke("transform", df)
# }
#
# ft_min_max_scaler <- function(df, input.col, output.col,
#                                     min = 0, max = 1)
# {
#   sc <- spark_connection(df)
#
#   scaler <- invoke_new(
#     sc,
#     "org.apache.spark.ml.feature.MinMaxScaler"
#   )
#
#   scaler %>%
#     invoke("setInputCol", input.col) %>%
#     invoke("setOutputCol", output.col) %>%
#     invoke("setMin", as.numeric(min)) %>%
#     invoke("setMax", as.numeric(max)) %>%
#     invoke("transform", df)
# }

#' Feature Transformation -- Bucketizer
#'
#' Discretize a numeric column, much like \R's \code{\link{cut}} function
#' does; the break points are supplied through the \code{splits} parameter.
#'
#' @template roxlate-ml-transformation
#'
#' @param splits A numeric vector of cutpoints, indicating the bucket
#'   boundaries.
#'
#' @export
ft_bucketizer <- function(x,
                          input.col = NULL,
                          output.col = NULL,
                          splits,
                          ...) {
  # Runs before any argument is used; presumably reconciles legacy argument
  # spellings -- verify against the helper's definition.
  ml_backwards_compatibility_api()

  invoke_simple_transformer(
    x,
    "org.apache.spark.ml.feature.Bucketizer",
    list(
      setInputCol  = ensure_scalar_character(input.col),
      setOutputCol = ensure_scalar_character(output.col),
      # The cutpoints are coerced to numeric and passed as a list with one
      # element per cutpoint.
      setSplits    = as.list(as.numeric(splits))
    )
  )
}

#' Feature Transformation -- ElementwiseProduct
#'
#' Computes the element-wise product between two columns. The typical use
#' is as a scaling transformation, where an input vector is scaled by
#' another vector, but this should apply for all element-wise product
#' transformations.
#'
#' @template roxlate-ml-transformation
#'
#' @param scaling.col The column used to scale \code{input.col}.
#'
#' @export
ft_elementwise_product <- function(x,
                                   input.col = NULL,
                                   output.col = NULL,
                                   scaling.col,
                                   ...) {
  # Runs before any argument is used; presumably reconciles legacy argument
  # spellings -- verify against the helper's definition.
  ml_backwards_compatibility_api()

  jvm_class <- "org.apache.spark.ml.feature.ElementwiseProduct"

  invoke_simple_transformer(
    x,
    jvm_class,
    list(
      setInputCol   = ensure_scalar_character(input.col),
      setOutputCol  = ensure_scalar_character(output.col),
      # NOTE(review): a scalar character value is forwarded to setScalingVec.
      # Spark's ElementwiseProduct appears to expect a Vector there rather
      # than a column name -- confirm against the Spark API before relying
      # on this transformer.
      setScalingVec = ensure_scalar_character(scaling.col)
    )
  )
}

#' Feature Transformation -- SQLTransformer
#'
#' Transform a data set with a SQL statement. Use the \code{__THIS__}
#' placeholder as a proxy for the active table.
#'
#' The \code{input.col} and \code{output.col} arguments are accepted but
#' ignored -- they exist purely for compatibility with
#' \code{\link{sdf_mutate}}.
#'
#' @template roxlate-ml-transformation
#'
#' @param sql A SQL statement.
#'
#' @export
ft_sql_transformer <- function(x,
                               input.col = NULL,
                               output.col = NULL,
                               sql,
                               ...) {
  # Runs before any argument is used; presumably reconciles legacy argument
  # spellings -- verify against the helper's definition.
  ml_backwards_compatibility_api()

  invoke_simple_transformer(
    x,
    "org.apache.spark.ml.feature.SQLTransformer",
    list(
      # A statement supplied as several strings is joined with newlines
      # into a single string.
      setStatement = paste(sql, collapse = "\n")
    )
  )
}

#' Feature Transformation -- QuantileDiscretizer
#'
#' Takes a column with continuous features and outputs a column with binned
#' categorical features. The bin ranges are chosen by taking a sample of the
#' data and dividing it into roughly equal parts. The lower and upper bin
#' bounds will be -Infinity and +Infinity, covering all real values. This
#' attempts to find numBuckets partitions based on a sample of the given
#' input data, but it may find fewer depending on the data sample values.
#'
#' Because the sampling strategy is non-deterministic, the result may be
#' different every time you run it.
#'
#' @template roxlate-ml-transformation
#'
#' @param n.buckets The number of buckets to use.
#'
#' @export
ft_quantile_discretizer <- function(x,
                                    input.col = NULL,
                                    output.col = NULL,
                                    n.buckets = 5L,
                                    ...) {
  # Runs before any argument is used; presumably reconciles legacy argument
  # spellings -- verify against the helper's definition.
  ml_backwards_compatibility_api()

  jvm_class <- "org.apache.spark.ml.feature.QuantileDiscretizer"

  invoke_simple_transformer(
    x,
    jvm_class,
    list(
      setInputCol   = ensure_scalar_character(input.col),
      setOutputCol  = ensure_scalar_character(output.col),
      setNumBuckets = ensure_scalar_integer(n.buckets),
      # Deliberately unnamed and last: a function that invokes "fit" on the
      # object it is given, using the supplied data set. How
      # invoke_simple_transformer consumes this entry is defined elsewhere.
      function(transformer, sdf) invoke(transformer, "fit", sdf)
    )
  )
}

#' Feature Transformation -- OneHotEncoder
#'
#' One-hot encoding maps a column of label indices to a column of binary
#' vectors, with at most a single one-value. With this encoding, algorithms
#' which expect continuous features, such as Logistic Regression, can use
#' categorical features.
#'
#' @template roxlate-ml-transformation
#'
#' @export
ft_one_hot_encoder <- function(x,
                               input.col = NULL,
                               output.col = NULL,
                               ...) {
  # Runs before any argument is used; presumably reconciles legacy argument
  # spellings -- verify against the helper's definition.
  ml_backwards_compatibility_api()

  # Only the input and output column names are configured here.
  invoke_simple_transformer(
    x,
    "org.apache.spark.ml.feature.OneHotEncoder",
    list(
      setInputCol  = ensure_scalar_character(input.col),
      setOutputCol = ensure_scalar_character(output.col)
    )
  )
}

#' Feature Transformation -- Tokenizer
#'
#' A tokenizer that converts the input string to lowercase and then splits
#' it by white spaces.
#'
#' @template roxlate-ml-transformation
#'
#' @export
ft_tokenizer <- function(x,
                         input.col = NULL,
                         output.col = NULL,
                         ...) {
  # Runs before any argument is used; presumably reconciles legacy argument
  # spellings -- verify against the helper's definition.
  ml_backwards_compatibility_api()

  jvm_class <- "org.apache.spark.ml.feature.Tokenizer"

  # Only the input and output column names are configured here.
  invoke_simple_transformer(
    x,
    jvm_class,
    list(
      setInputCol  = ensure_scalar_character(input.col),
      setOutputCol = ensure_scalar_character(output.col)
    )
  )
}

#' Feature Transformation -- RegexTokenizer
#'
#' A regex based tokenizer that extracts tokens either by using the provided
#' regex pattern to split the text (default) or repeatedly matching the regex
#' (if gaps is false). Optional parameters also allow filtering tokens using
#' a minimal length. It returns an array of strings that can be empty.
#'
#' @template roxlate-ml-transformation
#' @param pattern The regular expression pattern to be used.
#'
#' @export
ft_regex_tokenizer <- function(x,
                               input.col = NULL,
                               output.col = NULL,
                               pattern,
                               ...) {
  # Runs before any argument is used; presumably reconciles legacy argument
  # spellings -- verify against the helper's definition.
  ml_backwards_compatibility_api()

  # Each list name is the setter to call; each value is its checked argument.
  invoke_simple_transformer(
    x,
    "org.apache.spark.ml.feature.RegexTokenizer",
    list(
      setInputCol  = ensure_scalar_character(input.col),
      setOutputCol = ensure_scalar_character(output.col),
      setPattern   = ensure_scalar_character(pattern)
    )
  )
}


# TODO
# #' Feature Transformations -- HashingTF
# #'
# #' Maps a sequence of terms to their term frequencies.
# #'
# #' @template roxlate-ml-transformation
# #'
# #' @param n.features Number of features.
# #' @param binary Boolean; binary?
# #'
# #' @export
# ft_hashing_tf <- function(x,
#                           input.col = NULL,
#                           output.col = NULL,
#                           n.features = NULL,
#                           binary = FALSE,
#                           ...)
# {
#   ml_backwards_compatibility_api()
#   class <- "org.apache.spark.ml.feature.HashingTF"
#   invoke_simple_transformer(x, class, list(
#     setInputCol    = ensure_scalar_character(input.col),
#     setOutputCol   = ensure_scalar_character(output.col),
#     setNumFeatures = ensure_scalar_integer(n.features),
#     setBinary      = ensure_scalar_boolean(binary)
#   ))
# }
