####Hotdeck in context of kNN-k Nearest Neighbour Imputation
#Author: Alexander Kowarik, Statistics Austria
## (k)NN-Imputation
## (k)NN-Imputation, donors from cold deck
#data - data.frame of the data with missing
#variable - vector of variablesnames to be imputed
#metric - method for distance computation of in function daisy(cluster), otherwise automatical selection
#k - number of neighbours used
#dist_var - list/vector of the variablenames used for distance computation
#weights - list/vector of the weights for the different dist variables
#numFun - function for evaluating the k NN (numerical variable)
#catFun - function for evaluating the k NN (categorical variable)
#makeNA - vector of values which should be imputed too e.g. 8,9 or 98,99 in SPSS-data sets
#NAcond - list of conditions for each variable to create NAs there (not yet implemented)
#donorcond - list of conditions for a donor e.g. "<=10000"
#TODO: Donors from cold deck

sampleCat <- function(x){
  #sample with probabilites corresponding to there number in the NNs
  if(!is.factor(x))
    x <- as.factor(x)
  s <- summary(x)
  s <- s[s!=0]
  sample(names(s),1,prob=s)
}
maxCat <- function(x){
  #choose cat with max prob, random if max is not unique
  if(!is.factor(x))
    x <- as.factor(x)
  s <- summary(x)
  s <- s[s!=0]
  if(sum(s>0)>1)
    s <- sample(s)
  names(s)[which.max(s)]
}
which.minN <- function(x,n){
  n <- min(n,length(x))
  out <- vector()
  for(i in 1:n){
    out[i] <- names(x)[which.min(x)]
    x[which.min(x)] <- Inf
  }
  as.numeric(out)
}
kNN <-
    function(data, variable=colnames(data), metric=NULL, k=5, dist_var=colnames(data),weights=NULL,
        numFun = median, catFun=maxCat,
        makeNA=NULL,NAcond=NULL, impNA=TRUE, donorcond=NULL,mixed=vector(),mixed.constant=NULL,trace=FALSE,
        imp_var=TRUE,imp_suffix="imp",addRandom=FALSE){
  #basic checks
  if(!is.null(mixed.constant)){
    if(length(mixed.constant)!=length(mixed))
      stop("length 'mixed.constant' and length 'mixed' differs")
  }
  startTime <- Sys.time()
  nvar <- length(variable)
  ndat <- nrow(data)
  if(!is.data.frame(data)&&!is.matrix(data))
    stop("supplied data should be a dataframe or matrix")
  
  if(is.matrix(data))
    data <- as.data.frame(data)
  #impNA==FALSE -> NAs should remain NAs (Routing NAs!?)
  if(!impNA)
    data[is.na(data)] <- "THISISanNASTRINGthatshouldnotbeimputedbytheroutine"
  if(!is.null(makeNA)){
    if(length(makeNA)!=nvar)
      stop("The vector 'variable' must have the same length as the 'makeNA' list")
    else{
      for(i in 1:nvar){
        data[data[,variable[i]]%in%makeNA[[i]],variable[i]] <- NA 
      }
    }
  }
  if(!any(is.na(data)))
    stop("Nothing to impute, because no NA are present (also after using makeNA)")
  if(imp_var){
    imp_vars <- paste(variable,"_",imp_suffix,sep="")
    data[,imp_vars] <- FALSE
    for(i in 1:length(variable)){
      data[is.na(data[,variable[i]]),imp_vars[i]] <- TRUE
        if(!any(is.na(data[,variable[i]])))
          data<-data[,-which(names(data)==paste(variable[i],"_",imp_suffix,sep=""))]
    }
  }
  orders <- vector()
  for(i in 1:ncol(data)){
    orders <- c(orders,is.ordered(data[,i]))
  }
  orders <- colnames(data)[orders]
  levOrders <- vector()
  if(length(orders)>0){
    for(i in 1:length(orders)){
      levOrders[i] <- levels(data[,orders[i]])[length(levels(data[,orders[i]]))]
    }
  }
  factors <- vector()
  for(i in 1:ncol(data)){
    factors <- c(factors,is.factor(data[,i]))
  }
  factors <- colnames(data)[factors]
  factors <- factors[!factors%in%orders]
  numerical <- vector()
  for(i in 1:ncol(data)){
    numerical <- c(numerical,is.numeric(data[,i])|is.integer(data[,i]))
  }
  numerical <- colnames(data)[numerical]
  numerical <- numerical[!numerical%in%mixed]
  if(trace){
    cat("Detected as categorical variable:\n")
    print(factors)
    cat("Detected as ordinal variable:\n")
    print(orders)
    cat("Detected as numerical variable:\n")
    print(numerical)  
  }

  ###Make an index for selecting donors
  INDEX <- 1:ndat
  ##START DISTANCE IMPUTATION
  ## if(is.null(metric))
  ##   metric <- c("euclidean", "manhattan", "gower")
  ## else if(!metric%in%c("euclidean", "manhattan", "gower"))
  ##   stop("metric is unknown")
  if(is.null(weights)){
    weights <- rep(1,length(dist_var))
  }else if(length(weights)!=length(dist_var)){
    stop("length of weights must be equal the number of distance variables")
  }
  if(addRandom){
    numerical <- c(numerical, "RandomVariableForImputation")
    data[,"RandomVariableForImputation"] <- rnorm(ndat)
    if(is.list(dist_var)){
      for(i in 1:length(dist_var)){
        dist_var[[i]] <- c(dist_var[[i]],"RandomVariableForImputation")
        weights[[i]] <- c(weights[[i]],min(weights[[i]])/(sum(weights[[i]])+1))
      }
    }else{
      dist_var <- c(dist_var,"RandomVariableForImputation")
      weights <- c(weights,min(weights)/(sum(weights)+1))
    }
  }
  for(j in 1:nvar){
    if(any(is.na(data[,variable[j]]))){
      if(is.list(dist_var)){
        if(!is.list(weights))
          stop("if dist_var is a list weights must be a list")
        dist_varx <- dist_var[[j]]
        weightsx <- weights[[j]]
      }else{
        dist_varx <- dist_var[dist_var!=variable[j]]
        weightsx <- weights[dist_var%in%dist_varx]
      }
      if(!is.null(donorcond)){
        cmd <- paste("TF <- !is.na(data[,variable[j]])&data[,variable[j]]",donorcond[[j]],sep="")
        eval(parse(text=cmd))
        don_dist_var <- data[TF,dist_varx,drop=FALSE]#TODO:for list of dist_var
        don_index <- INDEX[TF]
      }else{
        TF <- !is.na(data[,variable[j]])
        don_dist_var <- data[TF,dist_varx,drop=FALSE]#TODO:for list of dist_var
        don_index <- INDEX[TF]
      }

      TF_imp <- is.na(data[,variable[j]])
      imp_dist_var <- data[TF_imp,dist_varx,drop=FALSE]#TODO:for list of dist_var
      imp_index <- INDEX[TF_imp]
      dist_single <- function(don_dist_var,imp_dist_var,numericalX,factorsX,ordersX,mixedX,levOrdersX,don_index,imp_index,weightsx,k,mixed.constant){
        #gd <- distance(don_dist_var,imp_dist_var,weights=weightsx)
        if(is.null(mixed.constant))
          mixed.constant <- rep(0,length(mixedX))
        gd <- gowerD(don_dist_var,imp_dist_var,weights=weightsx,numericalX,factorsX,ordersX,mixedX,levOrdersX,mixed.constant=mixed.constant);
        rownames(gd) <- don_index
        colnames(gd) <- imp_index
        which.minNk <- function(x)1
        cmd <- paste("which.minNk <- function(x)which.minN(x,",k,")",sep="")
        eval(parse(text=cmd))
        mindi <- apply(gd,2,which.minNk)
        erg <- as.matrix(mindi)
        if(k==1){
          erg <- t(erg)
        }
        erg
      }
      numericalX <-numerical[numerical%in%dist_varx]
      factorsX <-factors[factors%in%dist_varx]
      ordersX <-orders[orders%in%dist_varx]
      levOrdersX <- levOrders[orders%in%dist_varx]
      #print(levOrdersX)
      mixedX <-mixed[mixed%in%dist_varx]
      mindi <- dist_single(don_dist_var,imp_dist_var,numericalX,factorsX,ordersX,mixedX,levOrdersX,don_index,imp_index,weightsx,k,mixed.constant)
      getI <- function(x)data[x,variable[j]]
      if(trace)
        cat(sum(is.na(data[,variable[j]])),"items of","variable:",variable[j]," imputed\n")
      kNNs <- as.matrix(apply(mindi,2,getI))
      if(k==1){
        kNNs <- matrix(kNNs,nrow=1)
      }
      if(variable[j]%in%factors)
        data[is.na(data[,variable[j]]),variable[j]] <- apply(kNNs,2,catFun)
      else if(is.integer(data[,variable[j]])){
        data[is.na(data[,variable[j]]),variable[j]] <- round(apply(kNNs,2,numFun))
      }else
        data[is.na(data[,variable[j]]),variable[j]] <- apply(kNNs,2,numFun)
    }
  }
  print(difftime(startTime,Sys.time()))  
  if(!impNA)
    data[data=="THISISanNASTRINGthatshouldnotbeimputedbytheroutine"] <- NA
  if(addRandom)
    data <- data[,-which(names(data)=="RandomVariableForImputation")]
  data
}