#! /usr/bin/env python

# NOTE:
# Unlike the other classifiers, Dlda has the number of features to be used (nf)
# as its only parameter. This means that, with an adequate resampling method and
# parameters, this tool can give a reliable estimate of the predictivity of
# the model.

from numpy import *
from optparse import OptionParser
from mlpy import *
  
# Command line parsing
#
# Options fall into four groups: input data and preprocessing (-d, -n, -s),
# resampling scheme (-k or -c, optionally -S), the range of nf values to
# evaluate (-m, -M, -p) and reporting / classifier switches (-v, -l, -a, -b).
parser = OptionParser()

# Input data and preprocessing
parser.add_option("-d", "--data", metavar = "FILE", action = "store", type = "string",
                  dest = "data", help = "data - required")
parser.add_option("-n", "--normalize", action = "store_true", default = False,
                  dest = "norm", help = "normalize data")
parser.add_option("-s", "--standardize", action = "store_true", default = False,
                  dest = "std", help = "standardize data")

# Resampling scheme (exactly one of -k / -c must be given)
parser.add_option("-k", action = "store", type = "int",
                  dest = "k", help = "k for k-fold cross validation")
parser.add_option("-c", action = "store", type = "int", nargs = 2, metavar = "SETS PAIRS",
                  dest = "c", help = "sets and pairs for monte carlo cross validation")
parser.add_option("-S", "--stratified", action = "store_true", default = False,
                  dest = "strat", help = "for stratified cv")

# Reporting and range of nf values
parser.add_option("-v", "--verbose", action = "store_true", default = False,
                  dest = "verb", help = "print partial results every resampling step")
parser.add_option("-m", "--min", action = "store", type = "int",
                  dest = "min", help = "min value for nf parameter [default %default]", default = 1)
parser.add_option("-M", "--max", action = "store", type = "int",
                  dest = "max", help = "max value for nf parameter [default %default]", default = 10)
parser.add_option("-p", "--steps", action = "store", type = "int",
                  dest = "steps", help = "amplitude of steps for nf parameter [default %default]", default = 1)

# Extra indicators and classifier switches
parser.add_option("-l", "--lists", action = "store_true", default = False,
                  dest = "lists", help = "Canberra distance indicator")
parser.add_option("-a", "--auc", action = "store_true", default = False,
                  dest = "auc", help = "wmw_auc indicator")
# Adjacent literals are used instead of a backslash continuation inside the
# string, which would embed the next line's indentation in the help text.
parser.add_option("-b", "--bal", action = "store_true", default = False,
                  dest = "bal", help = "parameter of DLDA classifier refering to the balancement "
                                       "of training and test sets")


# Validation. parser.error() prints the message with the usage text and exits.
(options, args) = parser.parse_args()
if not options.data:
    parser.error("option -d [data] is required")
if not (options.k or options.c):
    parser.error("option -k (k-fold) or -c (monte carlo) for resampling is required")
if (options.k and options.c):
    parser.error("option -k (k-fold) and -c (monte carlo) are mutually exclusive")
if options.min < 1:
    parser.error("option -m must be >= 1")
# Checked before the step size so that an inverted range is reported as such
# rather than as a step-size problem.
if options.min > options.max:
    parser.error("option -m must be <= option -M")
# A step of zero or less never reaches -M, so the loop that builds the list
# of nf increments would not terminate.
if options.steps < 1:
    parser.error("option -p must be >= 1")
# With -m == -M there is a single nf value and no stepping takes place, so the
# upper bound on the step only applies to a real range.
if options.max > options.min and options.steps > options.max - options.min:
    parser.error("option -p must be <= (option -M - option -m)")

# Number of Features
# NF holds the value handed to Dlda.compute on each successive fit: 0 for the
# first fit, then options.steps for every further fit, for as long as the
# running total options.min + k * options.steps stays within options.max.
NF = [0]
reached = options.min + options.steps   # nf total after the next increment
while reached <= options.max:
    NF.append(options.steps)
    reached += options.steps

# Data
# x is indexed by sample on its first axis and its second axis is counted as
# the features; y supplies the class labels used by the resampling calls.
x, y = data_fromfile(options.data)

n_features = x.shape[1]
if n_features < options.max:
    parser.error("max number of features must be <= number of features in data file")

# Optional preprocessing. When both flags are given, standardization runs
# first and normalization is applied to its result.
if options.std:
    x = data_standardize(x)
if options.norm:
    x = data_normalize(x)

# Resampling
# Pick the resampling routine from the scheme (-k or -c) and the -S flag.
# Each element of res is later indexed as [0] for the training rows and [1]
# for the test rows. A single-argument print(...) gives the same output
# whether print is a statement or a function.
if options.k:
    if options.strat:
        print("stratified %d-fold cv" % options.k)
        res = kfoldS(cl = y, sets = options.k)
    else:
        print("%d-fold cv" % options.k)
        res = kfold(nsamples = y.shape[0], sets = options.k)
elif options.c:
    mc_sets, mc_pairs = options.c[0], options.c[1]
    if options.strat:
        print("stratified monte carlo cv (%d sets, %d pairs)" %(mc_sets, mc_pairs))
        res = montecarloS(cl = y, sets = mc_sets, pairs = mc_pairs)
    else:
        print("monte carlo cv (%d sets, %d pairs)" %(mc_sets, mc_pairs))
        res = montecarlo(nsamples = y.shape[0], sets = mc_sets, pairs = mc_pairs)

# Set up only when -l is given: a ranking object and an uninitialized integer
# table with one row per resampling split and one column per feature.
# NOTE(review): neither R nor lp is used in the visible part of the script.
if options.lists:
    R = Ranking(method='onestep')
    lp = empty((len(res), x.shape[1]), dtype = int)


##########

# Score tables: one row per entry of NF, one column per resampling split.
# AUC starts at zero so that it reports 0 when -a is not given; ERR and MCC
# are filled column by column and only filled columns are ever read.
n_models, n_splits = len(NF), len(res)
MCC = empty((n_models, n_splits))
ERR = empty((n_models, n_splits))
AUC = zeros((n_models, n_splits))

for split, pair in enumerate(res):
    xtr, ytr, xts, yts = x[pair[0]], y[pair[0]], x[pair[1]], y[pair[1]]

    # One classifier per split; it is refitted once for every entry of NF.
    model = Dlda(nf = options.min, bal = options.bal)
    for row, increment in enumerate(NF):
        model.compute(xtr, ytr, increment)
        predicted = model.predict(xts)
        ERR[row, split] = err(yts, predicted)
        MCC[row, split] = mcc(yts, predicted)
        if options.auc:
            AUC[row, split] = wmw_auc(yts, model.realpred)

    # Report after every split in verbose mode, otherwise only after the last.
    done = split + 1
    if not options.verb and done != n_splits:
        continue

    print(" ".join(('Results are averaged on', str(done), 'indipendent train & test sets')))
    for row in range(n_models):
        # Row index -> nf value: options.min plus row * options.steps.
        nf_value = (row * options.steps) + options.min
        print("Numb. of Features %s: error %f, mcc %f, auc %f"
              % (nf_value,
                 mean(ERR[row, :done]),
                 mean(MCC[row, :done]),
                 mean(AUC[row, :done])))
