#!/usr/bin/env python

# Train a liblinear classifier from fingerprint groups and write a .model
# file to standard output.  Python 2 script; uses the liblinear-1.8 Python
# bindings shipped alongside this file.

import getopt
import sys
import tempfile

import numpy as np

sys.path.append("liblinear-1.8/python")
import liblinearutil as ll

import common
import impute
import parse
import vectorize

def usage(f = sys.stdout):
    """Print a usage summary to f (stdout by default).

    NOTE(review): the original layout of this help text was lost to
    whitespace mangling; the line breaks below are a reconstruction."""
    print >> f, """\
Usage: %s -s [FP_FILENAME]...
Train a classifier using the given set of features and fingerprint files.
The output is a .model file.
  -c COST                     set the regularization cost (default 1).
  -g, --group=GROUP_FILENAME  read groups from GROUP_FILENAME (required).
  -h, --help                  show this help.
  -s, --set=SET_FILENAME      use the set of features in SET_FILENAME (required).
  --scale                     scale feature vectors to the range [0, 1].\
""" % sys.argv[0]

class options (object):
    # Command-line options, filled in by the getopt loop below.
    cost = None             # -c: liblinear regularization cost C, or None for default.
    group_filename = None   # -g/--group: groups file (required).
    set_filename = None     # -s/--set: feature-set file (required).
    scale = False           # --scale: scale features to [0, 1] before training.

def make_liblinear(groups):
    """Flatten groups into liblinear training data.

    Returns (y, x) where y is a list of integer class labels (one label
    per group, in group order) and x is the parallel list of feature
    vectors (as plain lists)."""
    y = []
    x = []
    numeric_label = 0
    for group in groups:
        for features in group.features:
            y.append(numeric_label)
            x.append(list(features))
        numeric_label += 1
    return y, x

def scale(features):
    """Scale each column of the m-by-n array features to the range [0, 1],
    in place.  Negative entries mark unassigned values and are left alone;
    only values >= 0 participate in, and are affected by, the scaling.

    Returns (features, s_min, s_max) where s_min and s_max are the per-column
    minima and maxima used (both 0.0 for columns with no assigned values)."""
    m, n = features.shape
    s_min = np.zeros(n)
    s_max = np.zeros(n)
    for i in range(n):
        assigned = [x for x in features[:,i] if x >= 0]
        if assigned:
            mn = min(assigned)
            mx = max(assigned)
        else:
            mn = 0.0
            mx = 0.0
        s_min[i] = mn
        s_max[i] = mx
        # Guard against division by zero for constant columns.
        if mn == mx:
            denom = 1.0
        else:
            denom = mx - mn
        for j in range(m):
            if features[j, i] >= 0:
                features[j, i] = (features[j, i] - mn) / denom
    return features, s_min, s_max

def prepare_features(groups):
    """Impute and scale features, and assign them back to groups.

    Stacks all groups' feature vectors into one matrix so that imputation
    and scaling see the whole data set, then deals the processed rows back
    out to the groups in the same order.

    Returns the list of (min, max) scale parameters per feature when
    --scale is in effect, otherwise None."""
    feature_list = []
    for group in groups:
        for features in group.features:
            feature_list.append(features)
    feature_matrix = np.vstack(feature_list)
    feature_matrix = impute.impute(feature_matrix)
    if options.scale:
        feature_matrix, s_min, s_max = scale(feature_matrix)
        scale_params = zip(s_min, s_max)
    else:
        scale_params = None
    # Hand the processed rows back to the groups, one row per response set.
    f_i = iter(feature_matrix)
    for group in groups:
        group.features = []
        for i in range(len(group.rs_list)):
            group.features.append(f_i.next())
    return scale_params

def train(groups):
    """Train a liblinear model on the given groups and return it.

    Also runs 5-fold cross-validation on the same data and reports the
    accuracy estimate on stderr."""
    y, x = make_liblinear(groups)
    # -s 0: L2-regularized logistic regression (primal)
    # -q: quiet mode
    param_str = "-s 0 -q"
    if options.cost is not None:
        # -c cost: set the parameter C (default 1)
        param_str += " -c " + str(options.cost)
    param = ll.parameter(param_str)
    prob = ll.problem(y, x)
    model = ll.train(prob, param)
    # Re-train to get an accuracy estimate.
    param.cross_validation = True
    param.nr_fold = 5
    acc = ll.train(prob, param)
    print >> sys.stderr, "Accuracy", acc
    return model

def mean(a):
    """Like numpy.mean(a, 0), but don't treat a 1 by n [[a_1, a_2, ...]]
    array as a special case that only returns the mean of the flattened
    array."""
    h, w = a.shape
    if h == 1:
        return a[0]
    else:
        return np.mean(a, 0)

def var(a):
    """Like numpy.var(a, 0), but don't treat a 1 by n [[a_1, a_2, ...]]
    array as a special case that only returns the variance of the
    flattened array.  The variance of a single sample is undefined, so a
    1-row input yields a row of NaN."""
    h, w = a.shape
    if h == 1:
        return np.array([np.nan] * w)
    else:
        return np.vectorize(common.clamp)(np.var(a, 0))

def group_means(group):
    """Return the per-feature mean over a group's feature vectors."""
    return mean(np.array(group.features))

def group_variances(group):
    """Return the per-feature variance over a group's feature vectors."""
    # Map NaN for undefined variance to 0.0 (as if we had two identical samples).
    return np.nan_to_num(var(np.array(group.features)))

def save_model(f, feature_names, groups, model, scale_params = None):
    """Write the .model file to the open file object f.

    The file lists the feature names, optional scale parameters, the cost
    C, a per-group summary (names, CPEs, means, variances), and finally
    the serialized liblinear model.

    Fix: the original ignored f and wrote everything to stdout; output now
    goes to f as the signature promises (the caller passes sys.stdout, so
    existing behavior is unchanged)."""
    print >> f, "begin features"
    for feature_name in feature_names:
        print >> f, feature_name
    print >> f, "end features"
    if scale_params is not None:
        print >> f, "begin scale"
        for s_min, s_max in scale_params:
            print >> f, "%g,%g" % (s_min, s_max)
        print >> f, "end scale"
    print >> f
    print >> f, "c", model.param.C
    print >> f
    for group in groups:
        print >> f, "class"
        print >> f, "nmapname", group.desc.nmapname
        for nmapclass in group.desc.nmapclasses:
            print >> f, "nmapclass", parse.format_nmapclass(nmapclass.nmapclass)
            for cpe in nmapclass.cpe:
                print >> f, "cpe", cpe
        means = group_means(group)
        print >> f, "means", " ".join(str(m) for m in means)
        variances = group_variances(group)
        print >> f, "variances", " ".join(str(v) for v in variances)
    print >> f, "begin liblinear"
    # Save the model to a temporary file and read it back, because the
    # liblinear bindings can only serialize to a named file.
    model_file = tempfile.NamedTemporaryFile()
    ll.save_model(model_file.name, model)
    f.write(model_file.read())
    model_file.close()
    print >> f, "end liblinear"

opts, args = getopt.gnu_getopt(sys.argv[1:], "c:g:hs:", ["group=", "help", "set=", "scale"])
for o, a in opts:
    if o == "-c":
        options.cost = float(a)
    elif o == "-g" or o == "--group":
        options.group_filename = a
    elif o == "-h" or o == "--help":
        usage()
        sys.exit()
    elif o == "-s" or o == "--set":
        options.set_filename = a
    elif o == "--scale":
        options.scale = True

# Both the feature set and the groups file are required.
if options.set_filename is None:
    usage(sys.stderr)
    sys.exit(1)
if options.group_filename is None:
    usage(sys.stderr)
    sys.exit(1)

feature_names = parse.parse_feature_set_file(options.set_filename)
groups = parse.parse_groups_file(options.group_filename)
# Turn each group's response sets into raw feature vectors.
for group in groups:
    group.features = []
    for rs in group.rs_list:
        features = vectorize.vectorize(feature_names, rs)
        group.features.append(features)

scale_params = prepare_features(groups)

print >> sys.stderr, "Training."
model = train(groups)
save_model(sys.stdout, feature_names, groups, model, scale_params)