#!/usr/bin/env python

import getopt
import sys
import tempfile

import numpy as np
import scipy.cluster.hierarchy

import sys
sys.path.append("liblinear-1.8/python")
import liblinearutil as ll

import common
import parse
import vectorize

def usage(f = sys.stdout):
    print >> f, """\
Usage: %s -s <SET_FILENAME> [FP_FILENAME]...
Do clustering of the given fingerprint files.

  -h, --help              show this help.
  -k K                    generate K clusters (conflicts with -t).
  -s, --set=SET_FILENAME  use the set of features in SET_FILENAME (required).
  -t THRESHOLD            use THRESHOLD separation of clusters (conflicts with -k).\
""" % sys.argv[0]

DEFAULT_THRESHOLD = 0.5

class options (object):
    k = None
    threshold = None
    set_filename = None


opts, args = getopt.gnu_getopt(sys.argv[1:], "hk:s:t:", ["help", "set="])
for o, a in opts:
    if o == "-k":
        options.k = int(a)
    elif o == "-s" or o == "--set":
        options.set_filename = a
    elif o == "-t":
        options.threshold = float(a)

if options.set_filename is None:
    usage(sys.stderr)
    exit(1)

def whiten(a):
    """Divide columns by their standard deviation to set per-column variance to
    1.0. Leaves columns with 0 variance unchanged."""
    std = map(lambda z: z == 0 and 1 or z, np.std(a, 0))
    return a / std

feature_names = parse.parse_feature_set_file(options.set_filename)

rs_list = []
features_list = []
for fp_filename, fp_f in common.find_files_or_stdin(args, "*.6fp"):
    rs = parse.parse_6fp(fp_f)
    rs.filename = fp_filename
    features = vectorize.vectorize(feature_names, rs)
    features = map(lambda z: (z is vectorize.MISSING or z is vectorize.UNKNOWN) and -1 or z, features)
    rs_list.append(rs)
    features_list.append(features)

if options.k is None:
    options.threshold = options.threshold or DEFAULT_THRESHOLD

if options.k is not None and options.threshold is not None:
    print >> sys.stderr, "You can't use both the -k and -t options."
    sys.exit(1)

feature_matrix = np.vstack(features_list)
feature_matrix = whiten(feature_matrix)

if options.k is not None:
    labels = scipy.cluster.hierarchy.fclusterdata(feature_matrix, options.k, criterion="maxclust")
else:
    labels = scipy.cluster.hierarchy.fclusterdata(feature_matrix, options.threshold)

clusters = {}
for i in range(len(labels)):
    clusters.setdefault(labels[i], [])
    clusters[labels[i]].append(rs_list[i])

clusters = [sorted(x) for x in clusters.values()]
clusters.sort(key = lambda x: (len(x), x))

for i, cluster in enumerate(clusters):
    print "group", ", ".join(rs.desc.nmapname or "--" for rs in cluster)
    for rs in sorted(cluster, key = lambda x: x.filename):
        print "file", rs.filename
    if i < len(clusters) - 1:
        print