#!/usr/bin/env python import getopt import sys import tempfile import numpy as np import scipy.cluster.hierarchy import sys sys.path.append("liblinear-1.8/python") import liblinearutil as ll import common import parse import vectorize def usage(f = sys.stdout): print >> f, """\ Usage: %s -s [FP_FILENAME]... Do clustering of the given fingerprint files. -h, --help show this help. -k K generate K clusters (conflicts with -t). -s, --set=SET_FILENAME use the set of features in SET_FILENAME (required). -t THRESHOLD use THRESHOLD separation of clusters (conflicts with -k).\ """ % sys.argv[0] DEFAULT_THRESHOLD = 0.5 class options (object): k = None threshold = None set_filename = None opts, args = getopt.gnu_getopt(sys.argv[1:], "hk:s:t:", ["help", "set="]) for o, a in opts: if o == "-k": options.k = int(a) elif o == "-s" or o == "--set": options.set_filename = a elif o == "-t": options.threshold = float(a) if options.set_filename is None: usage(sys.stderr) exit(1) def whiten(a): """Divide columns by their standard deviation to set per-column variance to 1.0. Leaves columns with 0 variance unchanged.""" std = map(lambda z: z == 0 and 1 or z, np.std(a, 0)) return a / std feature_names = parse.parse_feature_set_file(options.set_filename) rs_list = [] features_list = [] for fp_filename, fp_f in common.find_files_or_stdin(args, "*.6fp"): rs = parse.parse_6fp(fp_f) rs.filename = fp_filename features = vectorize.vectorize(feature_names, rs) features = map(lambda z: (z is vectorize.MISSING or z is vectorize.UNKNOWN) and -1 or z, features) rs_list.append(rs) features_list.append(features) if options.k is None: options.threshold = options.threshold or DEFAULT_THRESHOLD if options.k is not None and options.threshold is not None: print >> sys.stderr, "You can't use both the -k and -t options." sys.exit(1) feature_matrix = np.vstack(features_list) feature_matrix = whiten(feature_matrix) if options.k is not None: labels = scipy.cluster.hierarchy.fclusterdata(feature_matrix, options.k, criterion="maxclust") else: labels = scipy.cluster.hierarchy.fclusterdata(feature_matrix, options.threshold) clusters = {} for i in range(len(labels)): clusters.setdefault(labels[i], []) clusters[labels[i]].append(rs_list[i]) clusters = [sorted(x) for x in clusters.values()] clusters.sort(key = lambda x: (len(x), x)) for i, cluster in enumerate(clusters): print "group", ", ".join(rs.desc.nmapname or "--" for rs in cluster) for rs in sorted(cluster, key = lambda x: x.filename): print "file", rs.filename if i < len(clusters) - 1: print