#!/usr/bin/env python
"""Train per-tier liblinear classifiers from fingerprint files.

Reads a feature-set file (given with -s) and the fingerprint files that
find_files() reports for each path on the command line, trains one model per
osclass tier, and writes the feature list and the models to standard output.

The file is written to parse under both Python 2 and Python 3, which is why
output goes through .write() rather than the print statement.
"""

import getopt
import sys
import tempfile

import liblinearutil as ll

from fp import *
from vectorize import *


def usage(f=sys.stdout):
    """Write the command-line help text to the file object f."""
    f.write("""\
Usage: %s -s SET_FILENAME [FP_FILENAME]...
Train a classifier using the given set of features and fingerprint files. The
output is a .model file.

  -s, --set=SET_FILENAME    use the set of features in SET_FILENAME (required).
""" % sys.argv[0])


class options(object):
    """Plain attribute holder for settings parsed from the command line."""

    # Path of the feature-set file given with -s/--set; None until parsed.
    set_filename = None


def parse_set_file(filename):
    """Return the list of feature names stored in filename.

    Each line is stripped of surrounding whitespace; empty lines and lines
    beginning with "#" are ignored.
    """
    with open(filename) as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith("#")]


def make_liblinear(fps, tier):
    """Build liblinear training inputs for one tier.

    fps is a sequence whose elements hold an osclass at index 0 and a feature
    vector at index 1. The class of each element is its osclass truncated to
    the first `tier` items; classes are numbered in order of first
    appearance.

    Returns a tuple (y, x, labels):
      y       numeric label of each element, in input order
      x       feature vector of each element, with None replaced by -1
      labels  list mapping numeric label -> truncated osclass
    """
    y = []
    x = []
    labels = []
    label_map = {}
    for entry in fps:
        osclass = entry[0][:tier]
        numeric_label = label_map.get(osclass)
        if numeric_label is None:
            # First time this class is seen: give it the next free number.
            numeric_label = len(labels)
            label_map[osclass] = numeric_label
            labels.append(osclass)
        # Missing feature values (None) are encoded as -1.
        vector = [-1 if z is None else z for z in entry[1]]
        y.append(numeric_label)
        x.append(vector)
    return y, x, labels


def train(fps):
    """Train one model for every tier from 1 up to the longest osclass.

    Returns a list of (model, labels) tuples, one per tier, where labels is
    the numeric-label -> class list produced by make_liblinear. Progress and
    the result of a 5-fold cross-validation run are written to stderr.

    Raises ValueError if fps is empty (there is no longest osclass).
    """
    max_tier = max(len(entry[0]) for entry in fps)
    models = []
    for tier in range(1, max_tier + 1):
        sys.stderr.write("Training tier %d\n" % tier)
        y, x, labels = make_liblinear(fps, tier)
        # -s 0: L2-regularized logistic regression (primal)
        # -q: quiet mode
        param = ll.parameter("-s 0 -q")
        prob = ll.problem(y, x)
        model = ll.train(prob, param)
        models.append((model, labels))

        # Re-train to get an accuracy estimate.
        param.cross_validation = True
        param.nr_fold = 5
        acc = ll.train(prob, param)
        sys.stderr.write("Accuracy %s\n" % (acc,))
    return models


def save_model(f, feature_set, models):
    """Write the feature list and every tier's model to the file object f.

    The output starts with a "begin features" / "end features" section
    listing feature_set one name per line. Each (model, labels) element of
    models then produces a "tier N" section with one "class" line per label
    and the liblinear model text between "begin liblinear" and
    "end liblinear".
    """
    f.write("begin features\n")
    for feature_name in feature_set:
        f.write(feature_name + "\n")
    f.write("end features\n")
    for tier, (model, labels) in enumerate(models, 1):
        f.write("\n")
        f.write("tier %d\n" % tier)
        for label in labels:
            f.write("class %s\n" % " | ".join(label))
        f.write("begin liblinear\n")
        # ll.save_model is given a path, so save the model to a temporary
        # file and read it back. The with-block removes the temporary file
        # even if saving or copying fails.
        with tempfile.NamedTemporaryFile() as model_file:
            ll.save_model(model_file.name, model)
            # Re-open by name in text mode so the data is text on both
            # Python 2 and Python 3.
            with open(model_file.name) as saved:
                f.write(saved.read())
        f.write("end liblinear\n")


try:
    opts, args = getopt.gnu_getopt(sys.argv[1:], "s:", ["set="])
except getopt.GetoptError as e:
    # Report bad options with the usage text instead of a traceback.
    sys.stderr.write("%s\n" % e)
    usage(sys.stderr)
    sys.exit(1)
for o, a in opts:
    if o in ("-s", "--set"):
        options.set_filename = a

if options.set_filename is None:
    usage(sys.stderr)
    sys.exit(1)

feature_set = parse_set_file(options.set_filename)

fps = []
for path in args:
    for fp_filename in find_files(path, "*.6fp"):
        rs = ResponseSet()
        rs.parse_file(fp_filename)
        if not rs.osclass:
            sys.stderr.write("Skipping %s because of no osclass.\n" % fp_filename)
            continue
        features = vectorize(feature_set, rs)
        fps.append((rs.osclass, features))

if not fps:
    # train() cannot determine the number of tiers without any input.
    sys.stderr.write("No fingerprints with an osclass were found; nothing to train.\n")
    sys.exit(1)

models = train(fps)
save_model(sys.stdout, feature_set, models)