#!/usr/bin/env python

# Train a liblinear classifier from fingerprint groups and write a .model
# file to standard output.  Python 2 script; uses the liblinear-1.8 Python
# bindings shipped alongside this file.

import getopt
import sys
import tempfile

import numpy as np

sys.path.append("liblinear-1.8/python")
import liblinearutil as ll

import common
import impute
import parse
import vectorize

def usage(f = sys.stdout):
    """Print a usage summary to f (stdout by default).

    NOTE(review): the original layout of this help text was lost to
    whitespace mangling; the line breaks below are a reconstruction."""
    print >> f, """\
Usage: %s -s [FP_FILENAME]...
Train a classifier using the given set of features and fingerprint files.
The output is a .model file.
  -c COST                     set the regularization cost (default 1).
  -g, --group=GROUP_FILENAME  read groups from GROUP_FILENAME (required).
  -h, --help                  show this help.
  -s, --set=SET_FILENAME      use the set of features in SET_FILENAME (required).
  --scale                     scale feature vectors to the range [0, 1].\
""" % sys.argv[0]

class options (object):
    # Command-line options, filled in by the getopt loop below.
    cost = None             # -c: liblinear regularization cost C, or None for default.
    group_filename = None   # -g/--group: groups file (required).
    set_filename = None     # -s/--set: feature-set file (required).
    scale = False           # --scale: scale features to [0, 1] before training.

def make_liblinear(groups):
    """Flatten groups into liblinear training data.

    Returns (y, x) where y is a list of integer class labels (one label
    per group, in group order) and x is the parallel list of feature
    vectors (as plain lists)."""
    y = []
    x = []
    numeric_label = 0
    for group in groups:
        for features in group.features:
            y.append(numeric_label)
            x.append(list(features))
        numeric_label += 1
    return y, x

def scale(features):
    """Scale each column of the m-by-n array features to the range [0, 1],
    in place.  Negative entries mark unassigned values and are left alone;
    only values >= 0 participate in, and are affected by, the scaling.

    Returns (features, s_min, s_max) where s_min and s_max are the per-column
    minima and maxima used (both 0.0 for columns with no assigned values)."""
    m, n = features.shape
    s_min = np.zeros(n)
    s_max = np.zeros(n)
    for i in range(n):
        assigned = [x for x in features[:,i] if x >= 0]
        if assigned:
            mn = min(assigned)
            mx = max(assigned)
        else:
            mn = 0.0
            mx = 0.0
        s_min[i] = mn
        s_max[i] = mx
        # Guard against division by zero for constant columns.
        if mn == mx:
            denom = 1.0
        else:
            denom = mx - mn
        for j in range(m):
            if features[j, i] >= 0:
                features[j, i] = (features[j, i] - mn) / denom
    return features, s_min, s_max

def prepare_features(groups):
    """Impute and scale features, and assign them back to groups.

    Stacks all groups' feature vectors into one matrix so that imputation
    and scaling see the whole data set, then deals the processed rows back
    out to the groups in the same order.

    Returns the list of (min, max) scale parameters per feature when
    --scale is in effect, otherwise None."""
    feature_list = []
    for group in groups:
        for features in group.features:
            feature_list.append(features)
    feature_matrix = np.vstack(feature_list)
    feature_matrix = impute.impute(feature_matrix)
    if options.scale:
        feature_matrix, s_min, s_max = scale(feature_matrix)
        scale_params = zip(s_min, s_max)
    else:
        scale_params = None
    # Hand the processed rows back to the groups, one row per response set.
    f_i = iter(feature_matrix)
    for group in groups:
        group.features = []
        for i in range(len(group.rs_list)):
            group.features.append(f_i.next())
    return scale_params

def train(groups):
    """Train a liblinear model on the given groups and return it.

    Also runs 5-fold cross-validation on the same data and reports the
    accuracy estimate on stderr."""
    y, x = make_liblinear(groups)
    # -s 0: L2-regularized logistic regression (primal)
    # -q: quiet mode
    param_str = "-s 0 -q"
    if options.cost is not None:
        # -c cost: set the parameter C (default 1)
        param_str += " -c " + str(options.cost)
    param = ll.parameter(param_str)
    prob = ll.problem(y, x)
    model = ll.train(prob, param)
    # Re-train to get an accuracy estimate.
    param.cross_validation = True
    param.nr_fold = 5
    acc = ll.train(prob, param)
    print >> sys.stderr, "Accuracy", acc
    return model

def mean(a):
    """Like numpy.mean(a, 0), but don't treat a 1 by n [[a_1, a_2, ...]]
    array as a special case that only returns the mean of the flattened
    array."""
    h, w = a.shape
    if h == 1:
        return a[0]
    else:
        return np.mean(a, 0)

def var(a):
    """Like numpy.var(a, 0), but don't treat a 1 by n [[a_1, a_2, ...]]
    array as a special case that only returns the variance of the
    flattened array.  The variance of a single sample is undefined, so a
    1-row input yields a row of NaN."""
    h, w = a.shape
    if h == 1:
        return np.array([np.nan] * w)
    else:
        return np.vectorize(common.clamp)(np.var(a, 0))

def group_means(group):
    """Return the per-feature mean over a group's feature vectors."""
    return mean(np.array(group.features))

def group_variances(group):
    """Return the per-feature variance over a group's feature vectors."""
    # Map NaN for undefined variance to 0.0 (as if we had two identical samples).
    return np.nan_to_num(var(np.array(group.features)))

def save_model(f, feature_names, groups, model, scale_params = None):
    """Write the .model file to the open file object f.

    The file lists the feature names, optional scale parameters, the cost
    C, a per-group summary (names, CPEs, means, variances), and finally
    the serialized liblinear model.

    Fix: the original ignored f and wrote everything to stdout; output now
    goes to f as the signature promises (the caller passes sys.stdout, so
    existing behavior is unchanged)."""
    print >> f, "begin features"
    for feature_name in feature_names:
        print >> f, feature_name
    print >> f, "end features"
    if scale_params is not None:
        print >> f, "begin scale"
        for s_min, s_max in scale_params:
            print >> f, "%g,%g" % (s_min, s_max)
        print >> f, "end scale"
    print >> f
    print >> f, "c", model.param.C
    print >> f
    for group in groups:
        print >> f, "class"
        print >> f, "nmapname", group.desc.nmapname
        for nmapclass in group.desc.nmapclasses:
            print >> f, "nmapclass", parse.format_nmapclass(nmapclass.nmapclass)
            for cpe in nmapclass.cpe:
                print >> f, "cpe", cpe
        means = group_means(group)
        print >> f, "means", " ".join(str(m) for m in means)
        variances = group_variances(group)
        print >> f, "variances", " ".join(str(v) for v in variances)
    print >> f, "begin liblinear"
    # Save the model to a temporary file and read it back, because the
    # liblinear bindings can only serialize to a named file.
    model_file = tempfile.NamedTemporaryFile()
    ll.save_model(model_file.name, model)
    f.write(model_file.read())
    model_file.close()
    print >> f, "end liblinear"

opts, args = getopt.gnu_getopt(sys.argv[1:], "c:g:hs:", ["group=", "help", "set=", "scale"])
for o, a in opts:
    if o == "-c":
        options.cost = float(a)
    elif o == "-g" or o == "--group":
        options.group_filename = a
    elif o == "-h" or o == "--help":
        usage()
        sys.exit()
    elif o == "-s" or o == "--set":
        options.set_filename = a
    elif o == "--scale":
        options.scale = True

# Both the feature set and the groups file are required.
if options.set_filename is None:
    usage(sys.stderr)
    sys.exit(1)
if options.group_filename is None:
    usage(sys.stderr)
    sys.exit(1)

feature_names = parse.parse_feature_set_file(options.set_filename)
groups = parse.parse_groups_file(options.group_filename)
# Turn each group's response sets into raw feature vectors.
for group in groups:
    group.features = []
    for rs in group.rs_list:
        features = vectorize.vectorize(feature_names, rs)
        group.features.append(features)

scale_params = prepare_features(groups)

print >> sys.stderr, "Training."
model = train(groups)
save_model(sys.stdout, feature_names, groups, model, scale_params)