#!/usr/bin/env python
"""Scatter-plot the first two principal components of grouped feature vectors.

Groups are read from the file given with -g and the feature set from the file
given with -s.  One feature vector is built for every entry of each group's
``rs_list``; the vectors are imputed, optionally scaled, projected with PCA
and plotted with one colour per group.
"""

import getopt
import sys
import tempfile

import numpy as np
import matplotlib.pyplot as plt

sys.path.append("liblinear-1.8/python")
import liblinearutil as ll

import common
import impute
import parse
import vectorize

np.set_printoptions(threshold=10000)


def usage(f=sys.stdout):
    """Write the command-line help text to the file object *f*."""
    f.write("""\
Usage: %s -g GROUP_FILENAME -s SET_FILENAME [--scale]
Make a visualization of the projection of the principal components of feature
vectors.

  -g, --group=GROUP_FILENAME   read groups from GROUP_FILENAME (required).
  -s, --set=SET_FILENAME       use the set of features in SET_FILENAME (required).
  --scale                      scale feature vectors to the range [0, 1].
""" % sys.argv[0])


class options(object):
    """Namespace holding the command-line settings (filled in by main())."""
    group_filename = None
    set_filename = None
    scale = False


def scale(features):
    """Rescale the non-negative entries of each column to the range [0, 1].

    *features* is a 2-D numpy array and is modified in place.  Only entries
    that are >= 0 take part: they determine the column minimum and maximum
    and are the only entries rewritten.  Negative entries are left untouched
    (presumably they mark unassigned values -- confirm against impute).

    Returns a tuple (features, s_min, s_max) where s_min and s_max hold the
    per-column minimum and maximum that were used; both are 0.0 for a column
    without any non-negative entry.
    """
    m, n = features.shape
    s_min = np.zeros(n)
    s_max = np.zeros(n)
    for i in range(n):
        column = features[:, i]  # a view: writes go straight into features
        assigned = column >= 0
        if assigned.any():
            mn = column[assigned].min()
            mx = column[assigned].max()
        else:
            mn = 0.0
            mx = 0.0
        s_min[i] = mn
        s_max[i] = mx
        # A constant column would divide by zero; map it to all zeros instead.
        denom = 1.0 if mn == mx else mx - mn
        column[assigned] = (column[assigned] - mn) / denom
    return features, s_min, s_max


def prepare_features(groups):
    """Impute and scale features, and assign them back to groups.

    The feature vectors of all groups are stacked into one matrix, passed
    through impute.impute() and, when options.scale is set, through scale().
    The resulting rows are then handed back to the groups in the original
    order, len(group.rs_list) rows per group.

    Returns a list of (min, max) pairs, one per feature column, when scaling
    was applied, otherwise None.
    """
    feature_list = [features for group in groups for features in group.features]
    feature_matrix = np.vstack(feature_list)
    feature_matrix = impute.impute(feature_matrix)
    if options.scale:
        feature_matrix, s_min, s_max = scale(feature_matrix)
        # list() so the result is a real list on Python 3 as well.
        scale_params = list(zip(s_min, s_max))
    else:
        scale_params = None

    rows = iter(feature_matrix)
    for group in groups:
        group.features = [next(rows) for _ in group.rs_list]

    return scale_params


def do_pca(groups):
    """Project every feature vector onto the first two principal components.

    All vectors in group.features (across all groups) are stacked into one
    matrix with one vector per row, each column is standardised to zero mean
    and unit standard deviation, and the matrix is decomposed with an SVD.
    The projection of a row is the matching row of U[:, :2] * d[:2].

    The groups themselves are not modified.

    Returns a list parallel to *groups*; each element is a list with one
    coordinate array per feature vector of that group.
    """
    # Stack row by row (not group by group) so an empty group is harmless.
    rows = [row for group in groups for row in group.features]
    # Force a float copy: the centring below is done in place.
    X = np.vstack(rows).astype(float)
    X -= np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # A constant column has zero deviation; dividing by 1 keeps it at zero
    # instead of producing NaN/inf.
    std[std == 0] = 1.0
    X = np.nan_to_num(X / std)

    U, d, _ = np.linalg.svd(X, full_matrices=False)
    coords = U[:, :2] * d[:2]

    coord_rows = iter(coords)
    return [[next(coord_rows) for _ in group.features] for group in groups]


# Scatter colour for each group, looked up by the value found at
# group.desc.nmapclasses[0].nmapclass[1]; anything else is drawn in black.
COLORS = {
    "Linux": "#888888",
    "Windows": "lightgreen",
    "AIX": "blue",
    "OpenBSD": "orange",
    "FreeBSD": "coral",
    "Mac OS X": "cornflowerblue",
    "iOS": "cornflowerblue",
    "OpenIndiana": "purple",
    "OpenSolaris": "purple",
}


def main():
    """Parse the command line, build the feature vectors and show the plot."""
    try:
        opts, _ = getopt.gnu_getopt(sys.argv[1:], "g:s:",
                                    ["group=", "set=", "scale"])
    except getopt.GetoptError as err:
        # Report a bad option cleanly instead of dying with a traceback.
        sys.stderr.write("%s\n" % err)
        usage(sys.stderr)
        sys.exit(1)

    for o, a in opts:
        if o == "-g" or o == "--group":
            options.group_filename = a
        elif o == "-s" or o == "--set":
            options.set_filename = a
        elif o == "--scale":
            options.scale = True

    if options.set_filename is None or options.group_filename is None:
        usage(sys.stderr)
        sys.exit(1)

    feature_names = parse.parse_feature_set_file(options.set_filename)
    groups = parse.parse_groups_file(options.group_filename)

    for group in groups:
        group.features = [vectorize.vectorize(feature_names, rs)
                          for rs in group.rs_list]

    prepare_features(groups)
    pca_groups = do_pca(groups)

    for group, pca_group in zip(groups, pca_groups):
        if not pca_group:
            # Nothing to draw for a group without feature vectors.
            continue
        points = np.asarray(pca_group)
        color = COLORS.get(group.desc.nmapclasses[0].nmapclass[1], "black")
        plt.scatter(points[:, 0], points[:, 1], c=color)
    plt.show()


if __name__ == "__main__":
    main()