#!/usr/bin/env python
"""Print fingerprint feature vectors as a whitespace-separated table for R.

The feature names are read from the file given with -s/--set. Every
fingerprint yielded by common.find_files_or_stdin is parsed and vectorized
against those names; fingerprints without an osclass are skipped with a
message on stderr. One row is written to stdout per remaining fingerprint:
the OS class label followed by the feature values, with "NA" standing in for
missing values.
"""

import getopt
import sys

import common
import parse
import vectorize


def usage(f = sys.stdout):
    """Write the command-line help text to the file object f."""
    f.write("""\
Usage: %s -s [FP_FILENAME]...
Output feature vectors in the format used by R.

  -s, --set=SET_FILENAME    use the set of features in SET_FILENAME (required).
  --variance                omit zero-variance columns.
""" % sys.argv[0])


class options (object):
    """Namespace for the command-line settings (never instantiated)."""
    # Path of the feature-set file; required.
    set_filename = None
    # When True, columns whose values never vary are dropped from the output.
    require_variance = False


try:
    opts, args = getopt.gnu_getopt(sys.argv[1:], "s:", ["set=", "variance"])
except getopt.GetoptError as e:
    # Report bad options as a usage error instead of a traceback.
    sys.stderr.write("%s\n" % e)
    usage(sys.stderr)
    sys.exit(1)

for o, a in opts:
    if o in ("-s", "--set"):
        options.set_filename = a
    elif o == "--variance":
        options.require_variance = True

if options.set_filename is None:
    usage(sys.stderr)
    # sys.exit rather than the bare exit() helper, which only exists when the
    # site module has been loaded.
    sys.exit(1)

feature_names = parse.parse_feature_set_file(options.set_filename)

# descs[i] and fps[i] describe the same fingerprint.
descs = []
fps = []
for fp_filename, fp_f in common.find_files_or_stdin(args, "*.6fp"):
    rs = parse.parse_6fp(fp_f)
    if not rs.desc.osclass:
        sys.stderr.write("Skipping %s because of no osclass.\n" % fp_filename)
        continue
    features = vectorize.vectorize(feature_names, rs)
    descs.append(rs.desc)
    fps.append(features)


def varies(l):
    """Return True if the non-None values of l are not all equal.

    None entries (missing values) are ignored. A list with fewer than two
    remaining values never varies.
    """
    present = [x for x in l if x is not None]
    return any(x != present[0] for x in present[1:])


def remove_non_variant(fps):
    """Return a copy of fps without the columns whose values never vary.

    fps is a list of equal-length feature vectors. A column is kept only if
    varies() is true for it. An empty fps yields an empty list.
    """
    if not fps:
        # Nothing to inspect; the column count is taken from the first row.
        return []
    keep = [varies([fp[i] for fp in fps]) for i in range(len(fps[0]))]
    return [[x for b, x in zip(keep, fp) if b] for fp in fps]


if options.require_variance:
    fps = remove_non_variant(fps)

for desc, features in zip(descs, fps):
    if desc.osclass is None:
        osclass_str = "NA"
    else:
        osclass_str = "|".join(desc.osclass).replace(" ", "")
    # Fields are separated by single spaces with no trailing space, matching
    # what the Python 2 "print x," idiom produced.
    fields = ["%30s" % osclass_str]
    for feature in features:
        fields.append("%5s" % ("NA" if feature is None else "%d" % feature))
    sys.stdout.write(" ".join(fields) + "\n")