#!/usr/bin/env python

import getopt
import hashlib
import struct
# sys is used by usage() and main() further down; import it explicitly rather
# than relying on it leaking in through the scapy star import.
import sys

import common
import parse

from scapy.all import *

__all__ = ["vectorize", "MISSING", "UNKNOWN"]

# MISSING means that the feature was probed, but there was no response. The
# value is known to be a non-response and should not have its value imputed.
# MISSING is used, for example, when we send a probe to an open port and don't
# receive a response, or when we look for an option in a TCP packet that
# was received but is missing that option.

# UNKNOWN means that the target might have responded to a probe, or might
# not--we didn't get a chance to test. UNKNOWN values should be imputed. UNKNOWN
# is used, for example, when we lack an open port and so don't send those
# probes.

class Symbol (object):
    """A named sentinel value.

    Instances carry only a label; str() and repr() both return that label.
    No __eq__ is defined, so two Symbols compare equal only when they are the
    same object.
    """

    def __init__(self, label):
        self.label = label

    def __str__(self):
        return self.label

    def __repr__(self):
        return str(self)

MISSING = Symbol("MISSING")
UNKNOWN = Symbol("UNKNOWN")

# Features to do.
#
# IPV6_FLOWLABEL (sequence generation algorithm, like SEQ.TI, SEQ.CI, SEQ.II)
# IPV6_HOPLIMIT (guessed original)
# IPV6_NH_0, IPV6_NH_1, ..., IPV6_NH_9
# IPV6_HLEN_0, IPV6_HLEN_1, ..., IPV6_HLEN_9
# IPV6_FRAG_OFFSET
# IPV6_FRAG_ID
# IPV6_FRAG_RES
# IPV6_FRAG_RES0
# IPV6_FRAG_RES1
# IPV6_FRAG_M
#
# TCP_OFFSET
# TCP_GCD
# TCP_SP
# TCP_TS (rate)
# TCP_TS_TSVAL (zero/nonzero)
# TCP_TS_TSECR (zero/nonzero)
# TCP RST data
# TCP_SEQ (zero?, difference from S value, difference from A value)
# TCP_ACK (zero?, difference from S value, difference from A value)
#
# ICMPV6_LEN
# ICMPV6_TYPE
# ICMPV6_CODE
# ICMPV6_PARAMPROBLEM_PTR
# ICMPV6_CHECKSUM (good/bad/zero)
#
# Contents of specific extension headers.
#
# Node Information, grep for strings like "iPhone".
#
# Distinguish "no response" from "unknown." "Unknown" should not count either
# way towards matching or non-matching.

# Returns only the first reply (if any), otherwise None.
def get_reply(rs, probe_name):
    """Return the first reply stored under probe_name, or None.

    rs is queried with rs.get(probe_name). A missing entry and an empty reply
    list both give None.
    """
    packets = rs.get(probe_name)
    if packets is None or len(packets) == 0:
        return None
    return packets[0]


def find_tcp(packet):
    """Return packet.getlayer(TCP), or None when packet itself is None."""
    if packet is None:
        return None
    return packet.getlayer(TCP)


def vectorize_plen(ip, rs):
    """Return ip.plen, or UNKNOWN when there is no packet. rs is unused."""
    if ip is None:
        return UNKNOWN
    return ip.plen


def vectorize_tc(ip, rs):
    """Return ip.tc, or UNKNOWN when there is no packet. rs is unused."""
    if ip is None:
        return UNKNOWN
    return ip.tc


def vectorize_tcp_window(ip, rs):
    """Return the TCP layer's window field, or UNKNOWN without a TCP layer."""
    tcp = find_tcp(ip)
    if tcp is None:
        return UNKNOWN
    return tcp.window


def vectorize_tcp_urgp(ip, rs):
    """Return the TCP layer's urgptr field, or UNKNOWN without a TCP layer."""
    tcp = find_tcp(ip)
    if tcp is None:
        return UNKNOWN
    return tcp.urgptr


def make_vectorize_tcp_flag(flag):
    """Build a feature function that tests one bit of the TCP flags word.

    flag is a bit mask. The returned fn(ip, rs) gives UNKNOWN when ip has no
    TCP layer, otherwise 1 if the masked bit is set and 0 if it is clear.
    """
    def fn(ip, rs):
        tcp = find_tcp(ip)
        if tcp is None:
            return UNKNOWN
        # Get the reserved bits too: read the big-endian 16-bit word at byte
        # offset 12 of the serialized layer and keep its low 12 bits.
        # bytes() is str on Python 2 and the wire form on Python 3.
        flags, = struct.unpack_from(">H", bytes(tcp), 12)
        flags &= 0xfff
        return 1 if flags & flag else 0
    return fn


def tcp_options_iter(tcp):
    """Yield (type, length, value) for each option in the raw options string.

    Option types 0 and 1 are single-byte and are yielded as (type, 1, None).
    Every other option is read as type, length, then length - 2 value bytes.
    Iteration stops silently at a truncated option or a length below 2. A
    value that runs past the end of the buffer is simply cut short.
    """
    # Raw options string.
    o = tcp.get_field("options").i2m(tcp, tcp.options)
    # A bytearray indexes to integers on both Python 2 (str) and Python 3
    # (bytes), which ord(o[i]) does not. Values are still sliced from o so
    # they keep the original string type.
    octets = bytearray(o)
    end = len(octets)
    i = 0
    while i < end:
        kind = octets[i]
        if kind == 0 or kind == 1:
            i += 1
            yield (kind, 1, None)
            continue
        if i + 1 >= end:
            break
        length = octets[i + 1]
        if length < 2:
            # Bogus, bail out.
            break
        # Ignore range errors here.
        value = o[i + 2:i + length]
        i += length
        yield (kind, length, value)


def find_tcp_nth_option(tcp, n):
    """Return the n-th (0-based) tuple from tcp_options_iter, or None."""
    for index, opt in enumerate(tcp_options_iter(tcp)):
        if index == n:
            return opt
    return None


def make_vectorize_tcp_opt(n):
    """Build a feature function returning the type of the n-th TCP option.

    The returned fn(ip, rs) gives UNKNOWN without a TCP layer and MISSING
    when the packet has fewer than n + 1 options.
    """
    def fn(ip, rs):
        tcp = find_tcp(ip)
        if tcp is None:
            return UNKNOWN
        opt = find_tcp_nth_option(tcp, n)
        if opt is None:
            return MISSING
        return opt[0]
    return fn


def make_vectorize_tcp_optlen(n):
    """Build a feature function returning the length of the n-th TCP option.

    The returned fn(ip, rs) gives UNKNOWN without a TCP layer and MISSING
    when the packet has fewer than n + 1 options.
    """
    def fn(ip, rs):
        tcp = find_tcp(ip)
        if tcp is None:
            return UNKNOWN
        opt = find_tcp_nth_option(tcp, n)
        if opt is None:
            return MISSING
        return opt[1]
    return fn


def find_tcp_option(options, key):
    """Returns the value of the first matching TCP option, in Scapy's
    representation.

    options is an iterable of pairs whose first item is the option name.
    Returns MISSING when no pair matches key.
    """
    for option in options:
        if option[0] == key:
            return option[1]
    return MISSING


def vectorize_tcp_mss(ip, rs):
    """Return the "MSS" option value, MISSING if absent, UNKNOWN if no TCP."""
    tcp = find_tcp(ip)
    if tcp is None:
        return UNKNOWN
    return find_tcp_option(tcp.options, "MSS")


def vectorize_tcp_sackok(ip, rs):
    """Return 1 if the "SAckOK" option is present, MISSING if it is absent,
    and UNKNOWN when there is no TCP layer."""
    tcp = find_tcp(ip)
    if tcp is None:
        return UNKNOWN
    val = find_tcp_option(tcp.options, "SAckOK")
    # MISSING is a sentinel object, so test identity rather than equality.
    if val is not MISSING:
        val = 1
    return val


def vectorize_tcp_wscale(ip, rs):
    """Return the "WScale" option value, MISSING if absent, UNKNOWN if no
    TCP layer."""
    tcp = find_tcp(ip)
    if tcp is None:
        return UNKNOWN
    return find_tcp_option(tcp.options, "WScale")

# Shorthand tests that just need to get a value from a single response.
# Maps a feature name (the part after "PROBE.") to a function fn(ip, rs) that
# extracts the feature from one response packet.
INDIVIDUAL_TESTS = {
    "PLEN": vectorize_plen,
    "TC": vectorize_tc,
    "TCP_WINDOW": vectorize_tcp_window,
    "TCP_URGP": vectorize_tcp_urgp,
    "TCP_FLAG_F": make_vectorize_tcp_flag(1 << 0),
    "TCP_FLAG_S": make_vectorize_tcp_flag(1 << 1),
    "TCP_FLAG_R": make_vectorize_tcp_flag(1 << 2),
    "TCP_FLAG_P": make_vectorize_tcp_flag(1 << 3),
    "TCP_FLAG_A": make_vectorize_tcp_flag(1 << 4),
    "TCP_FLAG_U": make_vectorize_tcp_flag(1 << 5),
    "TCP_FLAG_E": make_vectorize_tcp_flag(1 << 6),
    "TCP_FLAG_C": make_vectorize_tcp_flag(1 << 7),
    "TCP_FLAG_RES8": make_vectorize_tcp_flag(1 << 8),
    "TCP_FLAG_RES9": make_vectorize_tcp_flag(1 << 9),
    "TCP_FLAG_RES10": make_vectorize_tcp_flag(1 << 10),
    "TCP_FLAG_RES11": make_vectorize_tcp_flag(1 << 11),
    "TCP_MSS": vectorize_tcp_mss,
    "TCP_SACKOK": vectorize_tcp_sackok,
    "TCP_WSCALE": vectorize_tcp_wscale,
}
# TCP_OPT_0 .. TCP_OPT_15 and TCP_OPTLEN_0 .. TCP_OPTLEN_15. Generator
# expressions are used so no loop variable leaks into the module namespace.
INDIVIDUAL_TESTS.update(
    ("TCP_OPT_%d" % n, make_vectorize_tcp_opt(n)) for n in range(16))
INDIVIDUAL_TESTS.update(
    ("TCP_OPTLEN_%d" % n, make_vectorize_tcp_optlen(n)) for n in range(16))


def array_constant(a):
    """If all the values in an array are the same and the array is non-empty,
    return the value. Otherwise, return None."""
    if len(a) == 0:
        return None
    v = a[0]
    for x in a[1:]:
        if x != v:
            return None
    return v


def differences_mod(a, m):
    """Return the successive differences a[i + 1] - a[i], each reduced mod m.

    The result has len(a) - 1 entries (none when a has fewer than two).
    """
    return [(b - c) % m for c, b in zip(a, a[1:])]


def gcd(m, n):
    """Return the greatest common divisor of m and n by Euclid's algorithm.

    gcd(m, 0) is m and gcd(0, 0) is 0.
    """
    while n:
        m, n = n, m % n
    return m


def array_gcd(a):
    """Return the GCD of all values in a, or None if a has fewer than two."""
    if len(a) <= 1:
        return None
    # reduce is a builtin only on Python 2; functools has it on 2.6+ and 3.
    from functools import reduce
    return reduce(gcd, a)

# Flow label sequence generation.
# We extract three features from a sequence of flow labels.
# FLOWLABEL_ALG: overall sequence generation algorithm.
#   0: constant
#   1: echoed (same as in requesting probe)
#   2: incremental
#   3: random
# FLOWLABEL_VALUE: Only set if FLOWLABEL_ALG is 0 (constant).
#   It's the constant value of the flow label.
# FLOWLABEL_INCREMENT: Only set if FLOWLABEL_ALG is 2 (incremental).
#   It's the average rate of increase of the flow labels.

FLOWLABEL_ALG_CONSTANT, FLOWLABEL_ALG_ECHOED, \
    FLOWLABEL_ALG_INCREMENTAL, FLOWLABEL_ALG_RANDOM = range(4)

# The IPv6 flow label is a 20-bit field, so wraparound arithmetic is modulo
# 2**20. (Using 0xfffff as the modulus is off by one.)
FLOWLABEL_MODULUS = 0x100000


def vectorize_flowlabel_alg(flowlabels, rs):
    """Classify a sequence of flow labels as one of the FLOWLABEL_ALG_* codes.

    Returns UNKNOWN with fewer than two labels, or when the labels vary but
    there are too few differences to take a GCD (i.e. exactly two labels).
    A constant non-zero sequence equal to rs.flow_label is ECHOED; any other
    constant sequence is CONSTANT. Otherwise the sequence is RANDOM if any
    step exceeds ten times the GCD of all steps, else INCREMENTAL.
    """
    if len(flowlabels) < 2:
        return UNKNOWN
    c = array_constant(flowlabels)
    if c is not None:
        if c == 0:
            # This takes precedence in the case that rs.flow_label == 0.
            return FLOWLABEL_ALG_CONSTANT
        if c == rs.flow_label:
            return FLOWLABEL_ALG_ECHOED
        return FLOWLABEL_ALG_CONSTANT
    diff = differences_mod(flowlabels, FLOWLABEL_MODULUS)
    g = array_gcd(diff)
    if g is None:
        return UNKNOWN
    # Any differences greater than 10 * GCD?
    if any(d > 10 * g for d in diff):
        return FLOWLABEL_ALG_RANDOM
    return FLOWLABEL_ALG_INCREMENTAL


def vectorize_flowlabel_value(flowlabels, rs):
    """Return the constant flow label value.

    UNKNOWN if the algorithm is UNKNOWN, MISSING if it is anything other than
    FLOWLABEL_ALG_CONSTANT.
    """
    alg = vectorize_flowlabel_alg(flowlabels, rs)
    if alg is UNKNOWN:
        return UNKNOWN
    if alg == FLOWLABEL_ALG_CONSTANT:
        return array_constant(flowlabels)
    return MISSING


def vectorize_flowlabel_increment(flowlabels, rs):
    """Return the average per-step increase of an incremental flow label.

    Computed as (last - first) mod 2**20 divided by the number of steps.
    UNKNOWN if the algorithm is UNKNOWN, MISSING if it is anything other than
    FLOWLABEL_ALG_INCREMENTAL.
    """
    alg = vectorize_flowlabel_alg(flowlabels, rs)
    if alg is UNKNOWN:
        return UNKNOWN
    if alg == FLOWLABEL_ALG_INCREMENTAL:
        span = (flowlabels[-1] - flowlabels[0]) % FLOWLABEL_MODULUS
        return float(span) / (len(flowlabels) - 1)
    return MISSING

OPEN_TCP_FLOWLABEL_PROBES = (
    "S1", "S2", "S3", "S4", "S5", "S6",
)
CLOSED_TCP_FLOWLABEL_PROBES = (
    "T5", "T6", "T7",
)


def extract_flowlabels(rs, probe_names):
    """Collect resp.p.fl from the first reply to each named probe, in order.

    Probes without a reply are skipped.
    """
    result = []
    for probe_name in probe_names:
        resp = get_reply(rs, probe_name)
        if resp is not None:
            result.append(resp.p.fl)
    return result


def vectorize_flowlabel_alg_tcp_open(rs):
    """FLOWLABEL_ALG over the replies to OPEN_TCP_FLOWLABEL_PROBES."""
    return vectorize_flowlabel_alg(
        extract_flowlabels(rs, OPEN_TCP_FLOWLABEL_PROBES), rs)


def vectorize_flowlabel_value_tcp_open(rs):
    """FLOWLABEL_VALUE over the replies to OPEN_TCP_FLOWLABEL_PROBES."""
    return vectorize_flowlabel_value(
        extract_flowlabels(rs, OPEN_TCP_FLOWLABEL_PROBES), rs)


def vectorize_flowlabel_increment_tcp_open(rs):
    """FLOWLABEL_INCREMENT over the replies to OPEN_TCP_FLOWLABEL_PROBES."""
    return vectorize_flowlabel_increment(
        extract_flowlabels(rs, OPEN_TCP_FLOWLABEL_PROBES), rs)


def vectorize_flowlabel_alg_tcp_closed(rs):
    """FLOWLABEL_ALG over the replies to CLOSED_TCP_FLOWLABEL_PROBES."""
    return vectorize_flowlabel_alg(
        extract_flowlabels(rs, CLOSED_TCP_FLOWLABEL_PROBES), rs)


def vectorize_flowlabel_value_tcp_closed(rs):
    """FLOWLABEL_VALUE over the replies to CLOSED_TCP_FLOWLABEL_PROBES."""
    return vectorize_flowlabel_value(
        extract_flowlabels(rs, CLOSED_TCP_FLOWLABEL_PROBES), rs)


def vectorize_flowlabel_increment_tcp_closed(rs):
    """FLOWLABEL_INCREMENT over the replies to CLOSED_TCP_FLOWLABEL_PROBES."""
    return vectorize_flowlabel_increment(
        extract_flowlabels(rs, CLOSED_TCP_FLOWLABEL_PROBES), rs)

TCP_SEQ_PROBES = (
    "S1", "S2", "S3", "S4", "S5", "S6",
)


class SeqInfo (object):
    """A TCP sequence number (seq) paired with a time value (t)."""

    def __init__(self, seq, t):
        self.seq = seq
        self.t = t


def vectorize_tcp_isr(rs):
    """Return the TCP sequence number increase per unit of send time.

    Uses the first reply to each probe in TCP_SEQ_PROBES that has a TCP
    layer. The numerator is the sum of successive sequence differences, each
    taken modulo 2**32; the denominator is the last send_time minus the
    first. Returns UNKNOWN with fewer than two usable replies, or when the
    elapsed time is zero and no rate can be computed.
    """
    seqs = []
    for probe_name in TCP_SEQ_PROBES:
        resp = get_reply(rs, probe_name)
        if resp is None:
            continue
        tcp = find_tcp(resp.p)
        if tcp is None:
            continue
        seqs.append(SeqInfo(tcp.seq, resp.send_time))
    if len(seqs) < 2:
        return UNKNOWN
    s = 0.0
    for prev, cur in zip(seqs, seqs[1:]):
        s += (cur.seq - prev.seq) & 0xffffffff
    t = seqs[-1].t - seqs[0].t
    if t == 0:
        # Identical timestamps: avoid ZeroDivisionError, the rate is unknowable.
        return UNKNOWN
    return s / t

# These functions take a whole ResponseSet.
COMBINED_TESTS = {
    "FLOWLABEL_ALG_TCP_OPEN": vectorize_flowlabel_alg_tcp_open,
    "FLOWLABEL_VALUE_TCP_OPEN": vectorize_flowlabel_value_tcp_open,
    "FLOWLABEL_INCREMENT_TCP_OPEN": vectorize_flowlabel_increment_tcp_open,
    "FLOWLABEL_ALG_TCP_CLOSED": vectorize_flowlabel_alg_tcp_closed,
    "FLOWLABEL_VALUE_TCP_CLOSED": vectorize_flowlabel_value_tcp_closed,
    "FLOWLABEL_INCREMENT_TCP_CLOSED": vectorize_flowlabel_increment_tcp_closed,
    "TCP_ISR": vectorize_tcp_isr,
}


def vectorize_unit(feature_name, rs):
    """Compute a single feature value from the response set rs.

    A name whose last dot-separated part is in INDIVIDUAL_TESTS must have the
    form "PROBE.FEATURE"; the feature function is applied to the first reply
    to PROBE (or to None if there is no reply). Otherwise the whole name is
    looked up in COMBINED_TESTS.

    Raises ValueError for a malformed or unrecognized feature name.
    """
    parts = feature_name.split(".")
    indiv = INDIVIDUAL_TESTS.get(parts[-1])
    if indiv is not None:
        # Explicit check rather than assert, which is stripped under -O.
        if len(parts) != 2:
            raise ValueError("Feature %s is not of the form PROBE.FEATURE"
                             % repr(feature_name))
        resp = get_reply(rs, parts[0])
        ip = resp.p if resp is not None else None
        return indiv(ip, rs)
    combined = COMBINED_TESTS.get(feature_name)
    if combined is not None:
        return combined(rs)
    raise ValueError("Don't know how to vectorize feature %s"
                     % repr(feature_name))


def vectorize(feature_names, response_set):
    """Return the list of feature values, one per name in feature_names."""
    return [vectorize_unit(name, response_set) for name in feature_names]

# This file also works as a debugging program to print out feature vectors.


def usage(f = sys.stdout):
    """Write the command-line help text to the file object f."""
    f.write("""\
Usage: %s -s SET_FILENAME [FP_FILENAME]
Prints feature names and value for the given fingerprint file.
  -h, --help                show this help.
  -s, --set=SET_FILENAME    use the set of features in SET_FILENAME (required).
""" % sys.argv[0])


def main():
    """Parse options, vectorize one fingerprint, and print value/name rows.

    The fingerprint is read from the single positional argument, or from
    standard input when none is given. Exits with status 1 on bad usage.
    """
    set_filename = None

    try:
        opts, args = getopt.gnu_getopt(sys.argv[1:], "hs:", ["help", "set="])
    except getopt.GetoptError as err:
        sys.stderr.write("%s\n" % err)
        usage(sys.stderr)
        sys.exit(1)
    for o, a in opts:
        if o == "-h" or o == "--help":
            usage()
            sys.exit()
        elif o == "-s" or o == "--set":
            set_filename = a

    if set_filename is None or len(args) > 1:
        usage(sys.stderr)
        sys.exit(1)

    feature_names = parse.parse_feature_set_file(set_filename)
    if len(args) == 0:
        rs = parse.parse_6fp(sys.stdin)
    else:
        rs = parse.parse_fp_file_magic(args[0])

    features = vectorize(feature_names, rs)
    for name, value in zip(feature_names, features):
        sys.stdout.write("%10s %s\n" % (str(value), name))

if __name__ == "__main__":
    main()