#!/usr/bin/env python
"""Tidy an nmap-service-probes file and warn about suspicious match lines.

Lines that start with "match " or "softmatch " are parsed into templates
(m, p, v, i, d, o, h, cpe:), checked, re-ordered and re-delimited.  Every
other line is copied to the output (after stripping surrounding
whitespace).  Warnings go to stderr, prefixed with the input line number.

The code runs unchanged on Python 2.6+ and Python 3.
"""

import getopt
import itertools
import re
import sys
import urllib

try:
    from urllib.parse import unquote  # Python 3
except ImportError:
    from urllib import unquote  # Python 2

# Language names as they are expected to appear in i// templates, mapped to
# the language code used in the last CPE field, and the reverse mapping.
LANG2CD = {
    'Spanish': 'es',
    'Danish': 'da',
    'Portuguese': 'pt',
    'Italian': 'it',
    'English': 'en',
    'Czech': 'cs',
    'Polish': 'pl',
    'German': 'de',
    'French': 'fr',
}
CD2LANG = dict((v, k) for (k, v) in LANG2CD.items())


def usage(f=None):
    """Write the help text to file object f (sys.stdout when omitted)."""
    if f is None:
        f = sys.stdout
    # The delimiter orders listed here mirror DELIMITER_ORDER and
    # DELIMITER_ORDER_M below.
    f.write("""\
Usage: %s nmap-service-probes > nmap-service-probes.new
Tidies an nmap-service-probes file and checks for errors.

Tidies:
* Removes excess whitespace.
* Sorts templates in the order m p v i d o h cpe:.
* Canonicalizes template delimiters in the order / | = %% @ #
  (| = %% / @ # for m).
* Optionally removes templates with the --strip option.
Warns about:
* Unknown template types in match lines.
* Duplicate template types (except cpe:).

  -h, --help     Show this help.
  -n, --dry-run  Don't write an output file, just issue warnings.
  --strip=TYPE   Strip templates of TYPE (e.g. --strip=cpe:). This option
                 accumulates.
""" % sys.argv[0])


def usage_error():
    """Write the help text to stderr and exit with status 1."""
    usage(sys.stderr)
    sys.exit(1)


class options (object):
    """Namespace for the command-line settings; filled in by main()."""
    # True when -n/--dry-run was given: only warn, print no output lines.
    dry_run = False
    # Template types named by --strip; MatchLine.tidy() removes them.
    strip_types = set()


# Example of a match line:
# match aplus m|^\x01\xff\0\xff\x01\x1d\0\xfd\0\n\x03\x05A\+ API \(([\d.]+)\) - CCS \(([\d.]+)\)\0| p/Cleo A+/ i/API $1; CSS $2/

def grab_word(s):
    """Split the first whitespace-delimited word off the front of s.

    Returns (word, rest).  word is None when s is empty or all whitespace;
    rest keeps the whitespace that follows the word.
    """
    n = len(s)
    i = 0
    while i < n and s[i].isspace():
        i += 1
    if i >= n:
        return None, s[i:]
    j = i + 1
    while j < n and not s[j].isspace():
        j += 1
    return s[i:j], s[j:]


def is_typechar(c):
    """Return True if c may appear in a template type name (e.g. "cpe:")."""
    return c.isalpha() or c == ":"


def is_delimchar(c):
    """Return True if c may serve as a template delimiter."""
    return not c.isspace()


def is_flagchar(c):
    """Return True if c may appear in the flags after a template body."""
    return c.isalpha()


def grab_template(s):
    """Parse one template (type, delimiter, body, delimiter, flags) from s.

    Returns (template, rest) on success.  Returns (None, "") when s holds
    nothing but whitespace.  When a template starts but cannot be parsed
    (no delimiter, unterminated body, junk after the flags) the result is
    (None, unparsed) where unparsed begins at the first character of the
    failed template, so the caller can tell the failure apart from a clean
    end of line instead of silently losing the text.
    """
    n = len(s)
    start = 0
    while start < n and s[start].isspace():
        start += 1
    if start >= n:
        return None, ""
    failed = (None, s[start:])

    # Type name, then the delimiter character.
    j = start
    while j < n and is_typechar(s[j]):
        j += 1
    if j >= n or not is_delimchar(s[j]):
        return failed
    template = MatchTemplate()
    template.type = s[start:j]
    delim = s[j]

    # Body: everything up to the next occurrence of the delimiter.
    j += 1
    i = j
    while j < n and s[j] != delim:
        j += 1
    if j >= n:
        return failed
    template.body = s[i:j]

    # Flags: letters directly after the closing delimiter; they must be
    # followed by whitespace or the end of the string.
    j += 1
    i = j
    while j < n and is_flagchar(s[j]):
        j += 1
    if not (j >= n or s[j].isspace()):
        return failed
    template.flags = s[i:j]
    return template, s[j:]


# Order in which to sort templates. They are additionally sorted alphabetically
# by their contents, for cpe: which may appear more than once.
TEMPLATE_TYPE_ORDER = ["m", "p", "v", "i", "d", "o", "h", "cpe:"]
# Reverse index by type.
TEMPLATE_TYPE_ORDER_INDEX = dict(
    (t, i) for (i, t) in enumerate(TEMPLATE_TYPE_ORDER))
# Try delimiters in this order.
DELIMITER_ORDER = "/|=%@#"
# But m has different preferences (m|| is a strong convention).
DELIMITER_ORDER_M = "|=%/@#"
# Only these template types are allowed to appear more than once.
DUPLICATES_OKAY = set(["cpe:"])


def cmp_none(a, b):
    """Compare two objects, with None being less than anything else.

    Returns -1, 0 or 1.
    """
    if a is None:
        return 0 if b is None else -1
    if b is None:
        return 1
    return (a > b) - (a < b)


def delim_for(type, body):
    """Return the preferred delimiter that does not occur in body.

    Templates of type "m" use DELIMITER_ORDER_M, all others use
    DELIMITER_ORDER.  Raises ValueError when body contains every candidate.
    """
    order = DELIMITER_ORDER_M if type == "m" else DELIMITER_ORDER
    for delim in order:
        if delim not in body:
            return delim
    raise ValueError("Exhausted delimiter list for %r" % body)


class MatchTemplate (object):
    """One template of a match line, e.g. p/Cleo A+/ or m|^foo|i."""

    def __init__(self):
        self.type = None   # Type name, e.g. "m", "p" or "cpe:".
        self.body = None   # Text between the delimiters.
        self.flags = None  # Letters after the closing delimiter.

    def _sort_key(self):
        """Key giving the output order: type rank first, then body.

        Unknown types get rank -1 so they sort before all known types.
        """
        return (TEMPLATE_TYPE_ORDER_INDEX.get(self.type, -1), self.body)

    def __lt__(self, other):
        # Used by sort()/sorted() on both Python 2 and Python 3.
        return self._sort_key() < other._sort_key()

    def __cmp__(self, other):
        # Only consulted by Python 2; kept so cmp() on templates still works.
        a = self._sort_key()
        b = other._sort_key()
        return (a > b) - (a < b)

    def __str__(self):
        delim = delim_for(self.type, self.body)
        return "%(type)s%(delim)s%(body)s%(delim)s%(flags)s" % {
            "type": self.type,
            "delim": delim,
            "body": self.body,
            "flags": self.flags,
        }


class MiscLine (object):
    """A line that is copied verbatim to the output."""

    def __init__(self, lineno, line):
        self.lineno = lineno
        self.line = line

    def warn(self, msg):
        """Write msg to stderr, prefixed with this line's number."""
        sys.stderr.write("%d: %s\n" % (self.lineno, msg))

    def check(self):
        """Nothing to check for a verbatim line."""
        pass

    def tidy(self):
        """Nothing to tidy for a verbatim line."""
        pass

    def __str__(self):
        return self.line


def group_templates(templates):
    """Returns a list of tuples: (type, [templates...]) where [templates...]
    are all the templates of that type.

    Grouping is done over the sorted templates, so only templates that end
    up adjacent after sorting are merged into one group.
    """
    return [(type_name, list(members))
            for type_name, members
            in itertools.groupby(sorted(templates), key=lambda t: t.type)]


def find_duplicates(templates):
    """Return the groups from group_templates() that hold 2+ templates."""
    return [g for g in group_templates(templates) if len(g[1]) > 1]


def get_cstring(body, p):
    """Read a double-quoted string that starts at body[p].

    Doesn't unescape the returned string.  Returns (next_p, contents) where
    next_p is the index just past the closing quote.  Raises ValueError when
    body[p] is not a quote or the string is not terminated.
    """
    n = len(body)
    if p >= n or body[p] != "\"":
        raise ValueError("cstring must start with '\"'")
    p += 1
    begin = p
    while p < n and body[p] != "\"":
        if body[p] == "\\":
            # Skip the escaped character so an escaped quote doesn't end
            # the string.
            p += 1
        p += 1
    if p >= n:
        raise ValueError("Unterminated cstring in %r" % body)
    return p + 1, body[begin:p]


def get_substcommand(body, p):
    """Parse a substitution command such as P(4) starting at body[p].

    p is the index just after the '$'.  Returns (next_p, command) where
    command is a tuple (name, arg, ...) whose arguments are ints or
    (still escaped) strings.  Raises ValueError on malformed or truncated
    input.
    """
    q = body.find("(", p)
    if q == -1:
        raise ValueError("No '(' following '$' in subst command")
    command_name = body[p:q]
    args = []
    p = q + 1
    try:
        while True:
            while body[p].isspace():
                p += 1
            if body[p] == ")":
                p += 1
                break
            if body[p] == "\"":
                # Argument is a string.
                p, arg = get_cstring(body, p)
            else:
                # Must be an integer; int() raises ValueError otherwise.
                q = p
                while body[q].isdigit():
                    q += 1
                arg = int(body[p:q])
                p = q
            args.append(arg)
            if body[p] == ",":
                p += 1
    except IndexError:
        # Ran off the end of body before the closing ')'.
        raise ValueError("Unterminated subst command in %r" % body)
    return p, (command_name,) + tuple(args)


def parse_template(body):
    """Returns a sequence of either literal strings, or substitution tuples
    like
        $1                -> (None, 1)
        $P(4)             -> ("P", 4)
        $SUBST(3,"_",".") -> ("SUBST", 3, "_", ".")
    Compare dotmplsubst in service_scan.cc.

    Raises ValueError on a malformed substitution, including a '$' that is
    the last character of body.
    """
    seq = []
    n = len(body)
    p = 0
    while p < n:
        q = body.find("$", p)
        if q == -1:
            seq.append(body[p:])
            break
        if q > p:
            seq.append(body[p:q])
        # Skip over '$'.
        p = q + 1
        if p >= n:
            raise ValueError("'$' at end of template %r" % body)
        if body[p].isdigit():
            # Simple variable like $1 (a single digit).
            seq.append((None, int(body[p])))
            p += 1
        else:
            # Command like $P(4).
            p, op = get_substcommand(body, p)
            seq.append(op)
    return seq


def grab_captures(body):
    """Return the set of capture numbers referenced by template body."""
    captures = set()
    for elem in parse_template(body):
        if not isinstance(elem, tuple):
            # Literal text.
            continue
        # Assume that all integer args refer to a capture. This is the case
        # for P and SUBST, the only two subst commands supported.
        for arg in elem[1:]:
            if isinstance(arg, int):
                captures.add(arg)
    return captures


class MatchLine (MiscLine):
    """A "match" or "softmatch" line, split into service name and templates."""

    def __init__(self, lineno, line):
        """Parse line; raises ValueError when it is not a valid match line."""
        self.lineno = lineno
        self.match, line = grab_word(line)
        if self.match not in ("match", "softmatch"):
            raise ValueError(
                "expected \"match\" or \"softmatch\", got %r" % self.match)
        self.service, line = grab_word(line)
        if self.service is None:
            raise ValueError("missing service name")
        self.templates = []
        while True:
            template, line = grab_template(line)
            if template is None:
                if line != "":
                    raise ValueError("can't parse template at %r" % line)
                break
            self.templates.append(template)

    def check(self):
        """Warn about unknown or duplicate templates, an unparsable m regex,
        CPE/i language mismatches, and captures that are used but not
        defined or defined but not used."""
        # Check for unknown template types.
        for t in self.templates:
            if t.type not in TEMPLATE_TYPE_ORDER_INDEX:
                self.warn("unknown template \"%s\"." % t.type)

        for type_name, templates in find_duplicates(self.templates):
            if type_name not in DUPLICATES_OKAY:
                self.warn("duplicate templates \"%s\": %s"
                          % (type_name, " ".join(str(t) for t in templates)))

        m_template = self.lookup_first_template("m")
        if m_template is None:
            self.warn("no m template.")
            return

        # Let's hope Python regexes are close enough to PCRE.
        try:
            m_re = re.compile(m_template.body)
        except re.error as e:
            delim = delim_for(m_template.type, m_template.body)
            self.warn("can't parse m regex (%s): %s%s%s"
                      % (e, delim, m_template.body, delim))
            return

        num_captures = m_re.groups
        have_captures = set(range(1, num_captures + 1))
        used_captures = set()

        # Language named in the first i// template, if any; compared with
        # the language field of every cpe: template below.
        i_template = self.lookup_first_template("i")
        i_lang = None
        if i_template is not None:
            i_lang = grep_language(i_template.body)

        for t in self.templates:
            if t.type == "cpe:":
                try:
                    parsedcpe = parse_cpe(t.body)
                except ValueError as errmsg:
                    self.warn("can't parse CPE: %s" % errmsg)
                    continue

                # If we have a language let's check the i// template.
                if parsedcpe["language"]:
                    try:
                        neededlang = CD2LANG[parsedcpe["language"]]
                    except KeyError:
                        self.warn("unknown language code \"%s\" in CPE: %s/%s/"
                                  % (parsedcpe["language"], t.type, t.body))
                        continue
                    if (i_template is None
                            or neededlang not in i_template.body):
                        self.warn("CPE has language code \"%s\" but i template is missing \"%s\""
                                  % (parsedcpe["language"], neededlang))
                if (i_lang is not None
                        and LANG2CD[i_lang] != parsedcpe["language"]):
                    self.warn("i template has language \"%s\" but cpe:/%s/ is missing \"%s\""
                              % (i_lang, t.body, LANG2CD[i_lang]))
            if t.type == "m":
                # The regex defines captures; it doesn't use them.
                continue
            try:
                captures = grab_captures(t.body)
            except Exception:
                # Say which template is at fault, then let the error out.
                self.warn("can't get captures from %r" % t.body)
                raise
            used_captures.update(captures)

        for c in sorted(used_captures - have_captures):
            self.warn("used capture $%d (only have %d)" % (c, num_captures))
        for c in sorted(have_captures - used_captures):
            self.warn("unused capture $%d (of %d)" % (c, num_captures))

    def tidy(self):
        """Drop templates selected by --strip and sort the remainder."""
        # Remove templates specified by --strip.
        self.templates = [t for t in self.templates
                          if t.type not in options.strip_types]
        # Put in order.
        self.templates.sort()

    def lookup_first_template(self, type):
        """Find the first template of the given type, or None if no match
        exists."""
        for t in self.templates:
            if t.type == type:
                return t
        return None

    def __str__(self):
        return (self.match + " " + self.service + " "
                + " ".join(str(t) for t in self.templates))


def parse_line(lineno, line):
    """Return a MatchLine for match/softmatch lines, else a MiscLine.

    A match line that cannot be parsed is reported on stderr and returned
    as a MiscLine, so it reaches the output verbatim instead of being
    truncated or aborting the run.
    """
    if line.startswith(("match ", "softmatch ")):
        try:
            return MatchLine(lineno, line)
        except ValueError as e:
            verbatim = MiscLine(lineno, line)
            verbatim.warn("can't parse match line, copying verbatim: %s" % e)
            return verbatim
    return MiscLine(lineno, line)


def parse_cpe(cpe):
    """Split a CPE body (the text between the delimiters of a cpe:
    template) into its colon-separated fields.

    Each field is URL-unquoted.  Returns a dict with the keys "part",
    "vendor", "product", "version", "update", "edition" and "language";
    fields that are absent are "".  Raises ValueError when there are more
    than seven fields.
    """
    names = ("part", "vendor", "product", "version", "update", "edition",
             "language")
    fields = [unquote(field) for field in cpe.split(':')]
    if len(fields) > len(names):
        raise ValueError("Too many fields in this cpe %s" % cpe)
    # If we don't have all the fields filled in, add the rest as blanks.
    fields.extend([''] * (len(names) - len(fields)))
    # Store them in a dictionary for ease of future lookup.
    return dict(zip(names, fields))


def grep_language(maintext):
    """Return a language name from LANG2CD that occurs in maintext, or None.

    When several names occur, which one is returned follows the iteration
    order of LANG2CD.
    """
    for name in LANG2CD:
        if name in maintext:
            return name
    return None


def main(argv=None):
    """Parse the command line, then check and tidy the input line by line.

    argv defaults to sys.argv[1:].  Input comes from the single file named
    on the command line, or from stdin when no file is named.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        opts, args = getopt.gnu_getopt(argv, "hn",
                                       ["dry-run", "help", "strip="])
    except getopt.GetoptError as e:
        sys.stderr.write("%s\n" % e)
        usage_error()

    for o, a in opts:
        if o == "-h" or o == "--help":
            usage()
            sys.exit()
        elif o == "-n" or o == "--dry-run":
            options.dry_run = True
        elif o == "--strip":
            options.strip_types.update(a.split(","))

    if len(args) == 0:
        input_file = sys.stdin
    elif len(args) == 1:
        input_file = open(args[0])
    else:
        usage_error()

    try:
        for lineno, line in enumerate(input_file, 1):
            l = parse_line(lineno, line.strip())
            l.check()
            l.tidy()
            if not options.dry_run:
                sys.stdout.write(str(l) + "\n")
    finally:
        if input_file is not sys.stdin:
            input_file.close()


if __name__ == "__main__":
    main()