#!/usr/bin/env python

import getopt
import re
import sys
import urllib

# Language names that are searched for in i// template text, mapped to the
# language code expected in the language field of a CPE.
LANG2CD = {
    'Spanish': 'es',
    'Danish': 'da',
    'Portuguese': 'pt',
    'Italian': 'it',
    'English': 'en',
    'Czech': 'cs',
    'Polish': 'pl',
    'German': 'de',
    'French': 'fr',
}
# Reverse lookup: language code -> language name.
CD2LANG = dict((code, name) for (name, code) in LANG2CD.items())


def usage(f=None):
    """Write the command-line help text to the file-like object f.

    f defaults to sys.stdout, looked up at call time so that a stdout
    replaced after this module was loaded is respected. The text is emitted
    with write() rather than the print statement so the function parses on
    both Python 2 and Python 3.
    """
    if f is None:
        f = sys.stdout
    # "%%" is a literal percent sign; the only substitution is the program name.
    f.write("""\
Usage: %s nmap-service-probes > nmap-service-probes.new

Tidies an nmap-service-probes file and checks for errors.
Tidies:
 * Removes excess whitespace.
 * Sorts templates in the order m p v i d o h cpe:.
 * Canonicalizes template delimiters, using the first of / | = %% @ #
   (| = %% / @ # for m templates) that does not occur in the template body.
 * Optionally removes templates with the --strip option.
Warns about:
 * Unknown template types in match lines.
 * Duplicate template types (except cpe:).
 * m regexes that cannot be parsed, and captures that are unused or
   out of range.
 * CPE language codes that disagree with the i template.

  -h, --help        Show this help.
  -n, --dry-run     Don't write an output file, just issue warnings.
      --strip=TYPE  Strip templates of TYPE (e.g. --strip=cpe:).
                    This option accumulates.
""" % sys.argv[0])

def usage_error():
    """Print the usage message to standard error, then exit with status 1."""
    usage(f=sys.stderr)
    raise SystemExit(1)

class options (object):
    """Holder for command-line settings, read and written through the class itself."""
    # Template types to remove from match lines; filled in by --strip.
    strip_types = set()
    # Set by -n / --dry-run: issue warnings but write no output.
    dry_run = False

# Parse the command line. An unrecognized option or a missing option argument
# is reported together with the usage message, rather than escaping as an
# uncaught getopt.GetoptError traceback.
try:
    opts, args = getopt.gnu_getopt(sys.argv[1:], "hn", ["dry-run", "help", "strip="])
except getopt.GetoptError as e:
    sys.stderr.write("%s: %s\n" % (sys.argv[0], e))
    usage_error()
for o, a in opts:
    if o in ("-h", "--help"):
        usage()
        sys.exit()
    elif o in ("-n", "--dry-run"):
        options.dry_run = True
    elif o == "--strip":
        # --strip may be repeated, and each use may carry a comma-separated list.
        options.strip_types.update(a.split(","))

# Read from the named file, or from standard input when no file is given.
if len(args) == 0:
    input_file = sys.stdin
elif len(args) == 1:
    input_file = open(args[0])
else:
    usage_error()

# match aplus m|^\x01\xff\0\xff\x01\x1d\0\xfd\0\n\x03\x05A\+ API \(([\d.]+)\) - CCS \(([\d.]+)\)\0| p/Cleo A+/ i/API $1; CSS $2/

def grab_word(s):
    """Split the first whitespace-delimited word off the front of s.

    Returns (word, rest), where rest is everything after the word (leading
    whitespace of rest is kept). If s is empty or all whitespace, returns
    (None, "").
    """
    size = len(s)
    # Index of the first non-whitespace character, if any.
    start = next((k for k in range(size) if not s[k].isspace()), None)
    if start is None:
        return None, s[size:]
    # Index of the first whitespace character after the word, or end of string.
    stop = next((k for k in range(start + 1, size) if s[k].isspace()), size)
    return s[start:stop], s[stop:]

def is_typechar(c):
    """Return True if c may appear in a template type name (a letter or ':')."""
    if c == ":":
        return True
    return c.isalpha()

def is_delimchar(c):
    """Return True if c can serve as a template delimiter (any non-whitespace)."""
    if c.isspace():
        return False
    return True

def is_flagchar(c):
    """Return True if c may appear in the flags after a template (a letter)."""
    letter = c.isalpha()
    return letter

def grab_template(s):
    """Parse one template (type, delimited body, flags) from the front of s.

    Returns (template, rest) on success, where template is a MatchTemplate
    and rest is the unparsed remainder of s. Returns (None, rest) when no
    template could be parsed; rest is then the text from the point where
    parsing stopped (empty if the end of s was reached).
    """
    end = len(s)

    def advance(pos, keep_going):
        # Move pos forward while the character under it satisfies keep_going.
        while pos < end and keep_going(s[pos]):
            pos += 1
        return pos

    # Leading whitespace.
    type_start = advance(0, lambda c: c.isspace())
    if type_start >= end:
        return None, s[type_start:]

    # Type name, which must be followed by a delimiter character.
    type_end = advance(type_start, is_typechar)
    if type_end >= end or not is_delimchar(s[type_end]):
        return None, s[type_end:]
    delim = s[type_end]

    # Body: everything up to the next occurrence of the same delimiter.
    body_start = type_end + 1
    body_end = advance(body_start, lambda c: c != delim)
    if body_end >= end:
        return None, s[body_end:]

    # Flags, which must be followed by whitespace or the end of the string.
    flags_start = body_end + 1
    flags_end = advance(flags_start, is_flagchar)
    if flags_end < end and not s[flags_end].isspace():
        return None, s[flags_end:]

    template = MatchTemplate()
    template.type = s[type_start:type_end]
    template.body = s[body_start:body_end]
    template.flags = s[flags_start:flags_end]
    return template, s[flags_end:]

# Canonical order of templates within a match line. Templates of the same type
# (cpe: is the one that may appear more than once) are additionally sorted
# alphabetically by their contents.
TEMPLATE_TYPE_ORDER = ["m", "p", "v", "i", "d", "o", "h", "cpe:"]
# Maps each template type to its position in TEMPLATE_TYPE_ORDER.
TEMPLATE_TYPE_ORDER_INDEX = dict(
    (name, position) for (position, name) in enumerate(TEMPLATE_TYPE_ORDER)
)

# Delimiter candidates, most preferred first.
DELIMITER_ORDER = "/|=%@#"
# m templates have their own preference order (m|| is a strong convention).
DELIMITER_ORDER_M = "|=%/@#"

# Template types that may appear more than once without a warning.
DUPLICATES_OKAY = set(["cpe:"])

def cmp_none(a, b):
    """Compare two objects, with None being less than anything else.

    Returns a negative number, zero, or a positive number when a is
    respectively less than, equal to, or greater than b. Two Nones compare
    equal.
    """
    if a is None and b is None:
        return 0
    if a is None:
        return -1
    if b is None:
        return 1
    # Same result as the Python 2 builtin cmp(a, b) for ordered values, but
    # written without cmp(), which does not exist on Python 3.
    return (a > b) - (a < b)

def delim_for(type, body):
    """Choose the delimiter to print around body for a template of this type.

    The candidates are tried in DELIMITER_ORDER_M for "m" templates and in
    DELIMITER_ORDER for every other type; the first candidate that does not
    occur in body is returned.

    Raises ValueError if every candidate occurs in body. This is a real
    exception rather than an assert so that running under "python -O" cannot
    make the function fall through and return None.
    """
    order = DELIMITER_ORDER_M if type == "m" else DELIMITER_ORDER

    for delim in order:
        if delim not in body:
            return delim
    raise ValueError("Exhausted delimiter list for %r" % body)

class MatchTemplate (object):
    """One template of a match line, for example m|...|s or p/.../.

    Attributes (all None until assigned by the parser):
      type   the template type, such as "m", "p", or "cpe:"
      body   the text between the delimiters
      flags  the letters following the closing delimiter
    """
    def __init__(self):
        self.type = None
        self.body = None
        self.flags = None

    def _sort_key(self):
        """Return a tuple that orders templates canonically.

        Types missing from TEMPLATE_TYPE_ORDER_INDEX sort before all known
        types. The type name is part of the key so that templates of the same
        unknown type end up next to each other instead of being interleaved
        with other unknown types purely by body.
        """
        index = TEMPLATE_TYPE_ORDER_INDEX.get(self.type)
        if index is None:
            return (0, 0, self.type, self.body)
        return (1, index, self.type, self.body)

    def __cmp__(self, other):
        # Three-way comparison used by Python 2.
        mine = self._sort_key()
        theirs = other._sort_key()
        return (mine > theirs) - (mine < theirs)

    def __lt__(self, other):
        # Rich comparison used by sort() and sorted(); Python 3 ignores __cmp__.
        return self._sort_key() < other._sort_key()

    def __str__(self):
        """Format the template using the preferred delimiter for its body."""
        delim = delim_for(self.type, self.body)
        return "%(type)s%(delim)s%(body)s%(delim)s%(flags)s" % {
            "type": self.type, "delim": delim, "body": self.body, "flags": self.flags
        }

class MiscLine (object):
    """A line that is copied verbatim to the output."""
    def __init__(self, lineno, line):
        # lineno is the number printed in front of warnings; line is the text.
        self.lineno = lineno
        self.line = line

    def warn(self, msg):
        """Write msg to standard error, prefixed with this line's number."""
        # write() instead of the print statement so this parses on Python 3 too.
        sys.stderr.write("%d: %s\n" % (self.lineno, msg))

    def check(self):
        """Nothing is checked for a verbatim line."""
        pass

    def tidy(self):
        """A verbatim line is left as it is."""
        pass

    def __str__(self):
        return self.line

def group_templates(templates):
    """Returns a list of tuples: (type, [templates...]) where [templates...] are
    all the templates of that type.

    Groups appear in the order in which their type is first met in
    sorted(templates), and the templates inside a group keep their sorted
    order. Every template of a type lands in the same group even when the
    sort does not place them next to each other, so each type appears in the
    result exactly once."""
    groups = []
    # Maps a type to the index of its group in groups.
    position = {}
    for t in sorted(templates):
        if t.type in position:
            groups[position[t.type]][1].append(t)
        else:
            position[t.type] = len(groups)
            groups.append((t.type, [t]))
    return groups

def find_duplicates(templates):
    """Return the (type, [templates...]) groups holding more than one template."""
    duplicated = []
    for group in group_templates(templates):
        members = group[1]
        if len(members) > 1:
            duplicated.append(group)
    return duplicated

def get_cstring(body, p):
    """Read a double-quoted string from body, starting at index p.

    Returns (next_p, contents), where next_p is the index just past the
    closing quote and contents is the text between the quotes. A backslash
    makes the following character part of the string (so \\" does not end
    it). Doesn't unescape the returned string.

    Raises ValueError if body[p] is not a double quote or if the closing
    quote is missing.
    """
    size = len(body)
    if p >= size or body[p] != "\"":
        raise ValueError("cstring must start with '\"'")
    p += 1
    begin = p
    while p < size and body[p] != "\"":
        if body[p] == "\\":
            # Skip the escaped character as well.
            p += 1
        p += 1
    if p >= size:
        raise ValueError("unterminated cstring in %r" % body)
    end = p

    return end + 1, body[begin:end]

def get_substcommand(body, p):
    """Parse a substitution command such as P(4) or SUBST(3,"_",".").

    p is the index in body just after the '$'. Returns (next_p, command),
    where next_p is the index just past the closing ')' and command is a
    tuple of the command name followed by its arguments; string arguments
    are kept as read by get_cstring and all other arguments are converted
    to int.

    Raises ValueError if there is no '(' after p, if an argument is neither
    a quoted string nor a run of digits, or if body ends before the closing
    ')'.
    """
    q = body.find("(", p)
    if q == -1:
        raise ValueError("No '(' following '$' in subst command")
    command_name = body[p:q]

    args = []
    size = len(body)

    p = q + 1
    while True:
        while p < size and body[p].isspace():
            p += 1
        if p >= size:
            raise ValueError("No ')' closing subst command in %r" % body)
        if body[p] == ")":
            p += 1
            break
        if body[p] == "\"":
            # Argument is a string.
            p, arg = get_cstring(body, p)
            args.append(arg)
        else:
            # Must be an integer.
            q = p
            while q < size and body[q].isdigit():
                q += 1
            if q == p:
                raise ValueError("Expected an integer or string argument at offset %d in %r" % (p, body))
            args.append(int(body[p:q]))
            p = q
        if p < size and body[p] == ",":
            p += 1

    return p, (command_name,) + tuple(args)

def parse_template(body):
    """Returns a sequence of either literal strings, or substitution tuples like
        $1                -> (None, 1)
        $P(4)             -> ("P", 4)
        $SUBST(3,"_",".") -> ("SUBST", 3, "_", ".")
    Compare dotmplsubst in service_scan.cc.

    A simple variable is '$' followed by exactly one digit; any further
    digits are literal text. Raises ValueError if body ends with '$' or if a
    substitution command is malformed."""

    seq = []

    p = 0
    while p < len(body):
        q = body.find("$", p)
        if q == -1:
            # No more substitutions; the rest is literal text.
            seq.append(body[p:])
            break
        if q > p:
            seq.append(body[p:q])

        # Skip over '$'.
        p = q + 1
        if p >= len(body):
            raise ValueError("'$' at end of template %r" % body)
        if body[p].isdigit():
            # Simple variable like $1.
            seq.append((None, int(body[p])))
            p += 1
        else:
            # Command like $P(4).
            p, op = get_substcommand(body, p)
            seq.append(op)
    return seq

def grab_captures(body):
    """Return the set of capture numbers referenced by a template body.

    Any exception raised by parse_template for a malformed body propagates
    to the caller.
    """
    captures = set()

    for elem in parse_template(body):
        # parse_template yields literal strings and substitution tuples; only
        # the tuples refer to captures. Testing for tuple instead of
        # basestring keeps this working on Python 3, which has no basestring.
        if not isinstance(elem, tuple):
            continue
        # Assume that all integer args refer to a capture. This is the case for
        # P and SUBST, the only two subst commands supported.
        for arg in elem[1:]:
            if isinstance(arg, int):
                captures.add(arg)
    return captures

class MatchLine (MiscLine):
    """A "match" or "softmatch" line, split into its service name and templates.

    Attributes:
      lineno     line number used as the prefix of warnings
      match      the directive, "match" or "softmatch"
      service    the service name that follows the directive
      templates  list of MatchTemplate objects, in input order until tidy()
    """
    def __init__(self, lineno, line):
        """Parse line into directive, service, and templates.

        Raises ValueError if the directive is not "match" or "softmatch", if
        the service name is missing, or if any of the remaining text cannot
        be parsed as a template. Explicit exceptions are used instead of
        asserts so the checks still run under "python -O".
        """
        self.lineno = lineno

        self.match, line = grab_word(line)
        if self.match not in ("match", "softmatch"):
            raise ValueError("%d: expected \"match\" or \"softmatch\", got %r" % (lineno, self.match))

        self.service, line = grab_word(line)
        if self.service is None:
            raise ValueError("%d: missing service name" % lineno)

        self.templates = []
        # grab_template returns (None, "") both when nothing is left and when
        # it runs off the end of an unterminated template, so test for
        # leftover text here; otherwise a truncated trailing template would
        # be dropped from the output without any notice.
        while line.strip():
            remaining = line
            template, line = grab_template(remaining)
            if template is None:
                raise ValueError("%d: can't parse template in %r" % (lineno, remaining.strip()))
            self.templates.append(template)

    def check(self):
        """Issue warnings about this line through warn().

        Covers unknown and duplicate template types, a missing or unparseable
        m regex, CPE/i-template language mismatches, and captures that are
        used but not defined or defined but not used. An exception raised
        while extracting captures from a template is reported and re-raised.
        """
        # Check for unknown template types.
        for t in self.templates:
            if t.type not in TEMPLATE_TYPE_ORDER_INDEX:
                self.warn("unknown template \"%s\"." % t.type)
        for dup_type, templates in find_duplicates(self.templates):
            if dup_type not in DUPLICATES_OKAY:
                self.warn("duplicate templates \"%s\": %s" % (dup_type, " ".join(str(t) for t in templates)))

        m_template = self.lookup_first_template("m")
        if m_template is None:
            self.warn("no m template.")
            return

        # Let's hope Python regexes are close enough to PCRE.
        try:
            m_re = re.compile(m_template.body)
        except re.error as e:
            delim = delim_for(m_template.type, m_template.body)
            self.warn("can't parse m regex (%s): %s%s%s" % (e,
                delim, m_template.body, delim))
            return
        num_captures = m_re.groups

        have_captures = set(range(1, num_captures + 1))
        used_captures = set()
        i_template = self.lookup_first_template("i")
        # Language named in the i// template, if any, to compare against the
        # language field of each cpe: template.
        i_lang = None
        if i_template is not None:
            i_lang = grep_language(i_template.body)
        for t in self.templates:
            if t.type == "m":
                continue
            if t.type == "cpe:":
                self._check_cpe(t, i_template, i_lang)
            # Captures are collected even from a cpe: template that produced
            # a warning above, so its captures are not reported as unused.
            try:
                captures = grab_captures(t.body)
            except Exception:
                self.warn("can't get captures from %r" % t.body)
                raise
            used_captures.update(captures)

        for c in sorted(used_captures - have_captures):
            self.warn("used capture $%d (only have %d)" % (c, num_captures))
        for c in sorted(have_captures - used_captures):
            self.warn("unused capture $%d (of %d)" % (c, num_captures))

    def _check_cpe(self, t, i_template, i_lang):
        """Warn if cpe: template t is malformed or disagrees with the i template.

        i_template is the line's first i template or None; i_lang is the
        language name found in it or None.
        """
        try:
            parsedcpe = parse_cpe(t.body)
        except ValueError as errmsg:
            self.warn("can't parse CPE: %s" % errmsg)
            return
        cpe_lang = parsedcpe["language"]
        # If we have a language let's check the i// template.
        if cpe_lang:
            try:
                neededlang = CD2LANG[cpe_lang]
            except KeyError:
                self.warn("unknown language code \"%s\" in CPE: %s/%s/" % (cpe_lang, t.type, t.body))
                return
            if i_template is None or neededlang not in i_template.body:
                self.warn("CPE has language code \"%s\" but i template is missing \"%s\""  % (cpe_lang, neededlang))
        if i_lang is not None and LANG2CD[i_lang] != cpe_lang:
            self.warn("i template has language \"%s\" but cpe:/%s/ is missing \"%s\"" % (i_lang, t.body, LANG2CD[i_lang]))

    def tidy(self):
        """Drop the templates selected by --strip and sort the rest."""
        # Remove templates specified by --strip.
        self.templates = [t for t in self.templates if t.type not in options.strip_types]
        # Put in order.
        self.templates.sort()

    def lookup_first_template(self, type):
        """Find the first template of the given type, or None if no match exists."""
        for t in self.templates:
            if t.type == type:
                return t
        return None

    def __str__(self):
        # Join all the parts at once so that a line without templates does not
        # end in a trailing space.
        return " ".join([self.match, self.service] + [str(t) for t in self.templates])

def parse_line(lineno, line):
    """Wrap line in a MatchLine if it starts with "match " or "softmatch ",
    otherwise in a MiscLine."""
    is_match = line.startswith(("match ", "softmatch "))
    line_class = MatchLine if is_match else MiscLine
    return line_class(lineno, line)

def parse_cpe(cpe):
    """Split a colon-separated CPE string into a dictionary of named fields.

    The fields are, in order: part, vendor, product, version, update,
    edition, language. Each field is URL-unquoted. Fields missing from the
    end of cpe are returned as empty strings.

    Raises ValueError if cpe has more than seven fields.
    """
    # unquote lives in urllib on Python 2 and in urllib.parse on Python 3.
    try:
        from urllib import unquote
    except ImportError:
        from urllib.parse import unquote

    field_names = ("part", "vendor", "product", "version", "update", "edition", "language")
    fields = [unquote(field) for field in cpe.split(":")]
    if len(fields) > len(field_names):
        raise ValueError("Too many fields in this cpe %s" % cpe)
    # If we don't have all the fields filled in we will add the rest as blanks.
    fields.extend([""] * (len(field_names) - len(fields)))
    # Store them in a dictionary for ease of future lookup.
    return dict(zip(field_names, fields))

def grep_language(maintext):
    """Return a language name from LANG2CD that occurs in maintext, or None.

    When several names occur, the one that comes first when iterating over
    LANG2CD is returned.
    """
    found = [language for language in LANG2CD if language in maintext]
    if found:
        return found[0]
    return None

    
# Process the input one line at a time: parse, warn, tidy, and (unless
# --dry-run was given) print the tidied line. The finally clause closes the
# input even when parsing or checking a line raises.
try:
    for lineno, line in enumerate(input_file, 1):
        entry = parse_line(lineno, line.strip())
        entry.check()
        entry.tidy()
        if not options.dry_run:
            # write() instead of the print statement so this parses on Python 3 too.
            sys.stdout.write(str(entry) + "\n")
finally:
    input_file.close()