[tor-commits] [trunnel/master] Neat feature to help seed fuzzers

nickm at torproject.org nickm at torproject.org
Fri Dec 30 03:48:51 UTC 2016


commit 1e47a583f7b7ad1cd1f6b24289c1a6f1cd0a9a73
Author: Nick Mathewson <nickm at torproject.org>
Date:   Thu Dec 29 20:53:49 2016 -0500

    Neat feature to help seed fuzzers
    
    To run it, write a trunnel description for what you want to fuzz,
    and run python -m trunnel.SeedFuzzer foo.trunnel .  The
    subdirectories of "fuzzing-inputs" will fill up with strings that
    conform to that description, suitable for consumption by afl or
    libfuzzer.
---
 lib/trunnel/CodeGen.py    |   1 +
 lib/trunnel/Grammar.py    |   2 +
 lib/trunnel/SeedFuzzer.py | 549 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 552 insertions(+)

diff --git a/lib/trunnel/CodeGen.py b/lib/trunnel/CodeGen.py
index ba33287..35073f9 100644
--- a/lib/trunnel/CodeGen.py
+++ b/lib/trunnel/CodeGen.py
@@ -266,6 +266,7 @@ class Checker(ASTVisitor):
         self.structUses[sd.name] = set(sd.contextList)
         self.structUsesContexts[sd.name] = set(sd.contextList)
         sd.visitChildren(self)
+        sd.constrainedIntFields = set(self.structIntFieldUsage.keys())
         self.structFieldNames = None
         self.structIntFieldNames = None
         self.structIntFieldUsage = None
diff --git a/lib/trunnel/Grammar.py b/lib/trunnel/Grammar.py
index 25dc53a..3608122 100644
--- a/lib/trunnel/Grammar.py
+++ b/lib/trunnel/Grammar.py
@@ -247,6 +247,8 @@ class StructDecl(AST):
     #     for every field that is used as the length of a SMLenConstrained
     #   has_leftover_field -- boolean: true iff this struct contains
     #     an SMLenConstrained.
+    #   constrainedIntFields -- set: names of integer fields that
+    #     are referenced elsewhere in the structure.
 
     def __init__(self, name, members, contextList=(), isContext=False):
         self.name = name
diff --git a/lib/trunnel/SeedFuzzer.py b/lib/trunnel/SeedFuzzer.py
new file mode 100644
index 0000000..369eb88
--- /dev/null
+++ b/lib/trunnel/SeedFuzzer.py
@@ -0,0 +1,549 @@
+#!/usr/bin/python
+"""Use a trunnel input file to generate examples of that file for
+   fuzzing.
+
+    Here's the strategy:
+
+      First, sort all the types topologically so that we consider
+      every type before any type that depends on it.
+
+      Then, we iterate over each type to make examples of it.  We do
+      a recursive descent on the syntax tree, yielding a sequence of
+      (entry, constraint) tuples.  The "entry" item is a list whose
+      members are bytestrings or NamedInt objects. The "constraint" item
+      is an instance of Constraints that describes which NamedInt entries
+      must have certain values.
+
+      As we handle each (entry,constraint) tuple, we replace each
+      NamedInt value in the entry with its constrained value, then merge
+      the parts of the entry together.  If we haven't seen it before for
+      this type, we save it to disk.
+
+      To avoid combinatorial explosions, we limit the fan-out for each
+      step, and choose different combinatoric strategies depending
+      on the number of items to be considered at once.
+"""
+
+
+import trunnel.CodeGen
+import trunnel.Grammar
+
+import os
+import hashlib
+import random
+
+
+class Constraints(object):
+    """A Constraints object represents a set of constraints on named integer
+       values.  It may also represent a 'failed constraint', which is
+       impossible to satisfy.  This base class is abstract.
+    """
+    def __init__(self):
+        pass
+
+    def isFailed(self):
+        """Return true iff this constraint is unsatisfiable."""
+        return False
+
+    def add(self, k, v):
+        """Return a new constraint made by adding the constraint "k=v" to this
+           constraint.  Subclasses must override this method.
+        """
+        raise NotImplementedError()
+
+    def merge(self, other):
+        """Return a (maybe) new constraint made by adding all the constraints
+           in 'other' to this constraint.  Subclasses must override this."""
+        raise NotImplementedError()
+
+    def apply(self, item):
+        """Given an object that might be a NamedInt or a byte sequence, return
+           a byte sequence obtained by applying this constraint to
+           that item.  Byte sequences are returned unchanged.
+        """
+        if isinstance(item, NamedInt):
+            return item.apply(self)
+        return item
+
+    def getConstraint(self, name):
+        """Return the (integer) value that the integer field 'name'
+           must have, or None if there is no such constraint.
+        """
+        return None
+
+
+class NoConstraints(Constraints):
+    """The empty constraint set: no named integer is pinned to a value.
+       Use the NIL singleton rather than instantiating this class again.
+    """
+    def __init__(self):
+        super(NoConstraints, self).__init__()
+
+    def add(self, k, v):
+        # Adding "k=v" to the empty set gives a one-entry
+        # SomeConstraints holding just that pair.
+        return SomeConstraints({k: v})
+
+    def merge(self, other):
+        # Merging anything into the empty set gives that thing back.
+        return other
+
+
+NIL = NoConstraints()
+
+
+class FailedConstraint(Constraints):
+    """Represents an unsatisfiable constraint, probably created by setting
+       the same integer to two incompatible values.  Use FAILED."""
+    def __init__(self):
+        Constraints.__init__(self)
+
+    def isFailed(self):
+        return True
+
+    def add(self, k, v):
+        # Failed can't become any more failed
+        return self
+
+    def merge(self, other):
+        # Failed can't become any more failed
+        return self
+
+    def apply(self, item):
+        # Never valid on a failed constraint; raise even under "python -O".
+        raise AssertionError("apply() called on a failed constraint")
+
+
+FAILED = FailedConstraint()
+
+
+class SomeConstraints(Constraints):
+    """A set of one or more "name=value" constraints, stored in a dict that
+       maps each constrained integer's name to its required value.
+    """
+    def __init__(self, d):  # Owns reference to d!
+        super(SomeConstraints, self).__init__()
+        self._d = d
+
+    def add(self, k, v):
+        if k not in self._d:
+            # No earlier value for k: extend a copy, so that this
+            # (possibly shared) object is never modified.
+            extended = dict(self._d)
+            extended[k] = v
+            return SomeConstraints(extended)
+
+        if self._d[k] != v:
+            # k is already pinned to a different value: unsatisfiable.
+            return FAILED
+        # Same value as before, so reuse this object.
+        return self
+
+    def merge(self, other):
+        # Delegate when 'other' is NIL or FAILED (they have simple merge
+        # rules), or when 'other' is the smaller dict: the loop below
+        # costs O(len(self._d)), so it should run on the shorter side.
+        if (not isinstance(other, SomeConstraints) or
+                len(other._d) < len(self._d)):
+            return other.merge(self)
+
+        combined = dict(self._d)
+        combined.update(other._d)
+        # After update(), any key whose value changed was claimed with a
+        # different value by 'other'.
+        for name, required in self._d.iteritems():
+            if combined[name] != required:
+                return FAILED
+        return SomeConstraints(combined)
+
+    def getConstraint(self, name):
+        # None when 'name' is unconstrained.
+        return self._d.get(name)
+
+
+def constrain(k, v):
+    """Return NIL when k is None; otherwise the constraint "k=v"."""
+    if k is not None:
+        return SomeConstraints({k: v})
+    return NIL
+
+
+class NamedInt(object):
+    """Represents an integer object with a name whose value (maybe)
+       depends on some other part of the structure.  'width' is in bytes.
+    """
+    def __init__(self, name, width, val=None):
+        self._name = name
+        self._width = width
+        self._val = val
+
+    def withVal(self, val):
+        assert self._val is None
+        return NamedInt(self._name, self._width, val)
+
+    def __len__(self):
+        return self._width
+
+    def apply(self, constraints):
+        val = constraints.getConstraint(self._name)
+        if val is None:
+            val = self._val
+        if val is None:
+            # We expected to have some constraint on this value, but we
+            # didn't.  How about 3? 3 is a nice number.
+            val = 3
+        # Encode val big-endian (high byte first) in width bytes.
+        return b"".join(chr((val >> (8 * (self._width - i))) & 0xff)
+                        for i in xrange(1, self._width+1))
+
+
+def findLength(lst):
+    """Return the combined length of every bytestring and NamedInt in
+       lst (a NamedInt counts as its width).
+    """
+    return sum(map(len, lst))
+
+
+def combineExamples(grp, n, maximum=256):
+    """Given a sequence of examples, yield up to 'maximum' values built
+       by concatenating n items from the sequence (chosen with
+       replacement).
+
+       When there are at most 'maximum' possible combinations, yield
+       every one of them.  Otherwise, yield 'maximum' combinations
+       whose items are picked at random.
+    """
+    if len(grp) ** n <= maximum:
+        # Few enough combinations to enumerate all of them.
+        for combined in combineExhaustively(grp, n):
+            yield combined
+        return
+
+    # Too many combinations to enumerate, so draw 'maximum' random
+    # samples instead.
+    for _ in xrange(maximum):
+        picks = [random.choice(grp) for _j in xrange(n)]
+        yield b"".join(picks)
+
+
+def combineExhaustively(grp, n):
+    """Yield each bytestring that is a concatenation of n members of
+       grp, allowing repeats."""
+    if n == 0:
+        yield b""
+        return
+    for first in grp:
+        if n == 1:
+            yield first
+            continue
+        for tail in combineExhaustively(grp, n-1):
+            yield first + tail
+
+
+def crossProduct(lol):
+    """Yield the cross-product of a list of lists of (entry, constraint)
+       pairs.  Combinations whose merged constraints fail are skipped."""
+    if len(lol) == 0:
+        return
+    first, others = lol[0], lol[1:]
+    if not others:
+        for item, constraint in first:
+            yield item, constraint
+        return
+    for item, constraint in first:
+        for irest, crest in crossProduct(others):
+            merged = constraint.merge(crest)
+            if not merged.isFailed():
+                yield item + irest, merged
+
+
+def explore(lol):
+    """As crossProduct, but for cases where we face a much more
+       combinatorially intense list of lists.  Here we consider the
+       inputs position by position: each position in turn varies over
+       all of its values, while every other position takes the first
+       value that lets the combined constraints be met.
+
+       For example, if the lists had members (a), (x,y,z), (1,2,3), and no
+       constraints, we'd yield: ax1, ax1, ay1, az1, ax1, ax2, ax3.
+    """
+    if len(lol) == 1:
+        # Only one position: there is nothing to hold fixed.
+        for item, constraint in lol[0]:
+            yield item, constraint
+        return
+    # With zero positions the loop below runs no times.
+    for idx in xrange(len(lol)):
+        for item, constraint in exploreAt(lol, idx):
+            yield item, constraint
+
+
+def findComplying(lol, c):
+    """Find a single value from among crossproduct(lol) complying with c.
+       Return that value and its combined constraints, or ([], FAILED)
+       if no combination complies."""
+    if len(lol) == 0:
+        return [], c
+
+    for i, c2 in lol[0]:
+        cboth = c.merge(c2)
+        if cboth.isFailed():
+            continue
+        rest, call = findComplying(lol[1:], cboth)
+        if not call.isFailed():
+            return i + rest, call  # keep this position's entry in front
+
+    return [], FAILED
+
+
+def exploreAt(lol, idx):
+    """Helper for explore: vary position idx over all of its values,
+       filling the other positions via findComplying."""
+    for item, constraint in lol[idx]:
+        # Settle earlier positions first, then later ones, threading
+        # the accumulated constraints through both searches.
+        pre, c = findComplying(lol[:idx], constraint)
+        post, c2 = findComplying(lol[idx+1:], c)
+        yield pre + item + post, c2
+
+
+def take_n(iterator, n):
+    """Yield at most the first n items of iterator, without pulling
+       item n+1 from it."""
+    if n <= 0:
+        return
+    for so_far, item in enumerate(iterator, 1):
+        yield item
+        if so_far >= n:
+            return
+
+
+class CorpusGenerator(trunnel.CodeGen.ASTVisitor):
+    # target_dir -- directory that gets one subdirectory per structure
+    # sort_order -- topologically sorted list of structure names
+    # structExamples -- map from structure name to the list of example
+    #   bytestrings that we generated for that structure, shortest first
+    # _expandConst -- helper function that knows how to map constant
+    #   names to integers.
+    # _maxFanout -- used to limit the branching factor when running
+    #   combinatorially intense generators.
+    # _maxExamples -- maximum number of distinct examples to generate
+    #   for each structure
+    # _maxCombinatorics -- when building long sequences, we try a cross-product
+    #   approach when it would generate fewer than this many entries.
+    #   Otherwise, we try an alternative approach; see explore().
+    def __init__(self, target_dir):
+        trunnel.CodeGen.ASTVisitor.__init__(self)
+        self.target_dir = target_dir
+        self.structExamples = {}
+        self._maxFanout = 128
+        self._maxCombinatorics = 1024
+        self._maxExamples = 1024
+        self._constrainedIntFieldNames = None
+        self._strictFail = False   # if true, 'fail' members yield no examples
+
+    def setChecker(self, ch):  # take struct order and constant lookup from ch
+        self.sort_order = ch.sortedStructs
+        self._expandConst = ch.expandConstant
+
+    def expandConst(self, v):
+        """If v is a constant name, expand it.  Otherwise return v."""
+        if isinstance(v, str):
+            return self._expandConst(v)
+        else:
+            return v
+
+    def visitFile(self, f):
+        f.visitChildrenSorted(self.sort_order, self)
+
+    def visitConstDecl(self, cd):  # constants produce no examples
+        pass
+
+    def visitStructDecl(self, sd):  # write distinct examples of sd to disk
+        self._constrainedIntFieldNames = sd.constrainedIntFields
+        target = os.path.join(self.target_dir, sd.name)
+        if not os.path.exists(target):
+            os.makedirs(target)
+        examples = set()
+        for item in self.enumerateStructValues(sd):
+            if item in examples:
+                continue
+            digest = hashlib.sha256(item).hexdigest()  # file name = content hash
+            fname = os.path.join(target, digest)
+            print fname
+            with open(fname, 'wb') as f:
+                f.write(item)
+            examples.add(item)
+            if len(examples) >= self._maxExamples:
+                break
+        self.structExamples[sd.name] = sorted(examples, key=len)
+        self._constrainedIntFieldNames = None
+
+    def enumerateStructValues(self, sd):
+        """Helper: yields bytestrings that match a StructDecl."""
+        for members, constraints in self.visitListOfMembers(sd.members):
+            if constraints.isFailed():
+                continue
+            result = b"".join(constraints.apply(m) for m in members)
+            yield result
+
+    def visitSMInteger(self, smi):  # yields boundary values of the integer
+        width = smi.inttype.width
+        ni = NamedInt(smi.name, width // 8)
+        if smi.name in self._constrainedIntFieldNames:
+            # Left unset here; NamedInt.apply fills it in from a constraint.
+            yield [ni], NIL
+        elif smi.constraints is None:
+            yield [ni.withVal(0)], NIL
+            yield [ni.withVal((1L << width) - 1)], NIL
+        else:
+            for lo, hi in smi.constraints.ranges:
+                lo = self.expandConst(lo)
+                hi = self.expandConst(hi)
+                yield [ni.withVal(lo)], NIL
+                if lo != hi:
+                    yield [ni.withVal(hi)], NIL
+
+    def visitListOfMembers(self, members):  # combine per-member examples
+        results = []
+        n_vals = 1
+        for m in members:
+            results.append(list(take_n(self.visit(m), self._maxFanout)))
+            n_vals *= len(results[-1])
+        if n_vals < self._maxCombinatorics:
+            for i, c in crossProduct(results):
+                yield i, c
+        else:
+            for i, c in explore(results):
+                yield i, c
+
+        # if len(members) == 0:
+        #     return
+        # elif len(members) == 1:
+        #     for i, c in take_n(self.visit(members[0]), self._maxFanout):
+        #         yield i, c
+        #     return
+
+        # for i, c in take_n(self.visit(members[0]), self._maxFanout):
+        #     for irest, crest in self.visitListOfMembers(members[1:]):
+        #         c2 = c.merge(crest)
+        #         if not c2.isFailed():
+        #             yield i + irest, c2
+
+    def visitSMStruct(self, sms):  # reuse examples already made for the struct
+        for e in self.structExamples[sms.structname][:self._maxFanout]:
+            yield [e], NIL
+
+    def visitSMString(self, sms):  # three NUL-terminated strings
+        yield [b"\0"], NIL
+        yield [b"a\0"], NIL
+        yield [b"abc\0"], NIL
+
+    def visitSMFixedArray(self, sma):
+        w = self.expandConst(sma.width)
+        if type(sma.basetype) == str:
+            examples = self.structExamples[sma.basetype]
+            for e in combineExamples(examples, w, self._maxFanout):
+                yield [e], NIL
+        elif str(sma.basetype) == 'char':
+            yield [b"x"*w], NIL
+            yield [b"\xff"*w], NIL
+        else:
+            bitwidth = sma.basetype.width
+            nbytes = w * (bitwidth // 8)
+            yield [b"\0"*nbytes], NIL
+            yield [b"\xff"*nbytes], NIL
+
+    def visitSMVarArray(self, smva):  # arrays of 0, 1 and 2 elements
+        widthfield = smva.widthfield
+        if type(smva.basetype) == str:
+            examples = self.structExamples[smva.basetype]
+            yield [b""], constrain(widthfield, 0)
+            c = constrain(widthfield, 1)
+            for e in examples[:self._maxFanout]:
+                yield [e], c
+            c = constrain(widthfield, 2)
+            for e in combineExamples(examples, 2, self._maxFanout):
+                yield [e], c
+        elif str(smva.basetype) == 'char':
+            yield [b""], constrain(widthfield, 0)
+            yield [b"h"], constrain(widthfield, 1)
+            yield [b"hi"], constrain(widthfield, 2)
+        else:
+            w = smva.basetype.width // 8
+            yield [b""], constrain(widthfield, 0)
+            yield [b"\x00"*w], constrain(widthfield, 1)
+            yield [b"\x00"*w*2], constrain(widthfield, 2)
+
+    def visitSMLenConstrained(self, smlc):
+        varname = smlc.lengthfield
+        assert len(smlc.members) == 1   # XXX limitation
+        for item, constraints in self.visit(smlc.members[0]):
+            c = constraints.add(varname, findLength(item))  # pin length field
+            if not c.isFailed():
+                yield item, c
+
+    def visitSMUnion(self, smu):
+        tagfield = smu.tagfield
+        for m in smu.members:
+            for item, constraints in take_n(
+                    self.visitListOfMembers(m.decls), self._maxFanout):
+                if m.is_default:
+                    c = constraints
+                else:
+                    oneval = m.tagvalue[0][0]  # first listed tag value only
+                    c = constraints.add(tagfield, self.expandConst(oneval))
+                if not c.isFailed():
+                    yield item, c
+
+    def visitSMFail(self, x):
+        if self._strictFail:
+            return
+        else:
+            yield [b""], NIL
+
+    def visitSMEos(self, x):
+        yield [b""], NIL
+
+    def visitSMIgnore(self, x):
+        yield [b""], NIL
+        yield [b"bla"], NIL
+
+    def visitSMPosition(self, x):
+        yield [b""], NIL
+
+
+def generate_corpus(input_fnames, target_dir):
+    """Write fuzzing examples for each trunnel file named in input_fnames."""
+    generator = CorpusGenerator(target_dir)
+    for input_fname in input_fnames:
+        # 'with' closes the file even if reading or tokenizing fails.
+        with open(input_fname, 'r') as inp:
+            t = trunnel.Grammar.Lexer().tokenize(inp.read())
+        parsed = trunnel.Grammar.Parser().parse(t)
+        c = trunnel.CodeGen.Checker()
+        c.visit(parsed)
+
+        generator.setChecker(c)
+        generator.visit(parsed)
+
+
+if __name__ == '__main__':
+    import getopt
+    import sys
+
+    # -o DIR / --output-dir=DIR chooses where the examples are written;
+    # every remaining argument is a trunnel input file.
+    opts, args = getopt.gnu_getopt(sys.argv[1:], "o:", ["output-dir="])
+
+    target_dir = "fuzzing-inputs"
+    for flag, value in opts:
+        if flag == "-o" or flag == "--output-dir":
+            target_dir = value
+
+    if not args:
+        sys.stderr.write("Syntax: python -m trunnel.SeedFuzzer [-o <dir>] "
+                         "<fname...>\n")
+        sys.exit(1)
+
+    generate_corpus(args, target_dir)



More information about the tor-commits mailing list