[tor-commits] [ooni-probe/master] Implement Eigenvector based classifier

art at torproject.org art at torproject.org
Sun Aug 19 07:49:43 UTC 2012


commit 346cd4e6cd1b028bb5f772b1a0bf15c2b2e12ae3
Author: Arturo Filastò <hellais at torproject.org>
Date:   Sun Aug 19 00:48:03 2012 -0700

    Implement Eigenvector based classifier
    This allows creation of databases containing the feature sets of websites to be
    monitored for censorship. If we find that the page being loaded does not have
    similar eigenvectors to the original page, it is probable that we are looking
    at a block page.
---
 ooni/plugins/daphn3.py   |    2 +-
 ooni/plugins/domclass.py |  133 ++++++++++++++++++++++++++++++++++++++++++++++
 ooni/protocols/http.py   |    7 ++-
 3 files changed, 138 insertions(+), 4 deletions(-)

diff --git a/ooni/plugins/daphn3.py b/ooni/plugins/daphn3.py
index 6911623..de3df26 100644
--- a/ooni/plugins/daphn3.py
+++ b/ooni/plugins/daphn3.py
@@ -84,7 +84,7 @@ class daphn3Test(OONITest):
         if not self.local_options:
             self.end()
             return
-        #pass
+
         self.factory = Daphn3ClientFactory()
         self.factory.test = self
 
diff --git a/ooni/plugins/domclass.py b/ooni/plugins/domclass.py
new file mode 100644
index 0000000..5c9b6fb
--- /dev/null
+++ b/ooni/plugins/domclass.py
@@ -0,0 +1,133 @@
+"""
+This is a self-generated test created by scaffolding.py.
+You will need to fill it in with everything your test requires.
+Safe hacking :).
+"""
+from zope.interface import implements
+from twisted.python import usage
+from twisted.plugin import IPlugin
+from ooni.plugoo.tests import ITest, OONITest
+from ooni.plugoo.assets import Asset
+from ooni.utils import log
+from ooni.protocols.http import HTTPTest
+
+class domclassArgs(usage.Options):
+    # Each row is [long name, short flag, default value, help text].
+    optParameters = [
+        ['output', 'o', None, 'Output to write'], ['file', 'f', None, 'Corpus file'],
+        ['fileb', 'b', None, 'Corpus file'], ['asset', 'a', None, 'URL List'],
+        ['resume', 'r', 0, 'Resume at this index']]
+# Upper-case HTML tag names; only thetags is used by compute_matrix().
+alltags = ['A', 'ABBR', 'ACRONYM', 'ADDRESS', 'APPLET', 'AREA', 'B', 'BASE',
+           'BASEFONT', 'BDO', 'BIG', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
+           'CENTER', 'CITE', 'CODE', 'COL', 'COLGROUP', 'DD', 'DEL', 'DFN', 'DIR', 'DIV',
+           'DL', 'DT', 'EM', 'FIELDSET', 'FONT', 'FORM', 'FRAME', 'FRAMESET', 'H1', 'H2',
+           'H3', 'H4', 'H5', 'H6', 'HEAD', 'HR', 'HTML', 'I', 'IFRAME', 'IMG',
+           'INPUT', 'INS', 'ISINDEX', 'KBD', 'LABEL', 'LEGEND', 'LI', 'LINK', 'MAP',
+           'MENU', 'META', 'NOFRAMES', 'NOSCRIPT', 'OBJECT', 'OL', 'OPTGROUP', 'OPTION',
+           'P', 'PARAM', 'PRE', 'Q', 'S', 'SAMP', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN',
+           'STRIKE', 'STRONG', 'STYLE', 'SUB', 'SUP', 'TABLE', 'TBODY', 'TD',
+           'TEXTAREA', 'TFOOT', 'TH', 'THEAD', 'TITLE', 'TR', 'TT', 'U', 'UL', 'VAR']
+
+commontags = ['A', 'B', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
+           'CENTER', 'CITE', 'CODE', 'COL', 'DD', 'DIV',
+           'DL', 'DT', 'EM', 'FIELDSET', 'FONT', 'FORM', 'FRAME', 'FRAMESET', 'H1', 'H2',
+           'H3', 'H4', 'H5', 'H6', 'HEAD', 'HR', 'HTML', 'IFRAME', 'IMG',
+           'INPUT', 'INS', 'LABEL', 'LEGEND', 'LI', 'LINK', 'MAP',
+           'MENU', 'META', 'NOFRAMES', 'NOSCRIPT', 'OBJECT', 'OL', 'OPTION',
+           'P', 'PRE', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN',
+           'STRIKE', 'STRONG', 'STYLE', 'SUB', 'SUP', 'TABLE', 'TBODY', 'TD',
+           'TEXTAREA', 'TFOOT', 'TH', 'THEAD', 'TITLE', 'TR', 'TT', 'U', 'UL']
+
+# thetags: tags that get their own row/column in compute_matrix(); all other
+# tags share one catch-all slot. Entries must be upper case without padding
+# because they are compared with tag.upper() ('IFRAME ' never matched).
+thetags = ['A', 'DIV', 'FRAME', 'H1', 'H2', 'H3', 'H4', 'IFRAME', 'INPUT',
+           'LABEL', 'LI', 'P', 'SCRIPT', 'SPAN', 'STYLE', 'TR']
+
+def compute_matrix(dataset):
+    """
+    Build the parent->child tag co-occurrence matrix for a DOM and return it
+    together with its eigenvalues.
+
+    dataset: iterable of (parent_tag, child_tag) name pairs, in any case.
+
+    Returns a dict with 'matrix' (a square numpy array with one row/column
+    per entry of thetags plus a final catch-all for every other tag) and
+    'eigen' (numpy.linalg.eigvals of that matrix).
+    """
+    import numpy
+    other = len(thetags)
+    # Build the lookup once instead of scanning thetags for every pair.
+    index = dict((tag, i) for i, tag in enumerate(thetags))
+    matrix = numpy.zeros((other + 1, other + 1))
+
+    for pair in dataset:
+        # Tags we do not track share the last row/column.
+        row = index.get(pair[0].upper(), other)
+        col = index.get(pair[1].upper(), other)
+        matrix[row, col] += 1
+    return {'matrix': matrix, 'eigen': numpy.linalg.eigvals(matrix)}
+
+def readDOM(fn):
+    """
+    Parse markup and list the (parent tag, tag) name pair of every element.
+
+    fn is passed to BeautifulSoup as-is, i.e. it is treated as the markup
+    itself; no file is opened here. Returns a list of 2-tuples of str.
+    """
+    from bs4 import BeautifulSoup
+    soup = BeautifulSoup(fn)
+    return [(str(node.parent.name), str(node.name))
+            for node in soup.findAll()]
+
+class domclassTest(HTTPTest):
+    """Records the DOM tag-structure eigenvalues of fetched pages."""
+    implements(IPlugin, ITest)
+
+    shortName = "domclass"
+    description = "domclass"
+    requirements = None
+    options = domclassArgs
+    blocking = False
+
+    def runTool(self):
+        """Compare the two corpus files given with --file and --fileb."""
+        import numpy
+        # readDOM() takes markup, not a path, so load the files here.
+        with open(self.local_options['file']) as f:
+            data = f.read()
+        with open(self.local_options['fileb']) as f:
+            data_b = f.read()
+        a = compute_matrix(readDOM(data))
+        b = compute_matrix(readDOM(data_b))
+        self.result['eigenvalues'] = str(a['eigen'])
+        self.result['matrix'] = str(a['matrix'])
+        self.result['content'] = data[:200]
+        print "A: %s" % a
+        print "B: %s" % b
+        # Normalised dot product of the eigenvalue vectors, then (x + 1) / 2.
+        correlation = numpy.vdot(a['eigen'],b['eigen'])
+        correlation /= numpy.linalg.norm(a['eigen'])*numpy.linalg.norm(b['eigen'])
+        correlation = (correlation + 1)/2
+        print "Corelation: %s" % correlation
+
+    def processResponseBody(self, data):
+        """
+        Store the eigenvalues and tag matrix of data in self.result (both are
+        None for an empty body) and return the compute_matrix() dict.
+        """
+        a = compute_matrix(readDOM(data))
+
+        if len(data) == 0:
+            self.result['eigenvalues'] = None
+            self.result['matrix'] = None
+        else:
+            self.result['eigenvalues'] = str(a['eigen'])
+            self.result['matrix'] = str(a['matrix'])
+        print "A: %s" % a
+        return a
+
+# getPlugins only detects plugin instances, so one must be created at import
+# time. XXX Find a way to load plugins without instantiating them.
+domclass = domclassTest(None, None, None)
diff --git a/ooni/protocols/http.py b/ooni/protocols/http.py
index d5573b3..5254a5c 100644
--- a/ooni/protocols/http.py
+++ b/ooni/protocols/http.py
@@ -51,7 +51,7 @@ class HTTPTest(OONITest):
 
     def _processResponseBody(self, data):
         self.response['body'] = data
-        self.result['response'] = self.response
+        #self.result['response'] = self.response
         self.processResponseBody(data)
 
     def processResponseBody(self, data):
@@ -101,7 +101,7 @@ class HTTPTest(OONITest):
         if str(self.response['code']).startswith('3'):
             self.processRedirect(response.headers.getRawHeaders('Location')[0])
         self.processResponseHeaders(self.response['headers'])
-        self.result['response'] = self.response
+        #self.result['response'] = self.response
 
         finished = defer.Deferred()
         response.deliverBody(BodyReceiver(finished))
@@ -119,7 +119,8 @@ class HTTPTest(OONITest):
         if self.randomize_ua:
             self.randomize_useragent()
 
-        self.result['request'] = self.request
+        #self.result['request'] = self.request
+        self.result['url'] = url
         return self.agent.request(self.request['method'], self.request['url'],
                                   Headers(self.request['headers']),
                                   self.request['body'])



More information about the tor-commits mailing list