commit 346cd4e6cd1b028bb5f772b1a0bf15c2b2e12ae3
Author: Arturo Filastò <hellais(a)torproject.org>
Date: Sun Aug 19 00:48:03 2012 -0700
Implement Eigenvector based classifier
This allows creation of databases containing the feature sets of websites to be
monitored for censorship. If we find that the page being loaded does not have
similar eigenvectors to the original page, it is probable that we are in front
of a block page.
---
ooni/plugins/daphn3.py | 2 +-
ooni/plugins/domclass.py | 133 ++++++++++++++++++++++++++++++++++++++++++++++
ooni/protocols/http.py | 7 ++-
3 files changed, 138 insertions(+), 4 deletions(-)
diff --git a/ooni/plugins/daphn3.py b/ooni/plugins/daphn3.py
index 6911623..de3df26 100644
--- a/ooni/plugins/daphn3.py
+++ b/ooni/plugins/daphn3.py
@@ -84,7 +84,7 @@ class daphn3Test(OONITest):
if not self.local_options:
self.end()
return
- #pass
+
self.factory = Daphn3ClientFactory()
self.factory.test = self
diff --git a/ooni/plugins/domclass.py b/ooni/plugins/domclass.py
new file mode 100644
index 0000000..5c9b6fb
--- /dev/null
+++ b/ooni/plugins/domclass.py
@@ -0,0 +1,133 @@
+"""
+This is a self-generated test created by scaffolding.py.
+You will need to fill it up with all your necessities.
+Safe hacking :).
+"""
+from zope.interface import implements
+from twisted.python import usage
+from twisted.plugin import IPlugin
+from ooni.plugoo.tests import ITest, OONITest
+from ooni.plugoo.assets import Asset
+from ooni.utils import log
+from ooni.protocols.http import HTTPTest
+
+class domclassArgs(usage.Options):
+ optParameters = [['output', 'o', None, 'Output to write'],
+ ['file', 'f', None, 'Corpus file'],
+ ['fileb', 'b', None, 'Corpus file'],
+ ['asset', 'a', None, 'URL List'],
+ ['resume', 'r', 0, 'Resume at this index']]
+alltags = ['A', 'ABBR', 'ACRONYM', 'ADDRESS', 'APPLET', 'AREA', 'B', 'BASE',
+ 'BASEFONT', 'BD', 'BIG', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
+ 'CENTER', 'CITE', 'CODE', 'COL', 'COLGROUP', 'DD', 'DEL', 'DFN', 'DIR', 'DIV',
+ 'DL', 'DT', 'E M', 'FIELDSET', 'FONT', 'FORM', 'FRAME', 'FRAMESET', 'H1', 'H2',
+ 'H3', 'H4', 'H5', 'H6', 'HEAD', 'HR', 'HTML', 'I', 'IFRAME ', 'IMG',
+ 'INPUT', 'INS', 'ISINDEX', 'KBD', 'LABEL', 'LEGEND', 'LI', 'LINK', 'MAP',
+ 'MENU', 'META', 'NOFRAMES', 'NOSCRIPT', 'OBJECT', 'OL', 'OPTGROUP', 'OPTION',
+ 'P', 'PARAM', 'PRE', 'Q', 'S', 'SAMP', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN',
+ 'STRIKE', 'STRONG', 'STYLE', 'SUB', 'SUP', 'TABLE', 'TBODY', 'TD',
+ 'TEXTAREA', 'TFOOT', 'TH', 'THEAD', 'TITLE', 'TR', 'TT', 'U', 'UL', 'VAR']
+
+commontags = ['A', 'B', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
+ 'CENTER', 'CITE', 'CODE', 'COL', 'DD', 'DIV',
+ 'DL', 'DT', 'EM', 'FIELDSET', 'FONT', 'FORM', 'FRAME', 'FRAMESET', 'H1', 'H2',
+ 'H3', 'H4', 'H5', 'H6', 'HEAD', 'HR', 'HTML', 'IFRAME ', 'IMG',
+ 'INPUT', 'INS', 'LABEL', 'LEGEND', 'LI', 'LINK', 'MAP',
+ 'MENU', 'META', 'NOFRAMES', 'NOSCRIPT', 'OBJECT', 'OL', 'OPTION',
+ 'P', 'PRE', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN',
+ 'STRIKE', 'STRONG', 'STYLE', 'SUB', 'SUP', 'TABLE', 'TBODY', 'TD',
+ 'TEXTAREA', 'TFOOT', 'TH', 'THEAD', 'TITLE', 'TR', 'TT', 'U', 'UL']
+
+thetags = ['A',
+ 'DIV',
+ 'FRAME', 'H1', 'H2',
+ 'H3', 'H4', 'IFRAME ', 'INPUT', 'LABEL','LI', 'P', 'SCRIPT', 'SPAN',
+ 'STYLE',
+ 'TR']
+
+def compute_matrix(dataset):
+ import itertools
+ import numpy
+ ret = {}
+ matrix = numpy.zeros((len(thetags) + 1, len(thetags) + 1))
+
+ for data in dataset:
+ x = data[0].upper()
+ y = data[1].upper()
+ try:
+ x = thetags.index(x)
+ except:
+ x = len(thetags)
+
+ try:
+ y = thetags.index(y)
+ except:
+ y = len(thetags)
+
+ matrix[x,y] += 1
+ ret['matrix'] = matrix
+ ret['eigen'] = numpy.linalg.eigvals(matrix)
+ return ret
+
+def readDOM(fn):
+ from bs4 import BeautifulSoup
+ #f = open(fn)
+ #content = ''.join(f.readlines())
+ content = fn
+ dom = BeautifulSoup(content)
+ couples = []
+ for x in dom.findAll():
+ couples.append((str(x.parent.name), str(x.name)))
+ #f.close()
+ return couples
+
+class domclassTest(HTTPTest):
+ implements(IPlugin, ITest)
+
+ shortName = "domclass"
+ description = "domclass"
+ requirements = None
+ options = domclassArgs
+ blocking = False
+
+ def runTool(self):
+ import yaml, numpy
+ site_a = readDOM(self.local_options['file'])
+ site_b = readDOM(self.local_options['fileb'])
+ a = compute_matrix(site_a)
+ self.result['eigenvalues'] = str(a['eigen'])
+ self.result['matrix'] = str(a['matrix'])
+ self.result['content'] = data[:200]
+ b = compute_matrix(site_b)
+ print "A: %s" % a
+ print "B: %s" % b
+ correlation = numpy.vdot(a['eigen'],b['eigen'])
+ correlation /= numpy.linalg.norm(a['eigen'])*numpy.linalg.norm(b['eigen'])
+ correlation = (correlation + 1)/2
+ print "Corelation: %s" % correlation
+
+ def processResponseBody(self, data):
+ import yaml, numpy
+ site_a = readDOM(data)
+ #site_b = readDOM(self.local_options['fileb'])
+ a = compute_matrix(site_a)
+
+ if len(data) == 0:
+ self.result['eigenvalues'] = None
+ self.result['matrix'] = None
+ else:
+ self.result['eigenvalues'] = str(a['eigen'])
+ self.result['matrix'] = str(a['matrix'])
+ #self.result['content'] = data[:200]
+ #b = compute_matrix(site_b)
+ print "A: %s" % a
+ return a
+ #print "B: %s" % b
+ #correlation = numpy.vdot(a['eigen'],b['eigen'])
+ #correlation /= numpy.linalg.norm(a['eigen'])*numpy.linalg.norm(b['eigen'])
+ #correlation = (correlation + 1)/2
+ # print "Corelation: %s" % correlation
+
+# We need to instantiate the test here, at import time, otherwise getPlugins
+# does not detect it.
+# XXX Find a way to load plugins without instantiating them.
+domclass = domclassTest(None, None, None)
diff --git a/ooni/protocols/http.py b/ooni/protocols/http.py
index d5573b3..5254a5c 100644
--- a/ooni/protocols/http.py
+++ b/ooni/protocols/http.py
@@ -51,7 +51,7 @@ class HTTPTest(OONITest):
def _processResponseBody(self, data):
self.response['body'] = data
- self.result['response'] = self.response
+ #self.result['response'] = self.response
self.processResponseBody(data)
def processResponseBody(self, data):
@@ -101,7 +101,7 @@ class HTTPTest(OONITest):
if str(self.response['code']).startswith('3'):
self.processRedirect(response.headers.getRawHeaders('Location')[0])
self.processResponseHeaders(self.response['headers'])
- self.result['response'] = self.response
+ #self.result['response'] = self.response
finished = defer.Deferred()
response.deliverBody(BodyReceiver(finished))
@@ -119,7 +119,8 @@ class HTTPTest(OONITest):
if self.randomize_ua:
self.randomize_useragent()
- self.result['request'] = self.request
+ #self.result['request'] = self.request
+ self.result['url'] = url
return self.agent.request(self.request['method'], self.request['url'],
Headers(self.request['headers']),
self.request['body'])