commit 78fe6dea0b46c4055fbc195cf9329001c062cb17 Author: Arturo Filastò hellais@torproject.org Date: Tue Aug 21 01:51:58 2012 +0200
Write some documentation for DOMClass --- ooni/plugins/domclass.py | 97 ++++++++++++++++++++++++++++++++++----------- ooni/protocols/http.py | 5 ++ 2 files changed, 78 insertions(+), 24 deletions(-)
diff --git a/ooni/plugins/domclass.py b/ooni/plugins/domclass.py index 31e2e41..cdcd508 100644 --- a/ooni/plugins/domclass.py +++ b/ooni/plugins/domclass.py @@ -1,8 +1,33 @@ -""" -This is a self genrated test created by scaffolding.py. -you will need to fill it up with all your necessities. -Safe hacking :). -""" +# +# +# domclass +# ******** +# +# :copyright: (c) 2012 by Arturo Filastò +# :license: see LICENSE for more details. +# +# how this works +# -------------- +# +# This classifier uses the DOM structure of a website to determine how similar +# the two sites are. +# The procedure we use is the following: +# * First we parse all the DOM tree of the web page and we build a list of +# TAG parent child relationships (ex. <html><a><b></b></a><c></c></html> => +# (html, a), (a, b), (html, c)). +# +# * We then use this information to build a matrix (M) where m[i][j] = P(of +# transitioning from tag[i] to tag[j]). If tag[i] does not exist P() = 0. +# Note: M is a square matrix that is number_of_tags wide. +# +# * We then calculate the eigenvectors (v_i) and eigenvalues (e) of M. +# +# * The correlation between page A and B is given via this formula: +# correlation = dot_product(e_A, e_B), where e_A and e_B are +# respectively the eigenvalues for the probability matrix A and the +# probability matrix B. +# + from zope.interface import implements from twisted.python import usage from twisted.plugin import IPlugin @@ -18,6 +43,8 @@ class domclassArgs(usage.Options): ['asset', 'a', None, 'URL List'], ['resume', 'r', 0, 'Resume at this index']]
+# All HTML4 tags +# XXX add link to W3C page where these came from alltags = ['A', 'ABBR', 'ACRONYM', 'ADDRESS', 'APPLET', 'AREA', 'B', 'BASE', 'BASEFONT', 'BD', 'BIG', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION', 'CENTER', 'CITE', 'CODE', 'COL', 'COLGROUP', 'DD', 'DEL', 'DFN', 'DIR', 'DIV', @@ -29,6 +56,7 @@ alltags = ['A', 'ABBR', 'ACRONYM', 'ADDRESS', 'APPLET', 'AREA', 'B', 'BASE', 'STRIKE', 'STRONG', 'STYLE', 'SUB', 'SUP', 'TABLE', 'TBODY', 'TD', 'TEXTAREA', 'TFOOT', 'TH', 'THEAD', 'TITLE', 'TR', 'TT', 'U', 'UL', 'VAR']
+# Reduced subset of only the most common tags commontags = ['A', 'B', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION', 'CENTER', 'CITE', 'CODE', 'COL', 'DD', 'DIV', 'DL', 'DT', 'EM', 'FIELDSET', 'FONT', 'FORM', 'FRAME', 'FRAMESET', 'H1', 'H2', @@ -39,12 +67,18 @@ commontags = ['A', 'B', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION', 'STRIKE', 'STRONG', 'STYLE', 'SUB', 'SUP', 'TABLE', 'TBODY', 'TD', 'TEXTAREA', 'TFOOT', 'TH', 'THEAD', 'TITLE', 'TR', 'TT', 'U', 'UL']
+# The tags we are interested in using for our analysis thetags = ['A', 'DIV', 'FRAME', 'H1', 'H2', 'H3', 'H4', 'IFRAME ', 'INPUT', 'LABEL','LI', 'P', 'SCRIPT', 'SPAN', 'STYLE', 'TR']
-def compute_matrix(dataset): +def compute_probability_matrix(dataset): + """ + Compute the probability matrix based on the input dataset. + + :dataset: an array of pairs representing the parent child relationships. + """ import itertools import numpy ret = {} @@ -74,21 +108,38 @@ def compute_matrix(dataset): if possibilities != 0: matrix[x][i] = matrix[x][i]/possibilities
- ret['matrix'] = matrix - ret['eigen'] = numpy.linalg.eigvals(matrix) - return ret + return matrix + +def compute_eigenvalues(matrix): + """ + Returns the eigenvalues of the supplied square matrix. + + :matrix: must be a square matrix and diagonalizable. + """ + return numpy.linalg.eigvals(matrix)
def readDOM(content=None, filename=None): + """ + Parses the DOM of the HTML page and returns an array of parent, child + pairs. + + :content: the content of the HTML page to be read. + + :filename: the filename to be read from for getting the content of the + page. + """ from bs4 import BeautifulSoup + if filename: f = open(filename) content = ''.join(f.readlines()) + f.close()
dom = BeautifulSoup(content) couples = [] for x in dom.findAll(): couples.append((str(x.parent.name), str(x.name))) - #f.close() + return couples
class domclassTest(HTTPTest): @@ -100,17 +151,20 @@ class domclassTest(HTTPTest): options = domclassArgs blocking = False
- tool = True + follow_redirects = True + #tool = True
def runTool(self): import yaml, numpy site_a = readDOM(filename=self.local_options['file']) site_b = readDOM(filename=self.local_options['fileb']) - a = compute_matrix(site_a) + a['matrix'] = compute_probability_matrix(site_a) + a['eigen'] = compute_eigenvalue(a['matrix']) + self.result['eigenvalues'] = a['eigen'] - #self.result['matrix'] = str(a['matrix'] - #self.result['content'] = data[:200] - b = compute_matrix(site_b) + b['matrix'] = compute_probability_matrix(site_b) + b['eigen'] = compute_eigenvalue(b['matrix']) + #print "A: %s" % a #print "B: %s" % b correlation = numpy.vdot(a['eigen'],b['eigen']) @@ -122,24 +176,19 @@ class domclassTest(HTTPTest): import yaml, numpy site_a = readDOM(data) #site_b = readDOM(self.local_options['fileb']) - a = compute_matrix(site_a) + a['matrix'] = compute_probability_matrix(site_a) + a['eigen'] = compute_eigenvalue(a['matrix']) +
if len(data) == 0: self.result['eigenvalues'] = None self.result['matrix'] = None else: self.result['eigenvalues'] = a['eigen'] - #self.result['matrix'] = str(a['matrix']) + #self.result['matrix'] = a['matrix'] #self.result['content'] = data[:200] #b = compute_matrix(site_b) print "A: %s" % a return a['eigen'] - #print "B: %s" % b - #correlation = numpy.vdot(a['eigen'],b['eigen']) - #correlation /= numpy.linalg.norm(a['eigen'])*numpy.linalg.norm(b['eigen']) - #correlation = (correlation + 1)/2 - #print "Corelation: %s" % correlation
-# We need to instantiate it otherwise getPlugins does not detect it -# XXX Find a way to load plugins without instantiating them. domclass = domclassTest(None, None, None) diff --git a/ooni/protocols/http.py b/ooni/protocols/http.py index 5254a5c..2b38f28 100644 --- a/ooni/protocols/http.py +++ b/ooni/protocols/http.py @@ -40,12 +40,17 @@ class HTTPTest(OONITest): and once the request body has been received. """ randomize_ua = True + follow_redirects = False
def initialize(self): from twisted.web.client import Agent import yaml
self.agent = Agent(self.reactor) + if self.follow_redirects: + from twisted.web.client import RedirectAgent + self.agent = RedirectAgent(self.agent) + self.request = {} self.response = {}
tor-commits@lists.torproject.org