commit 3d1afe284b3437ee460841828e7a7175f93564e3 Author: Arturo Filastò arturo@filasto.net Date: Sun Oct 7 17:05:33 2012 +0000
Implement collector for classifying websites with domclass. Next step: run this on all the sites we want to test. --- nettests/experimental/domclass_collector.py | 33 +++++++++++++++++++++++++++ ooni/kit/__init__.py | 1 + ooni/kit/domclass.py | 30 ++++++++++++++++-------- 3 files changed, 54 insertions(+), 10 deletions(-)
diff --git a/nettests/experimental/domclass_collector.py b/nettests/experimental/domclass_collector.py new file mode 100644 index 0000000..9b2c8d8 --- /dev/null +++ b/nettests/experimental/domclass_collector.py @@ -0,0 +1,33 @@ +# -*- encoding: utf-8 -*- +# +# The purpose of this collector is to compute the eigenvector for the input +# file containing a list of sites. +# +# +# :authors: Arturo Filastò +# :licence: see LICENSE + +from twisted.internet import threads, defer + +from ooni.kit import domclass +from ooni.templates import httpt + +class DOMClassCollector(httpt.HTTPTest): + name = "DOM class collector" + author = "Arturo Filastò" + version = 0.1 + + inputs = ['http://news.google.com/', 'http://wikileaks.org/'] + #inputFile = ['f', 'file', None, 'The list of urls to build a domclass for'] + + def test_collect(self): + if self.input: + url = self.input + return self.doRequest(url) + else: + raise Exception("No input specified") + + def processResponseBody(self, body): + eigenvalues = domclass.compute_eigenvalues_from_DOM(content=body) + self.report['eigenvalues'] = eigenvalues + diff --git a/ooni/kit/__init__.py b/ooni/kit/__init__.py new file mode 100644 index 0000000..55374c9 --- /dev/null +++ b/ooni/kit/__init__.py @@ -0,0 +1 @@ +__all__ = ['domclass'] diff --git a/ooni/kit/domclass.py b/ooni/kit/domclass.py index 02c26dc..1cf33a0 100644 --- a/ooni/kit/domclass.py +++ b/ooni/kit/domclass.py @@ -108,7 +108,7 @@ def compute_eigenvalues(matrix): """ return numpy.linalg.eigvals(matrix)
-def readDOM(content=None, filename=None): +def readDOM(content=None, filename=None, debug=False): """ Parses the DOM of the HTML page and returns an array of parent, child pairs. @@ -124,20 +124,30 @@ def readDOM(content=None, filename=None): content = ''.join(f.readlines()) f.close()
- start = time.time() - print "Running BeautifulSoup on content" + if debug: + start = time.time() + print "Running BeautifulSoup on content" dom = BeautifulSoup(content) - print "done in %s" % (time.time() - start) + if debug: + print "done in %s" % (time.time() - start)
- start = time.time() - print "Creating couples matrix" + if debug: + start = time.time() + print "Creating couples matrix" couples = [] for x in dom.findAll(): couples.append((str(x.parent.name), str(x.name))) - print "done in %s" % (time.time() - start) + if debug: + print "done in %s" % (time.time() - start)
return couples
+def compute_eigenvalues_from_DOM(*arg,**kw): + dom = readDOM(*arg, **kw) + probability_matrix = compute_probability_matrix(dom) + eigenvalues = compute_eigenvalues(probability_matrix) + return eigenvalues + def compute_correlation(matrix_a, matrix_b): correlation = numpy.vdot(matrix_a, matrix_b) correlation /= numpy.linalg.norm(matrix_a)*numpy.linalg.norm(matrix_b) @@ -192,13 +202,13 @@ def benchmark(): """ start = time.time() print "Read file B" - site_a = readDOM(filename='filea.txt') + site_a = readDOM(filename='filea.txt', debug=True) print "--------" print "total done in %s" % (time.time() - start)
start = time.time() print "Read file A" - site_b = readDOM(filename='fileb.txt') + site_b = readDOM(filename='fileb.txt', debug=True) print "--------" print "total done in %s" % (time.time() - start)
@@ -233,4 +243,4 @@ def benchmark():
print "Corelation: %s" % correlation
-benchmark() +#benchmark()
tor-commits@lists.torproject.org