[tor-commits] [ooni-probe/master] Implement collector for classifying website with domclass.

art at torproject.org art at torproject.org
Sun Oct 7 17:06:23 UTC 2012


commit 3d1afe284b3437ee460841828e7a7175f93564e3
Author: Arturo Filastò <arturo at filasto.net>
Date:   Sun Oct 7 17:05:33 2012 +0000

    Implement collector for classifying website with domclass.
    Next step: run this on all the sites we want to test.
---
 nettests/experimental/domclass_collector.py |   33 +++++++++++++++++++++++++++
 ooni/kit/__init__.py                        |    1 +
 ooni/kit/domclass.py                        |   30 ++++++++++++++++--------
 3 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/nettests/experimental/domclass_collector.py b/nettests/experimental/domclass_collector.py
new file mode 100644
index 0000000..9b2c8d8
--- /dev/null
+++ b/nettests/experimental/domclass_collector.py
@@ -0,0 +1,33 @@
+# -*- encoding: utf-8 -*-
+#
+# The purpose of this collector is to compute the DOM eigenvalues of each
+# site in the input list of URLs.
+#
+#
+# :authors: Arturo Filastò
+# :licence: see LICENSE
+
+from twisted.internet import threads, defer
+
+from ooni.kit import domclass
+from ooni.templates import httpt
+
+class DOMClassCollector(httpt.HTTPTest):
+    name = "DOM class collector"
+    author = "Arturo Filastò"
+    version = 0.1
+
+    inputs = ['http://news.google.com/', 'http://wikileaks.org/']
+    #inputFile = ['f', 'file', None, 'The list of urls to build a domclass for']
+
+    def test_collect(self):
+        if self.input:
+            url = self.input
+            return self.doRequest(url)
+        else:
+            raise Exception("No input specified")
+
+    def processResponseBody(self, body):
+        eigenvalues = domclass.compute_eigenvalues_from_DOM(content=body)
+        self.report['eigenvalues'] = eigenvalues
+
diff --git a/ooni/kit/__init__.py b/ooni/kit/__init__.py
new file mode 100644
index 0000000..55374c9
--- /dev/null
+++ b/ooni/kit/__init__.py
@@ -0,0 +1 @@
+__all__ = ['domclass']
diff --git a/ooni/kit/domclass.py b/ooni/kit/domclass.py
index 02c26dc..1cf33a0 100644
--- a/ooni/kit/domclass.py
+++ b/ooni/kit/domclass.py
@@ -108,7 +108,7 @@ def compute_eigenvalues(matrix):
     """
     return numpy.linalg.eigvals(matrix)
 
-def readDOM(content=None, filename=None):
+def readDOM(content=None, filename=None, debug=False):
     """
     Parses the DOM of the HTML page and returns an array of parent, child
     pairs.
@@ -124,20 +124,30 @@ def readDOM(content=None, filename=None):
         content = ''.join(f.readlines())
         f.close()
 
-    start = time.time()
-    print "Running BeautifulSoup on content"
+    if debug:
+        start = time.time()
+        print "Running BeautifulSoup on content"
     dom = BeautifulSoup(content)
-    print "done in %s" % (time.time() - start)
+    if debug:
+        print "done in %s" % (time.time() - start)
 
-    start = time.time()
-    print "Creating couples matrix"
+    if debug:
+        start = time.time()
+        print "Creating couples matrix"
     couples = []
     for x in dom.findAll():
         couples.append((str(x.parent.name), str(x.name)))
-    print "done in %s" % (time.time() - start)
+    if debug:
+        print "done in %s" % (time.time() - start)
 
     return couples
 
+def compute_eigenvalues_from_DOM(*arg,**kw):
+    dom = readDOM(*arg, **kw)
+    probability_matrix = compute_probability_matrix(dom)
+    eigenvalues = compute_eigenvalues(probability_matrix)
+    return eigenvalues
+
 def compute_correlation(matrix_a, matrix_b):
     correlation = numpy.vdot(matrix_a, matrix_b)
     correlation /= numpy.linalg.norm(matrix_a)*numpy.linalg.norm(matrix_b)
@@ -192,13 +202,13 @@ def benchmark():
     """
     start = time.time()
     print "Read file B"
-    site_a = readDOM(filename='filea.txt')
+    site_a = readDOM(filename='filea.txt', debug=True)
     print "--------"
     print "total done in %s" % (time.time() - start)
 
     start = time.time()
     print "Read file A"
-    site_b = readDOM(filename='fileb.txt')
+    site_b = readDOM(filename='fileb.txt', debug=True)
     print "--------"
     print "total done in %s" % (time.time() - start)
 
@@ -233,4 +243,4 @@ def benchmark():
 
     print "Corelation: %s" % correlation
 
-benchmark()
+#benchmark()



More information about the tor-commits mailing list