[tor-commits] [ooni-probe/master] Do some benchmarks on domclass and figure out where optimization should be

art at torproject.org art at torproject.org
Sun Oct 7 16:32:51 UTC 2012


commit c30049b0d9daf748378f3da717411f4c0b85a23d
Author: Arturo Filastò <arturo at filasto.net>
Date:   Sun Oct 7 16:32:00 2012 +0000

    Do some benchmarks on domclass and figure out where optimization should be
    done.
    (We must optimize how the DOM tree is parsed and how we compute the couple
    matrix)
---
 ooni/kit/domclass.py |   53 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/ooni/kit/domclass.py b/ooni/kit/domclass.py
index d50647c..33c960a 100644
--- a/ooni/kit/domclass.py
+++ b/ooni/kit/domclass.py
@@ -136,14 +136,63 @@ def compute_correlation(matrix_a, matrix_b):
     correlation = (correlation + 1)/2
     return correlation
 
-def example():
+def benchmark():
+    """
+    Running some very basic benchmarks, we observed this:
+
+    Read file B
+    done in 0.74356508255
+    Read file A
+    done in 0.94336104393
+    Computing prob matrix
+    done in 0.0432229042053
+    Computing eigenvalues
+    done in 0.00188422203064
+    Corelation: 0.999999079331
+
+    this was with:
+    683 filea.txt
+    678 fileb.txt
+
+    diff file* | wc -l
+    283
+
+
+    What this means is that the bottleneck is not in the maths, but is rather
+    in the computation of the DOM tree matrix.
+
+    XXX We should focus on optimizing the parsing of the HTML and the
+    computation of the couple matrix.
+    """
+    import time
+    start = time.time()
+
+    print "Read file B"
     site_a = readDOM(filename='filea.txt')
+    print "done in %s" % (time.time() - start)
+    start = time.time()
+
+    print "Read file A"
     site_b = readDOM(filename='fileb.txt')
+    print "done in %s" % (time.time() - start)
+    start = time.time()
+
+
 
     a = {}
+    print "Computing prob matrix"
     a['matrix'] = compute_probability_matrix(site_a)
+
+    print "done in %s" % (time.time() - start)
+    start = time.time()
+
+    print "Computing eigenvalues"
     a['eigen'] = compute_eigenvalues(a['matrix'])
 
+    print "done in %s" % (time.time() - start)
+    start = time.time()
+
+
     b = {}
     b['matrix'] = compute_probability_matrix(site_b)
     b['eigen'] = compute_eigenvalues(b['matrix'])
@@ -151,4 +200,4 @@ def example():
     correlation = compute_correlation(a['eigen'], b['eigen'])
     print "Corelation: %s" % correlation
 
-
+#benchmark()



More information about the tor-commits mailing list