# [tor-commits] [ooni-probe/master] Do some benchmarks on domclass and figure out where optimization should be

art at torproject.org art at torproject.org
Sun Oct 7 16:32:51 UTC 2012

commit c30049b0d9daf748378f3da717411f4c0b85a23d
Author: Arturo Filastò <arturo at filasto.net>
Date:   Sun Oct 7 16:32:00 2012 +0000

Do some benchmarks on domclass and figure out where optimization should be
done.
(We must optimize how the DOM tree is parsed and how we compute the couple
matrix)
---
ooni/kit/domclass.py |   53 ++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/ooni/kit/domclass.py b/ooni/kit/domclass.py
index d50647c..33c960a 100644
--- a/ooni/kit/domclass.py
+++ b/ooni/kit/domclass.py
@@ -136,14 +136,63 @@ def compute_correlation(matrix_a, matrix_b):
correlation = (correlation + 1)/2
return correlation

-def example():
+def benchmark():
+    """
+    Running some very basic benchmarks we assets this:
+
+    done in 0.74356508255
+    done in 0.94336104393
+    Computing prob matrix
+    done in 0.0432229042053
+    Computing eigenvalues
+    done in 0.00188422203064
+    Corelation: 0.999999079331
+
+    this was with:
+    683 filea.txt
+    678 fileb.txt
+
+    diff file* | wc -l
+    283
+
+
+    What this means is that the bottleneck is not in the maths, but is rather
+    in the computation of the DOM tree matrix.
+
+    XXX We should focus on optimizing the parsing of the HTML and the
+    computation of the couple matrix.
+    """
+    import time
+    start = time.time()
+
+    print "done in %s" % (time.time() - start)
+    start = time.time()
+
+    print "done in %s" % (time.time() - start)
+    start = time.time()
+
+

a = {}
+    print "Computing prob matrix"
a['matrix'] = compute_probability_matrix(site_a)
+
+    print "done in %s" % (time.time() - start)
+    start = time.time()
+
+    print "Computing eigenvalues"
a['eigen'] = compute_eigenvalues(a['matrix'])

+    print "done in %s" % (time.time() - start)
+    start = time.time()
+
+
b = {}
b['matrix'] = compute_probability_matrix(site_b)
b['eigen'] = compute_eigenvalues(b['matrix'])
@@ -151,4 +200,4 @@ def example():
correlation = compute_correlation(a['eigen'], b['eigen'])
print "Corelation: %s" % correlation

-
+#benchmark()