[tor-commits] [ooniprobe/master] Do some benchmarks on domclass and figure out where optimization should be
art at torproject.org
art at torproject.org
Sun Oct 7 16:32:51 UTC 2012
commit c30049b0d9daf748378f3da717411f4c0b85a23d
Author: Arturo Filastò <arturo at filasto.net>
Date: Sun Oct 7 16:32:00 2012 +0000
Do some benchmarks on domclass and figure out where optimization should be
done.
(We must optimize how the DOM tree is parsed and how we compute the couple
matrix)

ooni/kit/domclass.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/ooni/kit/domclass.py b/ooni/kit/domclass.py
index d50647c..33c960a 100644
--- a/ooni/kit/domclass.py
+++ b/ooni/kit/domclass.py
@@ -136,14 +136,63 @@ def compute_correlation(matrix_a, matrix_b):
correlation = (correlation + 1)/2
return correlation
-def example():
+def benchmark():
+ """
+ Running some very basic benchmarks we assess this:
+
+ Read file B
+ done in 0.74356508255
+ Read file A
+ done in 0.94336104393
+ Computing prob matrix
+ done in 0.0432229042053
+ Computing eigenvalues
+ done in 0.00188422203064
+ Corelation: 0.999999079331
+
+ this was with:
+ 683 filea.txt
+ 678 fileb.txt
+
+ diff file* | wc -l
+ 283
+
+
+ What this means is that the bottleneck is not in the maths, but is rather
+ in the computation of the DOM tree matrix.
+
+ XXX We should focus on optimizing the parsing of the HTML and the
+ computation of the couple matrix.
+ """
+ import time
+ start = time.time()
+
+ print "Read file B"
site_a = readDOM(filename='filea.txt')
+ print "done in %s" % (time.time() - start)
+ start = time.time()
+
+ print "Read file A"
site_b = readDOM(filename='fileb.txt')
+ print "done in %s" % (time.time() - start)
+ start = time.time()
+
+
a = {}
+ print "Computing prob matrix"
a['matrix'] = compute_probability_matrix(site_a)
+
+ print "done in %s" % (time.time() - start)
+ start = time.time()
+
+ print "Computing eigenvalues"
a['eigen'] = compute_eigenvalues(a['matrix'])
+ print "done in %s" % (time.time() - start)
+ start = time.time()
+
+
b = {}
b['matrix'] = compute_probability_matrix(site_b)
b['eigen'] = compute_eigenvalues(b['matrix'])
@@ -151,4 +200,4 @@ def example():
correlation = compute_correlation(a['eigen'], b['eigen'])
print "Corelation: %s" % correlation

+#benchmark()
