commit c30049b0d9daf748378f3da717411f4c0b85a23d Author: Arturo Filastò arturo@filasto.net Date: Sun Oct 7 16:32:00 2012 +0000
Do some benchmarks on domclass and figure out where optimization should be done. (We must optimize how the DOM tree is parsed and how we compute the couple matrix) --- ooni/kit/domclass.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/ooni/kit/domclass.py b/ooni/kit/domclass.py index d50647c..33c960a 100644 --- a/ooni/kit/domclass.py +++ b/ooni/kit/domclass.py @@ -136,14 +136,63 @@ def compute_correlation(matrix_a, matrix_b): correlation = (correlation + 1)/2 return correlation
-def example(): +def benchmark(): + """ + Running some very basic benchmarks we assess this: + + Read file B + done in 0.74356508255 + Read file A + done in 0.94336104393 + Computing prob matrix + done in 0.0432229042053 + Computing eigenvalues + done in 0.00188422203064 + Corelation: 0.999999079331 + + this was with: + 683 filea.txt + 678 fileb.txt + + diff file* | wc -l + 283 + + + What this means is that the bottleneck is not in the maths, but is rather + in the computation of the DOM tree matrix. + + XXX We should focus on optimizing the parsing of the HTML and the + computation of the couple matrix. + """ + import time + start = time.time() + + print "Read file B" site_a = readDOM(filename='filea.txt') + print "done in %s" % (time.time() - start) + start = time.time() + + print "Read file A" site_b = readDOM(filename='fileb.txt') + print "done in %s" % (time.time() - start) + start = time.time() + +
a = {} + print "Computing prob matrix" a['matrix'] = compute_probability_matrix(site_a) + + print "done in %s" % (time.time() - start) + start = time.time() + + print "Computing eigenvalues" a['eigen'] = compute_eigenvalues(a['matrix'])
+ print "done in %s" % (time.time() - start) + start = time.time() + + b = {} b['matrix'] = compute_probability_matrix(site_b) b['eigen'] = compute_eigenvalues(b['matrix']) @@ -151,4 +200,4 @@ def example(): correlation = compute_correlation(a['eigen'], b['eigen']) print "Corelation: %s" % correlation
- +#benchmark()