[tor-commits] [ooni-probe/master] Turn DOMClass into a kit

art at torproject.org art at torproject.org
Sun Oct 7 16:32:51 UTC 2012


commit 9590d92f9e36b10b6960f83246f35e73d72f55e5
Author: Arturo Filastò <arturo at filasto.net>
Date:   Sun Oct 7 16:24:46 2012 +0000

    Turn DOMClass into a kit
---
 ooni/kit/domclass.py |  154 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 154 insertions(+), 0 deletions(-)

diff --git a/ooni/kit/domclass.py b/ooni/kit/domclass.py
new file mode 100644
index 0000000..d50647c
--- /dev/null
+++ b/ooni/kit/domclass.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+#-*- encoding: utf-8 -*-
+#
+#    domclass
+#    ********
+#
+#    :copyright: (c) 2012 by Arturo Filastò
+#    :license: see LICENSE for more details.
+#
+#    how this works
+#    --------------
+#
+#    This classifier uses the DOM structure of a web page to determine how
+#    similar two pages are.
+#    The procedure we use is the following:
+#        * First we parse the DOM tree of the web page and build a list of
+#          TAG parent child relationships (ex. <html><a><b></b></a><c></c></html> =>
+#          (html, a), (a, b), (html, c)).
+#
+#        * We then use this information to build a matrix (M) where m[i][j] = P(of
+#          transitioning from tag[i] to tag[j]). If tag[i] does not occur, P() = 0.
+#          Note: M is a square matrix that is number_of_tags wide.
+#
+#        * We then calculate the eigenvalues (e) of M.
+#
+#        * The correlation between page A and B is given via this formula:
+#          correlation = dot_product(e_A, e_B), normalized by the vector norms
+#          and rescaled to the range [0, 1], where e_A and e_B are respectively
+#          the eigenvalues for the probability matrix A and the probability
+#          matrix B.
+#
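+#    worked example
+#    --------------
+#
+#    For the parent/child pairs in the example above, (html, a), (a, b) and
+#    (html, c), only A is listed in thetags below, so html, b and c all fall
+#    into the extra "other tag" slot (the last row/column of the matrix).
+#    Counting the transitions and normalizing each row gives
+#    m[other][A] = 0.5 and m[other][other] = 0.5 (from html -> a and
+#    html -> c), m[A][other] = 1.0 (from a -> b), and every other row stays
+#    all zeros.
+#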
+
+import yaml
+import numpy
+from bs4 import BeautifulSoup
+
+# All HTML4 tags
+# XXX add link to W3C page where these came from
+alltags = ['A', 'ABBR', 'ACRONYM', 'ADDRESS', 'APPLET', 'AREA', 'B', 'BASE',
+           'BASEFONT', 'BDO', 'BIG', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
+           'CENTER', 'CITE', 'CODE', 'COL', 'COLGROUP', 'DD', 'DEL', 'DFN', 'DIR', 'DIV',
+           'DL', 'DT', 'EM', 'FIELDSET', 'FONT', 'FORM', 'FRAME', 'FRAMESET', 'H1', 'H2',
+           'H3', 'H4', 'H5', 'H6', 'HEAD', 'HR', 'HTML', 'I', 'IFRAME', 'IMG',
+           'INPUT', 'INS', 'ISINDEX', 'KBD', 'LABEL', 'LEGEND', 'LI', 'LINK', 'MAP',
+           'MENU', 'META', 'NOFRAMES', 'NOSCRIPT', 'OBJECT', 'OL', 'OPTGROUP', 'OPTION',
+           'P', 'PARAM', 'PRE', 'Q', 'S', 'SAMP', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN',
+           'STRIKE', 'STRONG', 'STYLE', 'SUB', 'SUP', 'TABLE', 'TBODY', 'TD',
+           'TEXTAREA', 'TFOOT', 'TH', 'THEAD', 'TITLE', 'TR', 'TT', 'U', 'UL', 'VAR']
+
+# Reduced subset of only the most common tags
+commontags = ['A', 'B', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
+           'CENTER', 'CITE', 'CODE', 'COL', 'DD', 'DIV',
+           'DL', 'DT', 'EM', 'FIELDSET', 'FONT', 'FORM', 'FRAME', 'FRAMESET', 'H1', 'H2',
+           'H3', 'H4', 'H5', 'H6', 'HEAD', 'HR', 'HTML', 'IFRAME', 'IMG',
+           'INPUT', 'INS', 'LABEL', 'LEGEND', 'LI', 'LINK', 'MAP',
+           'MENU', 'META', 'NOFRAMES', 'NOSCRIPT', 'OBJECT', 'OL', 'OPTION',
+           'P', 'PRE', 'SCRIPT', 'SELECT', 'SMALL', 'SPAN',
+           'STRIKE', 'STRONG', 'STYLE', 'SUB', 'SUP', 'TABLE', 'TBODY', 'TD',
+           'TEXTAREA', 'TFOOT', 'TH', 'THEAD', 'TITLE', 'TR', 'TT', 'U', 'UL']
+
+# The tags we are interested in using for our analysis
+thetags = ['A', 'DIV', 'FRAME', 'H1', 'H2',
+           'H3', 'H4', 'IFRAME', 'INPUT',
+           'LABEL','LI', 'P', 'SCRIPT', 'SPAN',
+           'STYLE', 'TR']
+
+def compute_probability_matrix(dataset):
+    """
+    Compute the probability matrix based on the input dataset.
+
+    :dataset: an array of pairs representing the parent-child relationships.
+    """
+    matrix = numpy.zeros((len(thetags) + 1, len(thetags) + 1))
+
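+    # First pass: count how many times each (parent tag, child tag)
+    # transition occurs in the dataset.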
+    for data in dataset:
+        x = data[0].upper()
+        y = data[1].upper()
+        try:
+            x = thetags.index(x)
+        except ValueError:
+            # Tags not listed in thetags all share the extra last index.
+            x = len(thetags)
+
+        try:
+            y = thetags.index(y)
+        except ValueError:
+            y = len(thetags)
+
+        matrix[x,y] += 1
+
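+    # Second pass: divide each row by its total count so that every entry
+    # becomes the probability of transitioning from the row tag to the
+    # column tag (rows with no observations stay all zeros).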
+    for x in xrange(len(thetags) + 1):
+        total = matrix[x].sum()
+        if total != 0:
+            matrix[x] = matrix[x] / total
+
+    return matrix
+
+def compute_eigenvalues(matrix):
+    """
+    Returns the eigenvalues of the supplied square matrix.
+
+    :matrix: a square matrix.
+    """
+    return numpy.linalg.eigvals(matrix)
+
+def readDOM(content=None, filename=None):
+    """
+    Parses the DOM of the HTML page and returns an array of parent, child
+    pairs.
+
+    :content: the content of the HTML page to be read.
+
+    :filename: the filename to be read from for getting the content of the
+               page.
+    """
+
+    if filename:
+        with open(filename) as f:
+            content = f.read()
+
+    dom = BeautifulSoup(content)
+    couples = []
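+    # Record, for every element in the parsed tree, its parent's tag name
+    # together with its own tag name.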
+    for x in dom.findAll():
+        couples.append((str(x.parent.name), str(x.name)))
+
+    return couples
+
+def compute_correlation(matrix_a, matrix_b):
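+    """
+    Returns a similarity score for the two input vectors (here the
+    eigenvalue vectors of two probability matrices): the cosine similarity
+    of the vectors rescaled from [-1, 1] to [0, 1], so that 1.0 means the
+    vectors point in the same direction and 0.0 in opposite directions.
+    """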
+    correlation = numpy.vdot(matrix_a, matrix_b)
+    correlation /= numpy.linalg.norm(matrix_a)*numpy.linalg.norm(matrix_b)
+    correlation = (correlation + 1)/2
+    return correlation
+
+def example():
+    site_a = readDOM(filename='filea.txt')
+    site_b = readDOM(filename='fileb.txt')
+
+    a = {}
+    a['matrix'] = compute_probability_matrix(site_a)
+    a['eigen'] = compute_eigenvalues(a['matrix'])
+
+    b = {}
+    b['matrix'] = compute_probability_matrix(site_b)
+    b['eigen'] = compute_eigenvalues(b['matrix'])
+
+    correlation = compute_correlation(a['eigen'], b['eigen'])
+    print "Corelation: %s" % correlation
+
+





More information about the tor-commits mailing list