[tor-commits] [ooni-probe/master] Expand the heuristics for detecting blockpages in HTTP response

art at torproject.org
Mon May 30 16:28:34 UTC 2016


commit 7f8021efd69d28beb034a7cd25c60ea2c0016bf6
Author: Arturo Filastò <arturo at filasto.net>
Date:   Tue May 24 18:06:00 2016 +0200

    Expand the heuristics for detecting blockpages in HTTP response
    
    * Extract the title of the response
    
    * Consider only common HTTP headers
---
 ooni/nettests/blocking/web_connectivity.py | 76 ++++++++++++++++++++++--------
 ooni/utils/net.py                          | 39 +++++++++++++++
 2 files changed, 95 insertions(+), 20 deletions(-)
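
A minimal standalone sketch of the header heuristic this commit introduces, for
reviewers skimming the diff (not part of the patch itself; the common-header
list is abbreviated and the example headers are illustrative):

    # Sketch: only headers outside the common-headers list are compared.
    COMMON_SERVER_HEADERS = {"date", "content-type", "server",
                             "cache-control", "vary", "set-cookie"}

    def headers_match(control_headers, experiment_headers):
        # Lowercase the names, drop the common ones, and declare a match
        # only if the two responses share at least one uncommon header.
        ctrl = {k.lower() for k in control_headers} - COMMON_SERVER_HEADERS
        exp = {k.lower() for k in experiment_headers} - COMMON_SERVER_HEADERS
        return len(ctrl & exp) > 0

    # A blockpage typically lacks the origin's distinctive headers:
    print(headers_match({"Server": "nginx", "X-Origin-Pool": "web-1"},
                        {"Server": "squid", "Content-Type": "text/html"}))  # False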

diff --git a/ooni/nettests/blocking/web_connectivity.py b/ooni/nettests/blocking/web_connectivity.py
index e640a5f..8f048e1 100644
--- a/ooni/nettests/blocking/web_connectivity.py
+++ b/ooni/nettests/blocking/web_connectivity.py
@@ -1,7 +1,6 @@
 # -*- encoding: utf-8 -*-
 
 import csv
-import json
 from urlparse import urlparse
 
 from ipaddr import IPv4Address, AddressValueError
@@ -20,7 +19,7 @@ from ooni.utils import log
 
 from ooni.backend_client import WebConnectivityClient
 
-from ooni.utils.net import StringProducer, BodyReceiver
+from ooni.utils.net import COMMON_SERVER_HEADERS, extract_title
 from ooni.templates import httpt, dnst
 from ooni.errors import failureToString
 
@@ -50,6 +49,7 @@ class UsageOptions(usage.Options):
         ['url', 'u', None, 'Specify a single URL to test'],
         ['dns-discovery', 'd', 'whoami.akamai.net', 'Specify the dns discovery test helper'],
         ['backend', 'b', None, 'The web_consistency backend test helper'],
+        ['retries', 'r', 1, 'Number of retries for the HTTP request'],
     ]
 
 
@@ -158,6 +158,12 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest):
         if not self.input:
             raise Exception("No input specified")
 
+        try:
+            self.localOptions['retries'] = int(self.localOptions['retries'])
+        except ValueError:
+            self.localOptions['retries'] = 2
+
+        self.report['retries'] = self.localOptions['retries']
         self.report['client_resolver'] = self.resolverIp
         self.report['dns_consistency'] = None
         self.report['body_length_match'] = None
@@ -188,7 +194,8 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest):
                 'body_length': -1,
                 'failure': None,
                 'status_code': -1,
-                'headers': {}
+                'headers': {},
+                'title': ''
             }
         }
         if isinstance(self.localOptions['backend'], dict):
@@ -240,24 +247,36 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest):
         )
         self.report['control'] = self.control
 
+    @defer.inlineCallbacks
     def experiment_http_get_request(self):
-        return self.doRequest(self.input, headers=REQUEST_HEADERS)
+        retries = 0
+        while True:
+            try:
+                result = yield self.doRequest(self.input,
+                                              headers=REQUEST_HEADERS)
+                break
+            except:
+                if retries >= self.localOptions['retries']:
+                    raise
+                retries += 1
+
+        defer.returnValue(result)
 
     def compare_headers(self, experiment_http_response):
-        count = 0
         control_headers_lower = {k.lower(): v for k, v in
-                self.report['control']['http_request']['headers'].items()}
+                self.report['control']['http_request']['headers'].items()
+        }
+        experiment_headers_lower = {k.lower(): v for k, v in
+            experiment_http_response.headers.getAllRawHeaders()
+        }
 
-        for header_name, header_value in \
-                experiment_http_response.headers.getAllRawHeaders():
-            try:
-                control_headers_lower[header_name.lower()]
-            except KeyError:
-                log.debug("Did not find the key {}".format(header_name))
-                return False
-            count += 1
+        uncommon_ctrl_headers = (set(control_headers_lower.keys()) -
+                                 set(COMMON_SERVER_HEADERS))
+        uncommon_exp_headers = (set(experiment_headers_lower.keys()) -
+                                set(COMMON_SERVER_HEADERS))
 
-        return count == len(self.report['control']['http_request']['headers'])
+        return len(uncommon_ctrl_headers.intersection(
+                            uncommon_exp_headers)) > 0
 
     def compare_body_lengths(self, experiment_http_response):
         control_body_length = self.control['http_request']['body_length']
@@ -279,6 +298,17 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest):
         else:
             return False
 
+    def compare_titles(self, experiment_http_response):
+        experiment_title = extract_title(experiment_http_response.body).strip()
+        control_title = self.control['http_request']['title'].strip()
+        first_exp_word = experiment_title.split(' ')[0]
+        first_ctrl_word = control_title.split(' ')[0]
+        if len(first_exp_word) < 5:
+            # Don't attempt to match first words shorter than 5 characters
+            # (5 is roughly the average word length in English).
+            return False
+        return (first_ctrl_word.lower() == first_exp_word.lower())
+
     def compare_http_experiments(self, experiment_http_response):
 
         self.report['body_length_match'] = \
@@ -292,6 +322,8 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest):
             self.control['http_request']['status_code']
         )
 
+        self.report['title_match'] = self.compare_titles(experiment_http_response)
+
     def compare_dns_experiments(self, experiment_dns_answers):
         if self.control['dns']['failure'] is not None and \
                 self.control['dns']['failure'] == self.report['dns_experiment_failure']:
@@ -359,11 +391,15 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest):
             self.report['dns_consistency'] = 'inconsistent'
         tcp_connect = self.compare_tcp_experiments()
 
-        got_expected_web_page = (
-            (self.report['body_length_match'] is True or
-             self.report['headers_match'] is True)
-            and self.report['status_code_match'] is True
-        )
+        got_expected_web_page = None
+        if (experiment_http_failure is None and
+                    control_http_failure is None):
+            got_expected_web_page = (
+                (self.report['body_length_match'] is True or
+                 self.report['headers_match'] is True or
+                 self.report['title_match'])
+                and self.report['status_code_match'] is True
+            )
 
         if (dns_consistent == True and tcp_connect == False and
                 experiment_http_failure is not None):
diff --git a/ooni/utils/net.py b/ooni/utils/net.py
index ad5454e..20f5a42 100644
--- a/ooni/utils/net.py
+++ b/ooni/utils/net.py
@@ -1,3 +1,4 @@
+import re
 import sys
 import socket
 from random import randint
@@ -46,10 +47,48 @@ PLATFORMS = {'LINUX': sys.platform.startswith("linux"),
              'SOLARIS': sys.platform.startswith("sunos"),
              'WINDOWS': sys.platform.startswith("win32")}
 
+# These are the 25 most common server headers for the sites in the
+# citizenlab global testing list.
+COMMON_SERVER_HEADERS = (
+    "date",
+    "content-type",
+    "server",
+    "cache-control",
+    "vary",
+    "set-cookie",
+    "location",
+    "expires",
+    "x-powered-by",
+    "content-encoding",
+    "last-modified",
+    "accept-ranges",
+    "pragma",
+    "x-frame-options",
+    "etag",
+    "x-content-type-options",
+    "age",
+    "via",
+    "p3p",
+    "x-xss-protection",
+    "content-language",
+    "cf-ray",
+    "strict-transport-security",
+    "link",
+    "x-varnish"
+)
+
 # This is used as a default for checking if we get the expected result when
 # fetching URLs over some proxy.
 GOOGLE_HUMANS = ('http://www.google.com/humans.txt', 'Google is built by a large')
 
+TITLE_REGEXP = re.compile("<title>(.*?)</title>", re.IGNORECASE | re.DOTALL)
+
+def extract_title(body):
+    m = TITLE_REGEXP.search(body)
+    if m:
+        return m.group(1)
+    return ''
+
 class StringProducer(object):
     implements(IBodyProducer)
 
