commit 7f8021efd69d28beb034a7cd25c60ea2c0016bf6 Author: Arturo Filastò arturo@filasto.net Date: Tue May 24 18:06:00 2016 +0200
Expand the heuristics for detecting blockpages in HTTP responses
* Extract the title of the response
* Ignore the most common HTTP server headers and compare only the uncommon ones --- ooni/nettests/blocking/web_connectivity.py | 76 ++++++++++++++++++++++-------- ooni/utils/net.py | 39 +++++++++++++++ 2 files changed, 95 insertions(+), 20 deletions(-)
diff --git a/ooni/nettests/blocking/web_connectivity.py b/ooni/nettests/blocking/web_connectivity.py index e640a5f..8f048e1 100644 --- a/ooni/nettests/blocking/web_connectivity.py +++ b/ooni/nettests/blocking/web_connectivity.py @@ -1,7 +1,6 @@ # -*- encoding: utf-8 -*-
import csv -import json from urlparse import urlparse
from ipaddr import IPv4Address, AddressValueError @@ -20,7 +19,7 @@ from ooni.utils import log
from ooni.backend_client import WebConnectivityClient
-from ooni.utils.net import StringProducer, BodyReceiver +from ooni.utils.net import COMMON_SERVER_HEADERS, extract_title from ooni.templates import httpt, dnst from ooni.errors import failureToString
@@ -50,6 +49,7 @@ class UsageOptions(usage.Options): ['url', 'u', None, 'Specify a single URL to test'], ['dns-discovery', 'd', 'whoami.akamai.net', 'Specify the dns discovery test helper'], ['backend', 'b', None, 'The web_consistency backend test helper'], + ['retries', 'r', 1, 'Number of retries for the HTTP request'], ]
@@ -158,6 +158,12 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest): if not self.input: raise Exception("No input specified")
+ try: + self.localOptions['retries'] = int(self.localOptions['retries']) + except ValueError: + self.localOptions['retries'] = 2 + + self.report['retries'] = self.localOptions['retries'] self.report['client_resolver'] = self.resolverIp self.report['dns_consistency'] = None self.report['body_length_match'] = None @@ -188,7 +194,8 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest): 'body_length': -1, 'failure': None, 'status_code': -1, - 'headers': {} + 'headers': {}, + 'title': '' } } if isinstance(self.localOptions['backend'], dict): @@ -240,24 +247,36 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest): ) self.report['control'] = self.control
+ @defer.inlineCallbacks def experiment_http_get_request(self): - return self.doRequest(self.input, headers=REQUEST_HEADERS) + retries = 0 + while True: + try: + result = yield self.doRequest(self.input, + headers=REQUEST_HEADERS) + break + except: + if self.localOptions['retries'] > retries: + raise + retries += 1 + + defer.returnValue(result)
def compare_headers(self, experiment_http_response): - count = 0 control_headers_lower = {k.lower(): v for k, v in - self.report['control']['http_request']['headers'].items()} + self.report['control']['http_request']['headers'].items() + } + experiment_headers_lower = {k.lower(): v for k, v in + experiment_http_response.headers.getAllRawHeaders() + }
- for header_name, header_value in \ - experiment_http_response.headers.getAllRawHeaders(): - try: - control_headers_lower[header_name.lower()] - except KeyError: - log.debug("Did not find the key {}".format(header_name)) - return False - count += 1 + uncommon_ctrl_headers = (set(control_headers_lower.keys()) - + set(COMMON_SERVER_HEADERS)) + uncommon_exp_headers = (set(experiment_headers_lower.keys()) - + set(COMMON_SERVER_HEADERS))
- return count == len(self.report['control']['http_request']['headers']) + return len(uncommon_ctrl_headers.intersection( + uncommon_exp_headers)) > 0
def compare_body_lengths(self, experiment_http_response): control_body_length = self.control['http_request']['body_length'] @@ -279,6 +298,17 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest): else: return False
+ def compare_titles(self, experiment_http_response): + experiment_title = extract_title(experiment_http_response.body).strip() + control_title = self.control['http_request']['title'].strip() + first_exp_word = experiment_title.split(' ')[0] + first_ctrl_word = control_title.split(' ')[0] + if len(first_exp_word) < 5: + # We don't consider to match words that are shorter than 5 + # characters (5 is the average word length for english) + return False + return (first_ctrl_word.lower() == first_exp_word.lower()) + def compare_http_experiments(self, experiment_http_response):
self.report['body_length_match'] = \ @@ -292,6 +322,8 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest): self.control['http_request']['status_code'] )
+ self.report['title_match'] = self.compare_titles(experiment_http_response) + def compare_dns_experiments(self, experiment_dns_answers): if self.control['dns']['failure'] is not None and \ self.control['dns']['failure'] == self.report['dns_experiment_failure']: @@ -359,11 +391,15 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest): self.report['dns_consistency'] = 'inconsistent' tcp_connect = self.compare_tcp_experiments()
- got_expected_web_page = ( - (self.report['body_length_match'] is True or - self.report['headers_match'] is True) - and self.report['status_code_match'] is True - ) + got_expected_web_page = None + if (experiment_http_failure is None and + control_http_failure is None): + got_expected_web_page = ( + (self.report['body_length_match'] is True or + self.report['headers_match'] is True or + self.report['title_match']) + and self.report['status_code_match'] is True + )
if (dns_consistent == True and tcp_connect == False and experiment_http_failure is not None): diff --git a/ooni/utils/net.py b/ooni/utils/net.py index ad5454e..20f5a42 100644 --- a/ooni/utils/net.py +++ b/ooni/utils/net.py @@ -1,3 +1,4 @@ +import re import sys import socket from random import randint @@ -46,10 +47,48 @@ PLATFORMS = {'LINUX': sys.platform.startswith("linux"), 'SOLARIS': sys.platform.startswith("sunos"), 'WINDOWS': sys.platform.startswith("win32")}
+# These are the 25 most common server headers for the sites in the +# citizenlab global testing list. +COMMON_SERVER_HEADERS = ( + "date", + "content-type", + "server", + "cache-control", + "vary", + "set-cookie", + "location", + "expires", + "x-powered-by", + "content-encoding", + "last-modified", + "accept-ranges", + "pragma", + "x-frame-options", + "etag", + "x-content-type-options", + "age", + "via", + "p3p", + "x-xss-protection", + "content-language", + "cf-ray", + "strict-transport-security", + "link", + "x-varnish" +) + # This is used as a default for checking if we get the expected result when # fetching URLs over some proxy. GOOGLE_HUMANS = ('http://www.google.com/humans.txt', 'Google is built by a large')
+TITLE_REGEXP = re.compile("<title>(.*?)</title>", re.IGNORECASE | re.DOTALL) + +def extract_title(body): + m = TITLE_REGEXP.search(body) + if m: + return m.group(1) + return '' + class StringProducer(object): implements(IBodyProducer)