commit 582f85501a072dfd8f2f6ebbdb28dd615a8db325 Author: Arturo Filastò arturo@filasto.net Date: Tue Apr 4 16:48:07 2017 +0000
Use the same logic for encoding header values and keys as measurement… (#739)
* Use the same logic for encoding header values and keys as measurement-kit
* Fix edge case that led us to not strip response headers when the response body was null.
* Bump the version of all the tests that depend on the httpt test template
This will allow us to distinguish when the fix for the body length is rolled out --- ooni/common/http_utils.py | 30 +++------------------- ooni/nettests/blocking/facebook_messenger.py | 2 +- ooni/nettests/blocking/http_requests.py | 2 +- ooni/nettests/blocking/meek_fronted_requests.py | 2 +- ooni/nettests/blocking/web_connectivity.py | 2 +- ooni/nettests/blocking/whatsapp.py | 2 +- ooni/nettests/experimental/domclass_collector.py | 2 +- .../experimental/http_keyword_filtering.py | 2 +- .../experimental/http_uk_mobile_networks.py | 1 + ooni/nettests/experimental/squid.py | 2 +- ooni/nettests/manipulation/captiveportal.py | 2 +- .../manipulation/http_header_field_manipulation.py | 2 +- ooni/nettests/manipulation/http_host.py | 2 +- ooni/nettests/scanning/http_url_list.py | 2 +- ooni/nettests/third_party/psiphon.py | 2 +- ooni/templates/httpt.py | 24 +++++++++++------ ooni/tests/test_common.py | 22 +++++----------- 17 files changed, 39 insertions(+), 64 deletions(-)
diff --git a/ooni/common/http_utils.py b/ooni/common/http_utils.py index 57c3a15c..b44f5c0b 100644 --- a/ooni/common/http_utils.py +++ b/ooni/common/http_utils.py @@ -1,37 +1,13 @@ import re -import codecs from base64 import b64encode
-META_CHARSET_REGEXP = re.compile('<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"']*([^\s"'/>!;]+)')
def representBody(body): if not body: return body - # XXX perhaps add support for decoding gzip in the future. - body = body.replace('\0', '') - decoded = False - charsets = ['ascii', 'utf-8'] - - # If we are able to detect the charset of body from the meta tag - # try to decode using that one first - charset = META_CHARSET_REGEXP.search(body, re.IGNORECASE) - if charset: - try: - encoding = charset.group(1).lower() - codecs.lookup(encoding) - charsets.insert(0, encoding) - except (LookupError, IndexError): - # Skip invalid codecs and partial regexp match - pass - - for encoding in charsets: - try: - body = unicode(body, encoding) - decoded = True - break - except UnicodeDecodeError: - pass - if not decoded: + try: + body = unicode(body, 'utf-8') + except UnicodeDecodeError: body = { 'data': b64encode(body), 'format': 'base64' diff --git a/ooni/nettests/blocking/facebook_messenger.py b/ooni/nettests/blocking/facebook_messenger.py index 651fca0d..9d711fd8 100644 --- a/ooni/nettests/blocking/facebook_messenger.py +++ b/ooni/nettests/blocking/facebook_messenger.py @@ -46,7 +46,7 @@ class FacebookMessengerTest(httpt.HTTPTest, dnst.DNSTest): description = ("This test examines the reachability of Facebook " "Messenger in your network.") author = "Arturo Filastò" - version = "0.4.0" + version = "0.5.0"
requiresRoot = False requiresTor = False diff --git a/ooni/nettests/blocking/http_requests.py b/ooni/nettests/blocking/http_requests.py index 6f0276be..483f92ef 100644 --- a/ooni/nettests/blocking/http_requests.py +++ b/ooni/nettests/blocking/http_requests.py @@ -40,7 +40,7 @@ class HTTPRequestsTest(httpt.HTTPTest): description = ("Performs a HTTP GET request over Tor and one over the " "local network and compares the two results.") author = "Arturo Filastò" - version = "0.2.5" + version = "0.3.0"
usageOptions = UsageOptions
diff --git a/ooni/nettests/blocking/meek_fronted_requests.py b/ooni/nettests/blocking/meek_fronted_requests.py index 5918cd4b..f516f84e 100644 --- a/ooni/nettests/blocking/meek_fronted_requests.py +++ b/ooni/nettests/blocking/meek_fronted_requests.py @@ -30,7 +30,7 @@ class meekTest(httpt.HTTPTest): name = "Meek fronted requests test" description = "This test examines whether the domains used by Meek "\ "(a type of Tor bridge) work in your network." - version = "0.0.1" + version = "0.1.0"
usageOptions = UsageOptions inputFile = ['file', 'f', None, diff --git a/ooni/nettests/blocking/web_connectivity.py b/ooni/nettests/blocking/web_connectivity.py index e5085380..600a9e4e 100644 --- a/ooni/nettests/blocking/web_connectivity.py +++ b/ooni/nettests/blocking/web_connectivity.py @@ -48,7 +48,7 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest): "connect to the resolved IPs and then fetching the page " "and comparing all these results with those of a control.") author = "Arturo Filastò" - version = "0.2.0" + version = "0.3.0"
contentDecoders = [('gzip', GzipDecoder)]
diff --git a/ooni/nettests/blocking/whatsapp.py b/ooni/nettests/blocking/whatsapp.py index 6957e620..b7a670b4 100644 --- a/ooni/nettests/blocking/whatsapp.py +++ b/ooni/nettests/blocking/whatsapp.py @@ -264,7 +264,7 @@ class WhatsappTest(httpt.HTTPTest, dnst.DNSTest): description = ("This test examines the reachability of WhatsApp " " and WhatsApp's web interface (web.whatsapp.com) in your network.") author = "Arturo Filastò" - version = "0.5.0" + version = "0.6.0"
requiresRoot = False requiresTor = False diff --git a/ooni/nettests/experimental/domclass_collector.py b/ooni/nettests/experimental/domclass_collector.py index efd5dbc3..94dd1f66 100644 --- a/ooni/nettests/experimental/domclass_collector.py +++ b/ooni/nettests/experimental/domclass_collector.py @@ -13,7 +13,7 @@ from ooni.templates import httpt class DOMClassCollector(httpt.HTTPTest): name = "DOM class collector" author = "Arturo Filastò" - version = 0.1 + version = "0.2.0"
followRedirects = True
diff --git a/ooni/nettests/experimental/http_keyword_filtering.py b/ooni/nettests/experimental/http_keyword_filtering.py index cbf12d1e..865441e0 100644 --- a/ooni/nettests/experimental/http_keyword_filtering.py +++ b/ooni/nettests/experimental/http_keyword_filtering.py @@ -21,7 +21,7 @@ class HTTPKeywordFiltering(httpt.HTTPTest): """ name = "HTTP Keyword Filtering" author = "Arturo Filastò" - version = "0.1.1" + version = "0.2.0"
inputFile = ['file', 'f', None, 'List of keywords to use for censorship testing']
diff --git a/ooni/nettests/experimental/http_uk_mobile_networks.py b/ooni/nettests/experimental/http_uk_mobile_networks.py index 2ac0bd92..bf9a2fec 100644 --- a/ooni/nettests/experimental/http_uk_mobile_networks.py +++ b/ooni/nettests/experimental/http_uk_mobile_networks.py @@ -25,6 +25,7 @@ class HTTPUKMobileNetworksTest(httpt.HTTPTest): XXX port the knowledge from the trac ticket into this test docstring """ name = "HTTP UK mobile network redirect test" + version = "0.1.0"
usageOptions = UsageOptions
diff --git a/ooni/nettests/experimental/squid.py b/ooni/nettests/experimental/squid.py index cf976ba6..4e44091b 100644 --- a/ooni/nettests/experimental/squid.py +++ b/ooni/nettests/experimental/squid.py @@ -18,7 +18,7 @@ class SquidTest(httpt.HTTPTest): """ name = "Squid test" author = "Arturo Filastò" - version = "0.1" + version = "0.2.0"
optParameters = [['backend', 'b', 'http://ooni.nu/test/', 'Test backend to use']]
diff --git a/ooni/nettests/manipulation/captiveportal.py b/ooni/nettests/manipulation/captiveportal.py index 844e4119..8cc945e5 100644 --- a/ooni/nettests/manipulation/captiveportal.py +++ b/ooni/nettests/manipulation/captiveportal.py @@ -62,7 +62,7 @@ class CaptivePortal(httpt.HTTPTest, dnst.DNSTest):
name = "captiveportal" description = "Captive Portal Test." - version = '0.3' + version = "0.4.0" author = "Isis Lovecruft" usageOptions = UsageOptions requiresRoot = False diff --git a/ooni/nettests/manipulation/http_header_field_manipulation.py b/ooni/nettests/manipulation/http_header_field_manipulation.py index fcd5e0e1..a9c92d56 100644 --- a/ooni/nettests/manipulation/http_header_field_manipulation.py +++ b/ooni/nettests/manipulation/http_header_field_manipulation.py @@ -50,7 +50,7 @@ class HTTPHeaderFieldManipulation(httpt.HTTPTest): description = "Checks if the HTTP request the server " \ "sees is the same as the one that the client has created." author = "Arturo Filastò" - version = "0.1.5" + version = "0.2.0"
randomizeUA = False usageOptions = UsageOptions diff --git a/ooni/nettests/manipulation/http_host.py b/ooni/nettests/manipulation/http_host.py index 2e0a8e1a..40d8d355 100644 --- a/ooni/nettests/manipulation/http_host.py +++ b/ooni/nettests/manipulation/http_host.py @@ -42,7 +42,7 @@ class HTTPHost(httpt.HTTPTest): description = "Tests a variety of different filter bypassing techniques "\ "based on the HTTP Host header field." author = "Arturo Filastò" - version = "0.2.4" + version = "0.3.0"
randomizeUA = False usageOptions = UsageOptions diff --git a/ooni/nettests/scanning/http_url_list.py b/ooni/nettests/scanning/http_url_list.py index 8d268dfa..dedc7ad8 100644 --- a/ooni/nettests/scanning/http_url_list.py +++ b/ooni/nettests/scanning/http_url_list.py @@ -25,7 +25,7 @@ class HTTPURLList(httpt.HTTPTest): """ name = "HTTP URL List" author = "Arturo Filastò" - version = "0.1.3" + version = "0.2.0"
usageOptions = UsageOptions
diff --git a/ooni/nettests/third_party/psiphon.py b/ooni/nettests/third_party/psiphon.py index f4b0033a..2821e0a8 100644 --- a/ooni/nettests/third_party/psiphon.py +++ b/ooni/nettests/third_party/psiphon.py @@ -34,7 +34,7 @@ class PsiphonTest(httpt.HTTPTest, process.ProcessTest): description = ("Bootstraps Psiphon and " "does a HTTP GET for the specified URL.") author = "juga" - version = "0.1.0" + version = "0.2.0" timeout = 120 usageOptions = UsageOptions
diff --git a/ooni/templates/httpt.py b/ooni/templates/httpt.py index ab080c44..c6850384 100644 --- a/ooni/templates/httpt.py +++ b/ooni/templates/httpt.py @@ -167,19 +167,27 @@ class HTTPTest(NetTestCase): getattr(response.request, 'absoluteURI', None)): session['request']['url'] = response.request.absoluteURI
- if self.localOptions.get('withoutbody', 0) is 0: - response_body = representBody(response_body) - else: - response_body = '' + response_headers = {} + for name, value in response.headers.getAllRawHeaders(): + response_headers[name] = value[0]
- response_headers = _representHeaders(response.headers) # Attempt to redact the IP address of the probe from the responses - if (config.privacy.includeip is False and probe_ip.address is not None and - (isinstance(response_body, str) or isinstance(response_body, unicode))): - response_body = response_body.replace(probe_ip.address, "[REDACTED]") + if config.privacy.includeip is False and \ + probe_ip.address is not None: + if isinstance(response_body, (str, unicode)): + response_body = response_body.replace(probe_ip.address, "[REDACTED]") + for key, value in response_headers.items(): response_headers[key] = value.replace(probe_ip.address, "[REDACTED]") + for key, value in response_headers.items(): + response_headers[key] = representBody(value) + + if self.localOptions.get('withoutbody', 0) is 0: + response_body = representBody(response_body) + else: + response_body = '' + session['response'] = { 'headers': response_headers, 'body': response_body, diff --git a/ooni/tests/test_common.py b/ooni/tests/test_common.py index 40d3859f..c8437683 100644 --- a/ooni/tests/test_common.py +++ b/ooni/tests/test_common.py @@ -5,26 +5,16 @@ from twisted.web.client import readBody
from . import is_internet_connected
-from ooni.common.http_utils import META_CHARSET_REGEXP +from ooni.common.http_utils import representBody from ooni.common.ip_utils import is_public_ipv4_address, is_private_ipv4_address from ooni.common.txextra import FixedRedirectAgent, TrueHeadersAgent, TrueHeaders
class TestHTTPUtils(unittest.TestCase): - def test_charset_detection(self): - no_charset_html = """ - <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> -<html> -<head> - <title>Foo</title> -""" - with_charset_html = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">' - with_empty_charset = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=">' - with_two_charsets = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8;charset=utf-8">' - self.assertEqual(META_CHARSET_REGEXP.search(no_charset_html), None) - self.assertEqual(META_CHARSET_REGEXP.search(with_charset_html).group(1), 'iso-8859-1') - self.assertEqual(META_CHARSET_REGEXP.search( - with_two_charsets).group(1), 'UTF-8') - self.assertEqual(META_CHARSET_REGEXP.search(with_empty_charset), None) + def test_represent_body(self): + self.assertEqual(representBody(None), None) + self.assertEqual(representBody("spam\xcf\x83"), u'spam\u03c3') + self.assertEqual(representBody("\xff\x00"), + {'data': '/wA=', 'format': 'base64'})
class TestIPUtils(unittest.TestCase): def test_is_public_ipv4(self):