[ooni-probe/master] Use the same logic for encoding header values and keys as measurement… (#739)

art at torproject.org art at torproject.org
Fri Sep 22 18:41:07 UTC 2017


commit 582f85501a072dfd8f2f6ebbdb28dd615a8db325
Author: Arturo Filastò <arturo at filasto.net>
Date:   Tue Apr 4 16:48:07 2017 +0000

    Use the same logic for encoding header values and keys as measurement… (#739)
    
    * Use the same logic for encoding header values and keys as measurement-kit
    
    * Fix edge case that led us to skip stripping of response headers
     when the response body was null.
    
    * Bump the version of all the tests that depend on the httpt test template
    
    This will allow us to distinguish when the fix for the body length is
    rolled out
---
 ooni/common/http_utils.py                          | 30 +++-------------------
 ooni/nettests/blocking/facebook_messenger.py       |  2 +-
 ooni/nettests/blocking/http_requests.py            |  2 +-
 ooni/nettests/blocking/meek_fronted_requests.py    |  2 +-
 ooni/nettests/blocking/web_connectivity.py         |  2 +-
 ooni/nettests/blocking/whatsapp.py                 |  2 +-
 ooni/nettests/experimental/domclass_collector.py   |  2 +-
 .../experimental/http_keyword_filtering.py         |  2 +-
 .../experimental/http_uk_mobile_networks.py        |  1 +
 ooni/nettests/experimental/squid.py                |  2 +-
 ooni/nettests/manipulation/captiveportal.py        |  2 +-
 .../manipulation/http_header_field_manipulation.py |  2 +-
 ooni/nettests/manipulation/http_host.py            |  2 +-
 ooni/nettests/scanning/http_url_list.py            |  2 +-
 ooni/nettests/third_party/psiphon.py               |  2 +-
 ooni/templates/httpt.py                            | 24 +++++++++++------
 ooni/tests/test_common.py                          | 22 +++++-----------
 17 files changed, 39 insertions(+), 64 deletions(-)

diff --git a/ooni/common/http_utils.py b/ooni/common/http_utils.py
index 57c3a15c..b44f5c0b 100644
--- a/ooni/common/http_utils.py
+++ b/ooni/common/http_utils.py
@@ -1,37 +1,13 @@
 import re
-import codecs
 from base64 import b64encode
 
-META_CHARSET_REGEXP = re.compile('<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>!;]+)')
 
 def representBody(body):
     if not body:
         return body
-    # XXX perhaps add support for decoding gzip in the future.
-    body = body.replace('\0', '')
-    decoded = False
-    charsets = ['ascii', 'utf-8']
-
-    # If we are able to detect the charset of body from the meta tag
-    # try to decode using that one first
-    charset = META_CHARSET_REGEXP.search(body, re.IGNORECASE)
-    if charset:
-        try:
-            encoding = charset.group(1).lower()
-            codecs.lookup(encoding)
-            charsets.insert(0, encoding)
-        except (LookupError, IndexError):
-            # Skip invalid codecs and partial regexp match
-            pass
-
-    for encoding in charsets:
-        try:
-            body = unicode(body, encoding)
-            decoded = True
-            break
-        except UnicodeDecodeError:
-            pass
-    if not decoded:
+    try:
+        body = unicode(body, 'utf-8')
+    except UnicodeDecodeError:
         body = {
             'data': b64encode(body),
             'format': 'base64'
diff --git a/ooni/nettests/blocking/facebook_messenger.py b/ooni/nettests/blocking/facebook_messenger.py
index 651fca0d..9d711fd8 100644
--- a/ooni/nettests/blocking/facebook_messenger.py
+++ b/ooni/nettests/blocking/facebook_messenger.py
@@ -46,7 +46,7 @@ class FacebookMessengerTest(httpt.HTTPTest, dnst.DNSTest):
     description = ("This test examines the reachability of Facebook "
                    "Messenger in your network.")
     author = "Arturo Filastò"
-    version = "0.4.0"
+    version = "0.5.0"
 
     requiresRoot = False
     requiresTor = False
diff --git a/ooni/nettests/blocking/http_requests.py b/ooni/nettests/blocking/http_requests.py
index 6f0276be..483f92ef 100644
--- a/ooni/nettests/blocking/http_requests.py
+++ b/ooni/nettests/blocking/http_requests.py
@@ -40,7 +40,7 @@ class HTTPRequestsTest(httpt.HTTPTest):
     description = ("Performs a HTTP GET request over Tor and one over the "
                   "local network and compares the two results.")
     author = "Arturo Filastò"
-    version = "0.2.5"
+    version = "0.3.0"
 
     usageOptions = UsageOptions
 
diff --git a/ooni/nettests/blocking/meek_fronted_requests.py b/ooni/nettests/blocking/meek_fronted_requests.py
index 5918cd4b..f516f84e 100644
--- a/ooni/nettests/blocking/meek_fronted_requests.py
+++ b/ooni/nettests/blocking/meek_fronted_requests.py
@@ -30,7 +30,7 @@ class meekTest(httpt.HTTPTest):
     name = "Meek fronted requests test"
     description = "This test examines whether the domains used by Meek "\
                   "(a type of Tor bridge) work in your network."
-    version = "0.0.1"
+    version = "0.1.0"
 
     usageOptions = UsageOptions
     inputFile = ['file', 'f', None,
diff --git a/ooni/nettests/blocking/web_connectivity.py b/ooni/nettests/blocking/web_connectivity.py
index e5085380..600a9e4e 100644
--- a/ooni/nettests/blocking/web_connectivity.py
+++ b/ooni/nettests/blocking/web_connectivity.py
@@ -48,7 +48,7 @@ class WebConnectivityTest(httpt.HTTPTest, dnst.DNSTest):
                    "connect to the resolved IPs and then fetching the page "
                    "and comparing all these results with those of a control.")
     author = "Arturo Filastò"
-    version = "0.2.0"
+    version = "0.3.0"
 
     contentDecoders = [('gzip', GzipDecoder)]
 
diff --git a/ooni/nettests/blocking/whatsapp.py b/ooni/nettests/blocking/whatsapp.py
index 6957e620..b7a670b4 100644
--- a/ooni/nettests/blocking/whatsapp.py
+++ b/ooni/nettests/blocking/whatsapp.py
@@ -264,7 +264,7 @@ class WhatsappTest(httpt.HTTPTest, dnst.DNSTest):
     description = ("This test examines the reachability of WhatsApp "
                    " and WhatsApp's web interface (web.whatsapp.com) in your network.")
     author = "Arturo Filastò"
-    version = "0.5.0"
+    version = "0.6.0"
 
     requiresRoot = False
     requiresTor = False
diff --git a/ooni/nettests/experimental/domclass_collector.py b/ooni/nettests/experimental/domclass_collector.py
index efd5dbc3..94dd1f66 100644
--- a/ooni/nettests/experimental/domclass_collector.py
+++ b/ooni/nettests/experimental/domclass_collector.py
@@ -13,7 +13,7 @@ from ooni.templates import httpt
 class DOMClassCollector(httpt.HTTPTest):
     name = "DOM class collector"
     author = "Arturo Filastò"
-    version = 0.1
+    version = "0.2.0"
 
     followRedirects = True
 
diff --git a/ooni/nettests/experimental/http_keyword_filtering.py b/ooni/nettests/experimental/http_keyword_filtering.py
index cbf12d1e..865441e0 100644
--- a/ooni/nettests/experimental/http_keyword_filtering.py
+++ b/ooni/nettests/experimental/http_keyword_filtering.py
@@ -21,7 +21,7 @@ class HTTPKeywordFiltering(httpt.HTTPTest):
     """
     name = "HTTP Keyword Filtering"
     author = "Arturo Filastò"
-    version = "0.1.1"
+    version = "0.2.0"
 
     inputFile = ['file', 'f', None, 'List of keywords to use for censorship testing']
 
diff --git a/ooni/nettests/experimental/http_uk_mobile_networks.py b/ooni/nettests/experimental/http_uk_mobile_networks.py
index 2ac0bd92..bf9a2fec 100644
--- a/ooni/nettests/experimental/http_uk_mobile_networks.py
+++ b/ooni/nettests/experimental/http_uk_mobile_networks.py
@@ -25,6 +25,7 @@ class HTTPUKMobileNetworksTest(httpt.HTTPTest):
     XXX port the knowledge from the trac ticket into this test docstring
     """
     name = "HTTP UK mobile network redirect test"
+    version = "0.1.0"
 
     usageOptions = UsageOptions
 
diff --git a/ooni/nettests/experimental/squid.py b/ooni/nettests/experimental/squid.py
index cf976ba6..4e44091b 100644
--- a/ooni/nettests/experimental/squid.py
+++ b/ooni/nettests/experimental/squid.py
@@ -18,7 +18,7 @@ class SquidTest(httpt.HTTPTest):
     """
     name = "Squid test"
     author = "Arturo Filastò"
-    version = "0.1"
+    version = "0.2.0"
 
     optParameters = [['backend', 'b', 'http://ooni.nu/test/', 'Test backend to use']]
 
diff --git a/ooni/nettests/manipulation/captiveportal.py b/ooni/nettests/manipulation/captiveportal.py
index 844e4119..8cc945e5 100644
--- a/ooni/nettests/manipulation/captiveportal.py
+++ b/ooni/nettests/manipulation/captiveportal.py
@@ -62,7 +62,7 @@ class CaptivePortal(httpt.HTTPTest, dnst.DNSTest):
 
     name = "captiveportal"
     description = "Captive Portal Test."
-    version = '0.3'
+    version = "0.4.0"
     author = "Isis Lovecruft"
     usageOptions = UsageOptions
     requiresRoot = False
diff --git a/ooni/nettests/manipulation/http_header_field_manipulation.py b/ooni/nettests/manipulation/http_header_field_manipulation.py
index fcd5e0e1..a9c92d56 100644
--- a/ooni/nettests/manipulation/http_header_field_manipulation.py
+++ b/ooni/nettests/manipulation/http_header_field_manipulation.py
@@ -50,7 +50,7 @@ class HTTPHeaderFieldManipulation(httpt.HTTPTest):
     description = "Checks if the HTTP request the server " \
                   "sees is the same as the one that the client has created."
     author = "Arturo Filastò"
-    version = "0.1.5"
+    version = "0.2.0"
 
     randomizeUA = False
     usageOptions = UsageOptions
diff --git a/ooni/nettests/manipulation/http_host.py b/ooni/nettests/manipulation/http_host.py
index 2e0a8e1a..40d8d355 100644
--- a/ooni/nettests/manipulation/http_host.py
+++ b/ooni/nettests/manipulation/http_host.py
@@ -42,7 +42,7 @@ class HTTPHost(httpt.HTTPTest):
     description = "Tests a variety of different filter bypassing techniques "\
                   "based on the HTTP Host header field."
     author = "Arturo Filastò"
-    version = "0.2.4"
+    version = "0.3.0"
 
     randomizeUA = False
     usageOptions = UsageOptions
diff --git a/ooni/nettests/scanning/http_url_list.py b/ooni/nettests/scanning/http_url_list.py
index 8d268dfa..dedc7ad8 100644
--- a/ooni/nettests/scanning/http_url_list.py
+++ b/ooni/nettests/scanning/http_url_list.py
@@ -25,7 +25,7 @@ class HTTPURLList(httpt.HTTPTest):
     """
     name = "HTTP URL List"
     author = "Arturo Filastò"
-    version = "0.1.3"
+    version = "0.2.0"
 
     usageOptions = UsageOptions
 
diff --git a/ooni/nettests/third_party/psiphon.py b/ooni/nettests/third_party/psiphon.py
index f4b0033a..2821e0a8 100644
--- a/ooni/nettests/third_party/psiphon.py
+++ b/ooni/nettests/third_party/psiphon.py
@@ -34,7 +34,7 @@ class PsiphonTest(httpt.HTTPTest,  process.ProcessTest):
     description = ("Bootstraps Psiphon and "
                    "does a HTTP GET for the specified URL.")
     author = "juga"
-    version = "0.1.0"
+    version = "0.2.0"
     timeout = 120
     usageOptions = UsageOptions
 
diff --git a/ooni/templates/httpt.py b/ooni/templates/httpt.py
index ab080c44..c6850384 100644
--- a/ooni/templates/httpt.py
+++ b/ooni/templates/httpt.py
@@ -167,19 +167,27 @@ class HTTPTest(NetTestCase):
                     getattr(response.request, 'absoluteURI', None)):
                 session['request']['url'] = response.request.absoluteURI
 
-            if self.localOptions.get('withoutbody', 0) is 0:
-                response_body = representBody(response_body)
-            else:
-                response_body = ''
+            response_headers = {}
+            for name, value in response.headers.getAllRawHeaders():
+                response_headers[name] = value[0]
 
-            response_headers = _representHeaders(response.headers)
             # Attempt to redact the IP address of the probe from the responses
-            if (config.privacy.includeip is False and probe_ip.address is not None and
-                    (isinstance(response_body, str) or isinstance(response_body, unicode))):
-                response_body = response_body.replace(probe_ip.address, "[REDACTED]")
+            if config.privacy.includeip is False and \
+                            probe_ip.address is not None:
+                if isinstance(response_body, (str, unicode)):
+                    response_body = response_body.replace(probe_ip.address, "[REDACTED]")
+
                 for key, value in response_headers.items():
                     response_headers[key] = value.replace(probe_ip.address,
                                                           "[REDACTED]")
+            for key, value in response_headers.items():
+                response_headers[key] = representBody(value)
+
+            if self.localOptions.get('withoutbody', 0) is 0:
+                response_body = representBody(response_body)
+            else:
+                response_body = ''
+
             session['response'] = {
                 'headers': response_headers,
                 'body': response_body,
diff --git a/ooni/tests/test_common.py b/ooni/tests/test_common.py
index 40d3859f..c8437683 100644
--- a/ooni/tests/test_common.py
+++ b/ooni/tests/test_common.py
@@ -5,26 +5,16 @@ from twisted.web.client import readBody
 
 from . import is_internet_connected
 
-from ooni.common.http_utils import META_CHARSET_REGEXP
+from ooni.common.http_utils import representBody
 from ooni.common.ip_utils import is_public_ipv4_address, is_private_ipv4_address
 from ooni.common.txextra import FixedRedirectAgent, TrueHeadersAgent, TrueHeaders
 
 class TestHTTPUtils(unittest.TestCase):
-    def test_charset_detection(self):
-        no_charset_html = """
-        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
-<html>
-<head>
-        <title>Foo</title>
-"""
-        with_charset_html = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
-        with_empty_charset = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=">'
-        with_two_charsets = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8;charset=utf-8">'
-        self.assertEqual(META_CHARSET_REGEXP.search(no_charset_html), None)
-        self.assertEqual(META_CHARSET_REGEXP.search(with_charset_html).group(1), 'iso-8859-1')
-        self.assertEqual(META_CHARSET_REGEXP.search(
-            with_two_charsets).group(1), 'UTF-8')
-        self.assertEqual(META_CHARSET_REGEXP.search(with_empty_charset), None)
+    def test_represent_body(self):
+        self.assertEqual(representBody(None), None)
+        self.assertEqual(representBody("spam\xcf\x83"), u'spam\u03c3')
+        self.assertEqual(representBody("\xff\x00"),
+                         {'data': '/wA=', 'format': 'base64'})
 
 class TestIPUtils(unittest.TestCase):
     def test_is_public_ipv4(self):





More information about the tor-commits mailing list