[ooni-probe/master] Add support for detecting the charset of the HTML response body via the meta tag

29 Apr 2016

commit 37564aa2876ba18516bb28e0e8eec7b489eea1c3
Author: Arturo Filastò <arturo@filasto.net>
Date:   Thu Apr 14 17:16:46 2016 +0200

    Add support for detecting the charset of the HTML response body via the meta tag
---
 ooni/templates/httpt.py      | 39 ++++++++++++++++++++++++++-------------
 ooni/tests/test_templates.py | 10 ++++++++++
 2 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/ooni/templates/httpt.py b/ooni/templates/httpt.py
index 6ca486a..edab3fa 100644
--- a/ooni/templates/httpt.py
+++ b/ooni/templates/httpt.py
@@ -1,3 +1,4 @@
+import re
 import random
 
 from twisted.internet import defer
@@ -16,6 +17,7 @@ from ooni.utils.net import BodyReceiver, StringProducer, userAgents
 from ooni.utils.trueheaders import TrueHeaders
 from ooni.errors import handleAllFailures
 
+META_CHARSET_REGEXP = re.compile(r'<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>]*)', re.IGNORECASE)
 
 class InvalidSocksProxyOption(Exception):
     pass
@@ -35,6 +37,30 @@ class StreamListener(StreamListenerMixin):
         except:
             log.err("Tor Exit ip detection failed")
 
+
+
+def _representBody(body):
+    """Decode body to unicode (meta charset, ascii, utf-8) or base64Dict it."""
+    # XXX perhaps add support for decoding gzip in the future.
+    body = body.replace('\0', '')
+    charsets = ['ascii', 'utf-8']
+
+    # A compiled pattern's search() takes (string, pos), not flags, so apply
+    # IGNORECASE through re.search on the pattern source instead.
+    charset = re.search(META_CHARSET_REGEXP.pattern, body, re.IGNORECASE)
+    if charset:
+        # Try the charset declared in the meta tag before the defaults.
+        charsets.insert(0, charset.group(1))
+    for encoding in charsets:
+        try:
+            return unicode(body, encoding)
+        except (UnicodeDecodeError, LookupError):
+            # LookupError: the page declared a codec name Python does not
+            # know; fall through to the next candidate.
+            continue
+    # No candidate decoded cleanly: fall back to base64Dict(body).
+    return base64Dict(body)
+
 class HTTPTest(NetTestCase):
     """
     A utility class for dealing with HTTP based testing. It provides methods to
@@ -128,19 +154,6 @@ class HTTPTest(NetTestCase):
                 represented_headers[name] = value[0]
             return represented_headers
 
-        def _representBody(body):
-            # XXX perhaps add support for decoding gzip in the future.
-            try:
-                body = unicode(body, 'ascii')
-                body = body.replace('\0', '')
-            except UnicodeDecodeError:
-                try:
-                    body = unicode(body, 'utf-8')
-                    body = body.replace('\0', '')
-                except UnicodeDecodeError:
-                    body = base64Dict(body)
-            return body
-
         log.debug("Adding %s to report" % request)
         request_headers = TrueHeaders(request['headers'])
         session = {
diff --git a/ooni/tests/test_templates.py b/ooni/tests/test_templates.py
index bf05e56..931e052 100644
--- a/ooni/tests/test_templates.py
+++ b/ooni/tests/test_templates.py
@@ -44,6 +44,16 @@ class TestHTTPT(unittest.TestCase):
         yield self.assertFailure(http_test.doRequest('http://invaliddomain/'), DNSLookupError)
         assert http_test.report['requests'][0]['failure'] == 'dns_lookup_error'
 
+    def test_charset_detection(self):
+        bare_html = """
+        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html>
+<head>
+        <title>Foo</title>
+"""
+        meta_tag = '\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
+        self.assertEqual(httpt.META_CHARSET_REGEXP.search(bare_html), None)  # no meta tag: no match
+        self.assertEqual(httpt.META_CHARSET_REGEXP.search(bare_html + meta_tag).group(1), 'iso-8859-1')
 
 class TestDNST(unittest.TestCase):
     def test_represent_answer_a(self):

    

art＠torproject.org

tags

participants (1)