commit 37564aa2876ba18516bb28e0e8eec7b489eea1c3 Author: Arturo Filastò arturo@filasto.net Date: Thu Apr 14 17:16:46 2016 +0200
Add support for detecting the charset of the HTML response body via the meta tag --- ooni/templates/httpt.py | 39 ++++++++++++++++++++++++++------------- ooni/tests/test_templates.py | 10 ++++++++++ 2 files changed, 36 insertions(+), 13 deletions(-)
diff --git a/ooni/templates/httpt.py b/ooni/templates/httpt.py index 6ca486a..edab3fa 100644 --- a/ooni/templates/httpt.py +++ b/ooni/templates/httpt.py @@ -1,3 +1,4 @@ +import re import random
from twisted.internet import defer @@ -16,6 +17,7 @@ from ooni.utils.net import BodyReceiver, StringProducer, userAgents from ooni.utils.trueheaders import TrueHeaders from ooni.errors import handleAllFailures
+# Captures the charset declared in an HTML <meta> tag, for example the
+# "iso-8859-1" in
+#   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+# The negative lookahead skips tags that start with a name= or value=
+# attribute. HTML tag and attribute names are case-insensitive, so the
+# pattern is compiled with re.IGNORECASE. A raw string keeps the regex
+# escapes (\s) and the escaped quotes intact.
+META_CHARSET_REGEXP = re.compile(
+    r'<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>]*)',
+    re.IGNORECASE)
class InvalidSocksProxyOption(Exception): pass @@ -35,6 +37,30 @@ class StreamListener(StreamListenerMixin): except: log.err("Tor Exit ip detection failed")
+
+
+def _representBody(body):
+    """
+    Return a representation of an HTTP response body for the report.
+
+    NUL bytes are stripped, then the body is decoded with the first
+    charset that works: the one declared in an HTML <meta> tag (if any),
+    then 'ascii', then 'utf-8'.
+
+    :param body: the raw response body as a byte string.
+    :returns: the decoded unicode body, or the result of base64Dict(body)
+        when none of the candidate charsets can decode it.
+    """
+    # XXX perhaps add support for decoding gzip in the future.
+    body = body.replace('\0', '')
+    charsets = ['ascii', 'utf-8']
+
+    # If we are able to detect the charset of body from the meta tag
+    # try to decode using that one first.
+    # The search goes through re.search() because a compiled pattern's
+    # search() takes (string, pos, endpos): handing it re.IGNORECASE would
+    # only move the start offset and leave the match case-sensitive.
+    charset = re.search(META_CHARSET_REGEXP.pattern, body, re.IGNORECASE)
+    if charset and charset.group(1):
+        charsets.insert(0, charset.group(1))
+
+    for encoding in charsets:
+        try:
+            return unicode(body, encoding)
+        except (UnicodeDecodeError, LookupError):
+            # LookupError: the page declared a charset name that Python
+            # does not know; fall through to the next candidate.
+            continue
+    return base64Dict(body)
+
 class HTTPTest(NetTestCase):
     """
     A utility class for dealing with HTTP based testing. It provides methods to
@@ -128,19 +154,6 @@ class HTTPTest(NetTestCase):
                 represented_headers[name] = value[0]
             return represented_headers
- def _representBody(body): - # XXX perhaps add support for decoding gzip in the future. - try: - body = unicode(body, 'ascii') - body = body.replace('\0', '') - except UnicodeDecodeError: - try: - body = unicode(body, 'utf-8') - body = body.replace('\0', '') - except UnicodeDecodeError: - body = base64Dict(body) - return body - log.debug("Adding %s to report" % request) request_headers = TrueHeaders(request['headers']) session = { diff --git a/ooni/tests/test_templates.py b/ooni/tests/test_templates.py index bf05e56..931e052 100644 --- a/ooni/tests/test_templates.py +++ b/ooni/tests/test_templates.py @@ -44,6 +44,16 @@ class TestHTTPT(unittest.TestCase): yield self.assertFailure(http_test.doRequest('http://invaliddomain/'), DNSLookupError) assert http_test.report['requests'][0]['failure'] == 'dns_lookup_error'
+    def test_charset_detection(self):
+        """
+        META_CHARSET_REGEXP finds nothing in HTML that has no meta charset
+        and captures the declared charset when a meta tag provides one.
+        """
+        html_without_charset = """
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html>
+<head>
+ <title>Foo</title>
+"""
+        meta_tag = '\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
+        html_with_charset = html_without_charset + meta_tag
+
+        # No <meta> tag at all: the pattern must not match.
+        missing = httpt.META_CHARSET_REGEXP.search(html_without_charset)
+        self.assertEqual(missing, None)
+
+        # With the tag appended, group 1 holds the declared charset.
+        found = httpt.META_CHARSET_REGEXP.search(html_with_charset)
+        self.assertEqual(found.group(1), 'iso-8859-1')
class TestDNST(unittest.TestCase): def test_represent_answer_a(self):