commit 55cd85351c02d7a7333f1c39e9993820b05b2b6d
Author: Arturo Filastò <arturo@filasto.net>
Date:   Sun May 8 19:08:22 2016 +0200

    Improve charset detection regexp
---
 ooni/templates/httpt.py      | 12 ++++++++++--
 ooni/tests/test_templates.py |  3 +++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/ooni/templates/httpt.py b/ooni/templates/httpt.py index 51fba1a..6b4c4b9 100644 --- a/ooni/templates/httpt.py +++ b/ooni/templates/httpt.py @@ -1,4 +1,5 @@ import re +import codecs import random
from txtorcon.interface import StreamListenerMixin @@ -17,7 +18,7 @@ from ooni.utils.net import StringProducer, userAgents from ooni.utils.trueheaders import TrueHeaders from ooni.errors import handleAllFailures
-META_CHARSET_REGEXP = re.compile('<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>]+)')
+META_CHARSET_REGEXP = re.compile('<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>!;]+)')
class InvalidSocksProxyOption(Exception): pass @@ -56,7 +57,14 @@ def _representBody(body): # try to decode using that one first charset = META_CHARSET_REGEXP.search(body, re.IGNORECASE) if charset: - charsets.insert(0, charset.group(1)) + try: + encoding = charset.group(1).lower() + codecs.lookup(encoding) + charsets.insert(0, encoding) + except (LookupError, IndexError): + # Skip invalid codecs and partial regexp match + pass + for encoding in charsets: try: body = unicode(body, encoding) diff --git a/ooni/tests/test_templates.py b/ooni/tests/test_templates.py index e8fe636..ebd5b2e 100644 --- a/ooni/tests/test_templates.py +++ b/ooni/tests/test_templates.py @@ -55,8 +55,11 @@ class TestHTTPT(unittest.TestCase): """ with_charset_html = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">' with_empty_charset = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=">' + with_two_charsets = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8;charset=utf-8">' self.assertEqual(httpt.META_CHARSET_REGEXP.search(no_charset_html), None) self.assertEqual(httpt.META_CHARSET_REGEXP.search(with_charset_html).group(1), 'iso-8859-1') + self.assertEqual(httpt.META_CHARSET_REGEXP.search( + with_two_charsets).group(1), 'UTF-8') self.assertEqual(httpt.META_CHARSET_REGEXP.search(with_empty_charset), None)
class TestDNST(unittest.TestCase):