[tor-commits] [ooni-probe/master] Improve charset detection regexp

art at torproject.org art at torproject.org
Mon May 30 16:28:33 UTC 2016


commit 55cd85351c02d7a7333f1c39e9993820b05b2b6d
Author: Arturo Filastò <arturo at filasto.net>
Date:   Sun May 8 19:08:22 2016 +0200

    Improve charset detection regexp
---
 ooni/templates/httpt.py      | 12 ++++++++++--
 ooni/tests/test_templates.py |  3 +++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/ooni/templates/httpt.py b/ooni/templates/httpt.py
index 51fba1a..6b4c4b9 100644
--- a/ooni/templates/httpt.py
+++ b/ooni/templates/httpt.py
@@ -1,4 +1,5 @@
 import re
+import codecs
 import random
 
 from txtorcon.interface import StreamListenerMixin
@@ -17,7 +18,7 @@ from ooni.utils.net import StringProducer, userAgents
 from ooni.utils.trueheaders import TrueHeaders
 from ooni.errors import handleAllFailures
 
-META_CHARSET_REGEXP = re.compile('<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>]+)')
+META_CHARSET_REGEXP = re.compile('<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"\']*([^\s"\'/>!;]+)')
 
 class InvalidSocksProxyOption(Exception):
     pass
@@ -56,7 +57,14 @@ def _representBody(body):
     # try to decode using that one first
     charset = META_CHARSET_REGEXP.search(body, re.IGNORECASE)
     if charset:
-        charsets.insert(0, charset.group(1))
+        try:
+            encoding = charset.group(1).lower()
+            codecs.lookup(encoding)
+            charsets.insert(0, encoding)
+        except (LookupError, IndexError):
+            # Skip invalid codecs and partial regexp match
+            pass
+
     for encoding in charsets:
         try:
             body = unicode(body, encoding)
diff --git a/ooni/tests/test_templates.py b/ooni/tests/test_templates.py
index e8fe636..ebd5b2e 100644
--- a/ooni/tests/test_templates.py
+++ b/ooni/tests/test_templates.py
@@ -55,8 +55,11 @@ class TestHTTPT(unittest.TestCase):
 """
         with_charset_html = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
         with_empty_charset = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=">'
+        with_two_charsets = no_charset_html + '\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8;charset=utf-8">'
         self.assertEqual(httpt.META_CHARSET_REGEXP.search(no_charset_html), None)
         self.assertEqual(httpt.META_CHARSET_REGEXP.search(with_charset_html).group(1), 'iso-8859-1')
+        self.assertEqual(httpt.META_CHARSET_REGEXP.search(
+            with_two_charsets).group(1), 'UTF-8')
         self.assertEqual(httpt.META_CHARSET_REGEXP.search(with_empty_charset), None)
 
 class TestDNST(unittest.TestCase):





More information about the tor-commits mailing list