[tor-commits] [gettor/master] Make locale parser more robust

cohosh at torproject.org cohosh at torproject.org
Fri Jan 31 14:27:36 UTC 2020


commit 277ba71b466332b4b8bd4752a2c46c5bcb9cf71c
Author: Cecylia Bocovich <cohosh at torproject.org>
Date:   Mon Jan 27 10:17:29 2020 -0500

    Make locale parser more robust
    
    This change expands the locale parser to have the following properties:
    - if only the language code is given, chooses the regionalization that
    occurs first in the locale list (e.g., "en" --> "en-US")
    - if regionalization for the language is *not* present, chooses the
    generalized language or a different regionalization (e.g., "pt-PT" -->
    "pt-BR")
    - parses both the subject and body looking for the most specific
    regionalization
    - defaults to en-US if no available language is found
---
 gettor/parse/email.py       | 15 ++++++++---
 tests/test_email_service.py | 66 +++++++++++++++++++++++++++++++++++++--------
 2 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/gettor/parse/email.py b/gettor/parse/email.py
index d487684..874b1cd 100644
--- a/gettor/parse/email.py
+++ b/gettor/parse/email.py
@@ -116,8 +116,12 @@ class EmailParser(object):
     def parse_keywords(self, text, request):
 
         for word in re.split(r"\s+", text.strip()):
-            if word.lower() in self.locales:
-                request["language"] = word.lower()
+            for locale in self.locales:
+                if word.lower() == locale.lower():
+                    request["language"] = locale
+                elif (not request["language"]) and (word.lower()[:2] ==
+                        locale.lower()[:2]):
+                    request["language"] = locale
             if word.lower() in self.platforms:
                 request["command"] = "links"
                 request["platform"] = word.lower()
@@ -143,8 +147,11 @@ class EmailParser(object):
             subject = subject.group(1)
             request = self.parse_keywords(subject, request)
 
-        if not request["command"] or not request["language"]:
-            request = self.parse_keywords(msg_str, request)
+        # Always parse the body too, to see if there's more specific information
+        request = self.parse_keywords(msg_str, request)
+
+        if not request["language"]:
+            request["language"] = "en-US"
 
         return request
 
diff --git a/tests/test_email_service.py b/tests/test_email_service.py
index 407937c..00795c1 100644
--- a/tests/test_email_service.py
+++ b/tests/test_email_service.py
@@ -82,17 +82,61 @@ class EmailServiceTests(unittest.TestCase):
 
     def test_language_email_parser(self):
         ep = conftests.EmailParser(self.settings, "gettor at torproject.org")
-        ep.locales = ["en", "ru"]
-        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n Subject: \r\n Reply-To: hiro at torproject.org \nTo: gettor at torproject.org\n osx en")
-        self.assertEqual(request["command"], "links")
-        self.assertEqual(request["platform"], "osx")
-        self.assertEqual(request["language"], "en")
-
-        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n Subject: \r\n Reply-To: hiro at torproject.org \nTo: gettor at torproject.org\n linux ru")
-        self.assertEqual(request["command"], "links")
-        self.assertEqual(request["platform"], "linux")
-        self.assertEqual(request["language"], "ru")
-
+        ep.locales = ["en-US", "es-ES", "es-AR", "pt-BR", "fa"]
+        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+                "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+                "gettor at torproject.org\n osx en")
+        self.assertEqual(request["language"], "en-US")
+
+        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+                "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+                "gettor at torproject.org\n osx ES")
+        self.assertEqual(request["language"], "es-ES")
+
+        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+                "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+                "gettor at torproject.org\n osx en-US")
+        self.assertEqual(request["language"], "en-US")
+
+        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+                "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+                "gettor at torproject.org\n linux fa")
+        self.assertEqual(request["language"], "fa")
+
+        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+                "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+                "gettor at torproject.org\n osx es")
+        self.assertEqual(request["language"], "es-ES")
+
+        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+                "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+                "gettor at torproject.org\n linux zz")
+        self.assertEqual(request["language"], "en-US")
+
+        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+                "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+                "gettor at torproject.org\n linux pt-PT")
+        self.assertEqual(request["language"], "pt-BR")
+
+        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+                "Subject: \r\n Reply-To: hiro at torproject.org \nTo:"
+                "gettor at torproject.org\n linux es-AR")
+        self.assertEqual(request["language"], "es-AR")
+
+        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+                "Subject: linux es\r\n Reply-To: hiro at torproject.org \nTo:"
+                "gettor at torproject.org\n linux es-AR")
+        self.assertEqual(request["language"], "es-AR")
+
+        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+                "Subject: linux es\r\n Reply-To: hiro at torproject.org \nTo:"
+                "gettor at torproject.org\n linux")
+        self.assertEqual(request["language"], "es-ES")
+
+        request = ep.parse("From: \"silvia [hiro]\" <hiro at torproject.org>\n"
+                "Subject: linux es-AR\r\n Reply-To: hiro at torproject.org \nTo:"
+                "gettor at torproject.org\n linux es")
+        self.assertEqual(request["language"], "es-AR")
 
     def test_sent_links_message(self):
         ep = self.sm_client



More information about the tor-commits mailing list