[gettor/master] Make locale parser more robust

commit 277ba71b466332b4b8bd4752a2c46c5bcb9cf71c Author: Cecylia Bocovich <cohosh@torproject.org> Date: Mon Jan 27 10:17:29 2020 -0500 Make locale parser more robust This change expands the locale parser to have the following properties: - if only the language code is given, chooses the regionalization that occurs first in the locale list (e.g., "en" --> "en-US") - if regionalization for the language is *not* present, chooses the generalized language or a different regionalization (e.g., "pt-PT" --> "pt-BR") - parses both the subject and body looking for the most specific regionalization - defaults to en-US if no available language is found --- gettor/parse/email.py | 15 ++++++++--- tests/test_email_service.py | 66 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/gettor/parse/email.py b/gettor/parse/email.py index d487684..874b1cd 100644 --- a/gettor/parse/email.py +++ b/gettor/parse/email.py @@ -116,8 +116,12 @@ class EmailParser(object): def parse_keywords(self, text, request): for word in re.split(r"\s+", text.strip()): - if word.lower() in self.locales: - request["language"] = word.lower() + for locale in self.locales: + if word.lower() == locale.lower(): + request["language"] = locale + elif (not request["language"]) and (word.lower()[:2] == + locale.lower()[:2]): + request["language"] = locale if word.lower() in self.platforms: request["command"] = "links" request["platform"] = word.lower() @@ -143,8 +147,11 @@ class EmailParser(object): subject = subject.group(1) request = self.parse_keywords(subject, request) - if not request["command"] or not request["language"]: - request = self.parse_keywords(msg_str, request) + # Always parse the body too, to see if there's more specific information + request = self.parse_keywords(msg_str, request) + + if not request["language"]: + request["language"] = "en-US" return request diff --git a/tests/test_email_service.py b/tests/test_email_service.py index 
407937c..00795c1 100644 --- a/tests/test_email_service.py +++ b/tests/test_email_service.py @@ -82,17 +82,61 @@ class EmailServiceTests(unittest.TestCase): def test_language_email_parser(self): ep = conftests.EmailParser(self.settings, "gettor@torproject.org") - ep.locales = ["en", "ru"] - request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n Subject: \r\n Reply-To: hiro@torproject.org \nTo: gettor@torproject.org\n osx en") - self.assertEqual(request["command"], "links") - self.assertEqual(request["platform"], "osx") - self.assertEqual(request["language"], "en") - - request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n Subject: \r\n Reply-To: hiro@torproject.org \nTo: gettor@torproject.org\n linux ru") - self.assertEqual(request["command"], "links") - self.assertEqual(request["platform"], "linux") - self.assertEqual(request["language"], "ru") - + ep.locales = ["en-US", "es-ES", "es-AR", "pt-BR", "fa"] + request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n osx en") + self.assertEqual(request["language"], "en-US") + + request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n osx ES") + self.assertEqual(request["language"], "es-ES") + + request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n osx en-US") + self.assertEqual(request["language"], "en-US") + + request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux fa") + self.assertEqual(request["language"], "fa") + + request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n osx es") + self.assertEqual(request["language"], 
"es-ES") + + request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux zz") + self.assertEqual(request["language"], "en-US") + + request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux pt-PT") + self.assertEqual(request["language"], "pt-BR") + + request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux es-AR") + self.assertEqual(request["language"], "es-AR") + + request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n" + "Subject: linux es\r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux es-AR") + self.assertEqual(request["language"], "es-AR") + + request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n" + "Subject: linux es\r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux") + self.assertEqual(request["language"], "es-ES") + + request = ep.parse("From: \"silvia [hiro]\" <hiro@torproject.org>\n" + "Subject: linux es-AR\r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux es") + self.assertEqual(request["language"], "es-AR") def test_sent_links_message(self): ep = self.sm_client
participants (1)
-
cohosh@torproject.org