commit 277ba71b466332b4b8bd4752a2c46c5bcb9cf71c Author: Cecylia Bocovich cohosh@torproject.org Date: Mon Jan 27 10:17:29 2020 -0500
Make locale parser more robust
This change expands the locale parser to have the following properties: - if only the language code is given, chooses the regionalization that occurs first in the locale list (e.g., "en" --> "en-US") - if regionalization for the language is *not* present, chooses the generalized language or a different regionalization (e.g. "pt-PT" --> "pt-BR") - parses both the subject and body looking for the most specific regionalization - defaults to en-US if no available language is found --- gettor/parse/email.py | 15 ++++++++--- tests/test_email_service.py | 66 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 66 insertions(+), 15 deletions(-)
diff --git a/gettor/parse/email.py b/gettor/parse/email.py index d487684..874b1cd 100644 --- a/gettor/parse/email.py +++ b/gettor/parse/email.py @@ -116,8 +116,12 @@ class EmailParser(object): def parse_keywords(self, text, request):
for word in re.split(r"\s+", text.strip()): - if word.lower() in self.locales: - request["language"] = word.lower() + for locale in self.locales: + if word.lower() == locale.lower(): + request["language"] = locale + elif (not request["language"]) and (word.lower()[:2] == + locale.lower()[:2]): + request["language"] = locale if word.lower() in self.platforms: request["command"] = "links" request["platform"] = word.lower() @@ -143,8 +147,11 @@ class EmailParser(object): subject = subject.group(1) request = self.parse_keywords(subject, request)
- if not request["command"] or not request["language"]: - request = self.parse_keywords(msg_str, request) + # Always parse the body too, to see if there's more specific information + request = self.parse_keywords(msg_str, request) + + if not request["language"]: + request["language"] = "en-US"
return request
diff --git a/tests/test_email_service.py b/tests/test_email_service.py index 407937c..00795c1 100644 --- a/tests/test_email_service.py +++ b/tests/test_email_service.py @@ -82,17 +82,61 @@ class EmailServiceTests(unittest.TestCase):
def test_language_email_parser(self): ep = conftests.EmailParser(self.settings, "gettor@torproject.org") - ep.locales = ["en", "ru"] - request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n Subject: \r\n Reply-To: hiro@torproject.org \nTo: gettor@torproject.org\n osx en") - self.assertEqual(request["command"], "links") - self.assertEqual(request["platform"], "osx") - self.assertEqual(request["language"], "en") - - request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n Subject: \r\n Reply-To: hiro@torproject.org \nTo: gettor@torproject.org\n linux ru") - self.assertEqual(request["command"], "links") - self.assertEqual(request["platform"], "linux") - self.assertEqual(request["language"], "ru") - + ep.locales = ["en-US", "es-ES", "es-AR", "pt-BR", "fa"] + request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n osx en") + self.assertEqual(request["language"], "en-US") + + request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n osx ES") + self.assertEqual(request["language"], "es-ES") + + request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n osx en-US") + self.assertEqual(request["language"], "en-US") + + request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux fa") + self.assertEqual(request["language"], "fa") + + request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n osx es") + self.assertEqual(request["language"], "es-ES") + + request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux zz") + 
self.assertEqual(request["language"], "en-US") + + request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux pt-PT") + self.assertEqual(request["language"], "pt-BR") + + request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n" + "Subject: \r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux es-AR") + self.assertEqual(request["language"], "es-AR") + + request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n" + "Subject: linux es\r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux es-AR") + self.assertEqual(request["language"], "es-AR") + + request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n" + "Subject: linux es\r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux") + self.assertEqual(request["language"], "es-ES") + + request = ep.parse("From: "silvia [hiro]" hiro@torproject.org\n" + "Subject: linux es-AR\r\n Reply-To: hiro@torproject.org \nTo:" + "gettor@torproject.org\n linux es") + self.assertEqual(request["language"], "es-AR")
def test_sent_links_message(self): ep = self.sm_client