[tor-commits] [bridgedb/master] Fix the parsing of Accept-Language to actually support fallbacks.

isis at torproject.org isis at torproject.org
Sun Jan 12 06:06:31 UTC 2014


commit e1ae7a0ac0b5a8363fce07732d1b0e685547e582
Author: Isis Lovecruft <isis at torproject.org>
Date:   Tue Oct 29 05:45:28 2013 +0000

    Fix the parsing of Accept-Language to actually support fallbacks.
    
    All right. Going through the most significant bugs in this function:
    
     * The line from the original function:
         ``langs = request.getHeader('accept-language').split(',')``
       getHeader() returns ``None`` if the header isn't present, so this results
       in a TypeError on the split().
    
     * The line from the original function:
         ``langs = filter(lambda x: re.match('^[a-z\-]{1,5}', x), langs)``
       This chucks locales with capital letters, and doesn't do much at all to
       ensure that we're actually getting a well-formed header, all at the
       expense of a regex call (rather expensive; they're slow in Python).
    
     * These lines from the original:
           # add fallback languages
           langs_only = filter(lambda x: '-' in x, langs)
           langs.extend(map(lambda x: x.split('-')[0], langs_only))
       If my 'Accept-Language' header starts with 'en-GB,en-US;q=0.92[…]', then
       this would add ['en','en'] to the end of my header, without even checking
       if I already have 'en'. Instead, we should check if 'en' is already there,
       and iff not, then add it *after the other English headers*. Not after
       Mandarin, Japanese, Arabic, Russian, and the other slew of languages that I
       half-assedly learned at some point.
    
     * These lines from the original:
           # gettext wants _, not -
           map(lambda x: x.replace('-', '_'), langs)
       Great. A pretty mapping. Good thing the returned values weren't saved as
       anything; otherwise they might have been useful!
    
     * Lastly, the way that languages, once parsed, were added to gettext would
       raise UnhandledErrors, *and* it didn't even add the fallbacks correctly,
       meaning we could only get one language at a time.
---
 lib/bridgedb/HTTPServer.py    |   62 +++++++++++++++++++------------
 lib/bridgedb/parse/headers.py |   82 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+), 24 deletions(-)

diff --git a/lib/bridgedb/HTTPServer.py b/lib/bridgedb/HTTPServer.py
index ad76568..ca850d5 100644
--- a/lib/bridgedb/HTTPServer.py
+++ b/lib/bridgedb/HTTPServer.py
@@ -30,6 +30,7 @@ from bridgedb.Raptcha import Raptcha
 from bridgedb.Filters import filterBridgesByIP6, filterBridgesByIP4
 from bridgedb.Filters import filterBridgesByTransport
 from bridgedb.Filters import filterBridgesByNotBlockedIn
+from bridgedb.parse import headers
 from ipaddr import IPv4Address, IPv6Address
 from random import randint
 from mako.template import Template
@@ -328,29 +329,42 @@ def getAssumedChosenLang(langs):
     return lang
 
 def setLocaleFromRequestHeader(request):
+    """Retrieve the languages from the accept-language header and install them.
+
+    Parse the languages in the header, and attempt to install the first one in
+    the list. If that fails, we receive a :class:`gettext.NullTranslations`
+    object; if it worked, then we have a :class:`gettext.GNUTranslations`
+    object. Whichever one we end up with, get the other languages and add
+    them as fallbacks to the first. Lastly, install this chain of
+    translations.
+
+    :type request: :class:`twisted.web.server.Request`
+    :param request: An incoming request from a client.
+    :rtype: list
+    :returns: All requested languages.
     """
-    Retrieve the languages from the accept-language header and insall
-
-    Parse the languages in the header, if any of them contain locales then
-    add their languages to the list, also. Then install all of them using
-    gettext, it will choose the best one.
-
-    :param request twisted.web.server.Request: Incoming request
-    :returns list: All requested languages
-    """
-    langs = request.getHeader('accept-language').split(',')
-    logging.debug("Accept-Language: %s" % langs)
-    localedir=os.path.join(os.path.dirname(__file__), 'i18n/')
-
-    if langs:
-        langs = filter(lambda x: re.match('^[a-z\-]{1,5}', x), langs)
-        logging.debug("Languages: %s" % langs)
-        # add fallback languages
-        langs_only = filter(lambda x: '-' in x, langs)
-        langs.extend(map(lambda x: x.split('-')[0], langs_only))
-        # gettext wants _, not -
-        map(lambda x: x.replace('-', '_'), langs)
-        lang = gettext.translation("bridgedb", localedir=localedir,
-                 languages=langs, fallback=True)
-        lang.install(True)
+    logging.debug("Getting client 'Accept-Language' header...")
+    header = request.getHeader('accept-language')
+
+    if header is None:
+        logging.debug("Client sent no 'Accept-Language' header. Using fallback.")
+        header = 'en,en-US'
+
+    localedir = os.path.join(os.path.dirname(__file__), 'i18n/')
+    langs = headers.parseAcceptLanguage(header)
+    ## XXX the 'Accept-Language' header is potentially identifying
+    logging.debug("Client Accept-Language (top 5): %s" % langs[:4])
+
+    try:
+        language = gettext.translation("bridgedb", localedir=localedir,
+                                       languages=langs, fallback=True)
+        for lang in langs:
+            language.add_fallback(gettext.translation("bridgedb",
+                                                      localedir=localedir,
+                                                      languages=langs,
+                                                      fallback=True))
+    except IOError as error:
+        logging.error(error.message)
+
+    language.install(unicode=True)
     return langs
diff --git a/lib/bridgedb/parse/headers.py b/lib/bridgedb/parse/headers.py
new file mode 100644
index 0000000..d9f1c4b
--- /dev/null
+++ b/lib/bridgedb/parse/headers.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of BridgeDB, a Tor bridge distribution system.
+#
+# :authors: Isis Lovecruft 0xA3ADB67A2CDB8B35 <isis at torproject.org>
+#           please also see AUTHORS file
+# :copyright: (c) 2013 Isis Lovecruft
+#             (c) 2007-2013, The Tor Project, Inc.
+#             (c) 2007-2013, all entities within the AUTHORS file
+# :license: 3-clause BSD, see included LICENSE for information
+
+"""bridgedb.parse.headers -- Parsers for HTTP and Email headers.
+
+** Module Overview: **
+
+::
+
+ parseAcceptLanguage - Parse the contents of a client 'Accept-Language' header
+
+"""
+
+import logging
+log = logging.getLogger()
+
+import re
+import os
+
+def parseAcceptLanguage(header):
+    """Parse the contents of a client 'Accept-Language' header.
+
+    Parse the header in the following manner:
+
+      0. If ``header`` is None or an empty string, return an empty list.
+      1. Split the ``header`` string on any commas.
+      2. Chop off the RFC2616 quality/level suffix. We ignore these, and just
+         use the order of the list as the preference order, without any
+         parsing of quality/level assignments.
+      3. Add a fallback language of the same type if it is missing. For
+         example, if we only got ['es-ES', 'de-DE'], add 'es' after 'es-ES'
+         and add 'de' after 'de-DE'.
+      4. Change all hyphens to underscores.
+
+    :param string header: The contents of an 'Accept-Language' header, i.e. as
+        if taken from :func:`twisted.web.server.Request.getHeader`.
+    :rtype: list
+    :returns: A list of language codes (with and without locales), in order of
+        preference.
+    """
+    langs = []
+
+    if not header:
+        return langs
+
+    langHeader = header.split(',')
+
+    for lang in langHeader:
+        if lang.find(';') != -1:
+            # Chop off the RFC2616 Accept `q=` and `level=` fields
+            code, _ = lang.split(';')
+            langs.append(code)
+        else:
+            langs.append(lang)
+
+    # Add a fallback language of the same type if it is missing.
+    langsWithLocales = filter(lambda x: '-' in x, langs)
+    langsOnly = map(lambda x: x.split('-')[0], langsWithLocales)
+    for only in langsOnly:
+        if only not in langs:
+            # Add the fallback after the other languages like it:
+            insertAfter = filter(lambda x: x.startswith(only),
+                                 [x for x in langs])
+            if insertAfter:
+                placement = langs.index(insertAfter[0]) + 1
+                langs.insert(placement, only)
+                continue
+            # Otherwise just put it at the end
+            langs.append(only)
+
+    # Gettext wants underscores, not hyphens, because that is how it creates
+    # the directories under i18n/:
+    langs = map(lambda x: x.replace('-', '_'), [x for x in langs])
+    return langs





More information about the tor-commits mailing list