commit e1ae7a0ac0b5a8363fce07732d1b0e685547e582 Author: Isis Lovecruft isis@torproject.org Date: Tue Oct 29 05:45:28 2013 +0000
Fix the parsing of Accept-Language to actually support fallbacks.
All right. Going through the most significant bugs in this function:
* The line from the original function: ``langs = request.getHeader('accept-language').split(',')`` getHeader() returns ``None`` if the header isn't present, so this results in a TypeError on the split().
* The line from the original function: ``langs = filter(lambda x: re.match('^[a-z-]{1,5}', x), langs)`` This chucks locales with capital letters, and doesn't do much at all to ensure that we're actually getting a well-formed header, all at the expense of a regex call (rather expensive; they're slow in Python).
* These lines from the original: # add fallback languages langs_only = filter(lambda x: '-' in x, langs) langs.extend(map(lambda x: x.split('-')[0], langs_only)) If my 'Accept-Language' header starts with 'en-GB,en-US;q=0.92[…]', then this would add ['en','en'] to the end of my header, without even checking if I already have 'en'. Instead, we should check if 'en' is already there, and iff not, then add it *after the other English headers*. Not after Mandarin, Japanese, Arabic, Russian, and the other slew of languages that I half-assedly learned at some point.
* These lines from the original: # gettext wants _, not - map(lambda x: x.replace('-', '_'), langs) Great. A pretty mapping. Good thing the returned values weren't saved as anything; otherwise they might have been useful!
* Lastly, the way that languages, once parsed, were added to gettext would raise UnhandledErrors, *and* it didn't even add the fallbacks correctly, meaning we could only get one language at a time. --- lib/bridgedb/HTTPServer.py | 62 +++++++++++++++++++------------ lib/bridgedb/parse/headers.py | 82 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 24 deletions(-)
diff --git a/lib/bridgedb/HTTPServer.py b/lib/bridgedb/HTTPServer.py index ad76568..ca850d5 100644 --- a/lib/bridgedb/HTTPServer.py +++ b/lib/bridgedb/HTTPServer.py @@ -30,6 +30,7 @@ from bridgedb.Raptcha import Raptcha from bridgedb.Filters import filterBridgesByIP6, filterBridgesByIP4 from bridgedb.Filters import filterBridgesByTransport from bridgedb.Filters import filterBridgesByNotBlockedIn +from bridgedb.parse import headers from ipaddr import IPv4Address, IPv6Address from random import randint from mako.template import Template @@ -328,29 +329,42 @@ def getAssumedChosenLang(langs): return lang
def setLocaleFromRequestHeader(request): + """Retrieve the languages from the accept-language header and install them. + + Parse the languages in the header, and attempt to install the first one in + the list. If that fails, we receive a :class:`gettext.NullTranslation` + object, if it worked then we have a :class:`gettext.GNUTranslation` + object. Whichever one we end up with, add the other get the other + languages and add them as fallbacks to the first. Lastly, install this + chain of translations. + + :type request: :class:`twisted.web.server.Request` + :param request: An incoming request from a client. + :rtype: list + :returns: All requested languages. """ - Retrieve the languages from the accept-language header and insall - - Parse the languages in the header, if any of them contain locales then - add their languages to the list, also. Then install all of them using - gettext, it will choose the best one. - - :param request twisted.web.server.Request: Incoming request - :returns list: All requested languages - """ - langs = request.getHeader('accept-language').split(',') - logging.debug("Accept-Language: %s" % langs) - localedir=os.path.join(os.path.dirname(__file__), 'i18n/') - - if langs: - langs = filter(lambda x: re.match('^[a-z-]{1,5}', x), langs) - logging.debug("Languages: %s" % langs) - # add fallback languages - langs_only = filter(lambda x: '-' in x, langs) - langs.extend(map(lambda x: x.split('-')[0], langs_only)) - # gettext wants _, not - - map(lambda x: x.replace('-', '_'), langs) - lang = gettext.translation("bridgedb", localedir=localedir, - languages=langs, fallback=True) - lang.install(True) + logging.debug("Getting client 'Accept-Language' header...") + header = request.getHeader('accept-language') + + if header is None: + logging.debug("Client sent no 'Accept-Language' header. 
Using fallback.") + header = 'en,en-US' + + localedir = os.path.join(os.path.dirname(__file__), 'i18n/') + langs = headers.parseAcceptLanguage(header) + ## XXX the 'Accept-Language' header is potentially identifying + logging.debug("Client Accept-Language (top 5): %s" % langs[:4]) + + try: + language = gettext.translation("bridgedb", localedir=localedir, + languages=langs, fallback=True) + for lang in langs: + language.add_fallback(gettext.translation("bridgedb", + localedir=localedir, + languages=langs, + fallback=True)) + except IOError as error: + logging.error(error.message) + + language.install(unicode=True) return langs diff --git a/lib/bridgedb/parse/headers.py b/lib/bridgedb/parse/headers.py new file mode 100644 index 0000000..d9f1c4b --- /dev/null +++ b/lib/bridgedb/parse/headers.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- +# +# This file is part of BridgeDB, a Tor bridge distribution system. +# +# :authors: Isis Lovecruft 0xA3ADB67A2CDB8B35 isis@torproject.org +# please also see AUTHORS file +# :copyright: (c) 2013 Isis Lovecruft +# (c) 2007-2013, The Tor Project, Inc. +# (c) 2007-2013, all entities within the AUTHORS file +# :license: 3-clause BSD, see included LICENSE for information + +"""bridgedb.parse.headers -- Parsers for HTTP and Email headers. + +** Module Overview: ** + +:: + + parseAcceptLanguage - Parse the contents of a client 'Accept-Language' header + +""" + +import logging +log = logging.getLogger() + +import re +import os + +def parseAcceptLanguage(header): + """Parse the contents of a client 'Accept-Language' header. + + Parse the header in the following manner: + + 0. If ``header`` is None or an empty string, return an empty list. + 1. Split the ``header`` string on any commas. + 2. Chop of the RFC2616 quality/level suffix. We ignore these, and just + use the order of the list as the preference order, without any + parsing of quality/level assignments. + 3. Add a fallback language of the same type if it is missing. 
For + example, if we only got ['es-ES', 'de-DE'], add 'es' after 'es-ES' + and add 'de' after 'de-DE'. + 4. Change all hyphens to underscores. + + :param string header: The contents of an 'Accept-Language' header, i.e. as + if taken from :func:`twisted.web.server.Request.getHeader`. + :rtype: list + :returns: A list of language codes (with and without locales), in order of + preference. + """ + langs = [] + + if not header: + return langs + + langHeader = header.split(',') + + for lang in langHeader: + if lang.find(';') != -1: + # Chop off the RFC2616 Accept `q=` and `level=` feilds + code, _ = lang.split(';') + langs.append(code) + else: + langs.append(lang) + + # Add a fallback language of the same type if it is missing. + langsWithLocales = filter(lambda x: '-' in x, langs) + langsOnly = map(lambda x: x.split('-')[0], langsWithLocales) + for only in langsOnly: + if only not in langs: + # Add the fallback after the other languages like it: + insertAfter = filter(lambda x: x.startswith(only), + [x for x in langs]) + if insertAfter: + placement = langs.index(insertAfter[0]) + 1 + langs.insert(placement, only) + continue + # Otherwise just put it at the end + langs.append(only) + + # Gettext wants underderscores, because that is how it creates the + # directories under i18n/, not hyphens: + langs = map(lambda x: x.replace('-', '_'), [x for x in langs]) + return langs