[bridgedb/develop] Move email address parsers from bridgedb.Dist → bridgedb.parse.addr.

isis at torproject.org isis at torproject.org
Thu Apr 17 05:10:03 UTC 2014


commit dfe81deffb272e4585af141ad58ece7d68e352f8
Author: Isis Lovecruft <isis at torproject.org>
Date:   Tue Apr 8 21:37:11 2014 +0000

    Move email address parsers from bridgedb.Dist → bridgedb.parse.addr.
---
 lib/bridgedb/Dist.py                  |  117 +++-------------------
 lib/bridgedb/EmailServer.py           |   41 ++++----
 lib/bridgedb/Tests.py                 |    9 +-
 lib/bridgedb/parse/addr.py            |  171 +++++++++++++++++++++++++++++++--
 lib/bridgedb/test/test_EmailServer.py |    2 +-
 5 files changed, 208 insertions(+), 132 deletions(-)

diff --git a/lib/bridgedb/Dist.py b/lib/bridgedb/Dist.py
index 5b5a602..4e0a1aa 100644
--- a/lib/bridgedb/Dist.py
+++ b/lib/bridgedb/Dist.py
@@ -28,9 +28,20 @@ from bridgedb.Filters import filterAssignBridgesToRing
 from bridgedb.Filters import filterBridgesByRules
 from bridgedb.Filters import filterBridgesByIP4
 from bridgedb.Filters import filterBridgesByIP6
+from bridgedb.parse import addr
+from bridgedb.parse.addr import UnsupportedDomain
 from bridgedb.safelog import logSafely
 
 
+MAX_EMAIL_RATE = 3*3600
+
+class IgnoreEmail(addr.BadEmail):
+    """Raised when we get requests from this address after rate warning."""
+
+class TooSoonEmail(addr.BadEmail):
+    """Raised when we got a request from this address too recently."""
+
+
 def uniformMap(ip):
     """Map an IP to an arbitrary 'area' string, such that any two /24 addresses
     get the same string.
@@ -322,103 +333,6 @@ class IPBasedDistributor(Distributor):
     def dumpAssignments(self, f, description=""):
         self.splitter.dumpAssignments(f, description)
 
-
-# These characters are the ones that RFC2822 allows.
-#ASPECIAL = '!#$%&*+-/=?^_`{|}~'
-#ASPECIAL += "\\\'"
-# These are the ones we're pretty sure we can handle right.
-ASPECIAL = '-_+/=_~'
-
-ACHAR = r'[\w%s]' % "".join("\\%s"%c for c in ASPECIAL)
-DOTATOM = r'%s+(?:\.%s+)*' % (ACHAR,ACHAR)
-DOMAIN = r'\w+(?:\.\w+)*'
-ADDRSPEC = r'(%s)\@(%s)' % (DOTATOM, DOMAIN)
-
-SPACE_PAT = re.compile(r'\s+')
-ADDRSPEC_PAT = re.compile(ADDRSPEC)
-
-MAX_EMAIL_RATE = 3*3600
-
-class BadEmail(Exception):
-    """Exception raised when we get a bad email address."""
-    def __init__(self, msg, email):
-        Exception.__init__(self, msg)
-        self.email = email
-
-class UnsupportedDomain(BadEmail):
-    """Exception raised when we get an email address from a domain we
-       don't know."""
-
-class TooSoonEmail(BadEmail):
-    """Raised when we got a request from this address too recently."""
-
-class IgnoreEmail(BadEmail):
-    """Raised when we get requests from this address after rate warning."""
-
-def extractAddrSpec(addr):
-    """Given an email From line, try to extract and parse the addrspec
-       portion.  Returns localpart,domain on success; raises BadEmail
-       on failure.
-    """
-    orig_addr = addr
-    addr = SPACE_PAT.sub(' ', addr)
-    addr = addr.strip()
-    # Only works on usual-form addresses; raises BadEmail on weird
-    # address form.  That's okay, since we'll only get those when
-    # people are trying to fool us.
-    if '<' in addr:
-        # Take the _last_ index of <, so that we don't need to bother
-        # with quoting tricks.
-        idx = addr.rindex('<')
-        addr = addr[idx:]
-        m = re.search(r'<([^>]*)>', addr)
-        if m is None:
-            raise BadEmail("Couldn't extract address spec", orig_addr)
-        addr = m.group(1)
-
-    # At this point, addr holds a putative addr-spec.  We only allow the
-    # following form:
-    #   addr-spec = local-part "@" domain
-    #   local-part = dot-atom
-    #   domain = dot-atom
-    #
-    # In particular, we are disallowing: obs-local-part, obs-domain,
-    # comment, obs-FWS,
-    #
-    # Other forms exist, but none of the incoming services we recognize
-    # support them.
-    addr = addr.replace(" ", "")
-    m = ADDRSPEC_PAT.match(addr)
-    if not m:
-        raise BadEmail("Bad address spec format", orig_addr)
-    localpart, domain = m.groups()
-    return localpart, domain
-
-def normalizeEmail(addr, domainmap, domainrules):
-    """Given the contents of a from line, and a map of supported email
-       domains (in lowercase), raise BadEmail or return a normalized
-       email address.
-    """
-    addr = addr.lower()
-    localpart, domain = extractAddrSpec(addr)
-    if domainmap is not None:
-        domain = domainmap.get(domain, None)
-        if domain is None:
-            raise UnsupportedDomain("Domain not supported", addr)
-
-    #XXXX Do these rules also hold for Yahoo?
-
-    # addr+foo@ is an alias for addr@
-    idx = localpart.find('+')
-    if idx >= 0:
-        localpart = localpart[:idx]
-    rules = domainrules.get(domain, [])
-    if 'ignore_dots' in rules:
-        # j.doe@ is the same as jdoe at .
-        localpart = localpart.replace(".", "")
-
-    return "%s@%s"%(localpart, domain)
-
 class EmailBasedDistributor(Distributor):
     """Object that hands out bridges based on the email address of an incoming
     request and the current time period.
@@ -475,12 +389,13 @@ class EmailBasedDistributor(Distributor):
             bridgeFilterRules=[]
         now = time.time()
         try:
-            emailaddress = normalizeEmail(emailaddress, self.domainmap,
-                                          self.domainrules)
-        except BadEmail as err:
+            emailaddress = addr.normalizeEmail(emailaddress, self.domainmap,
+                                               self.domainrules)
+        except addr.BadEmail as err:
             logging.warn(err)
             return []
-        if emailaddress is None:
+
+        if not emailaddress:
             return [] #XXXX raise an exception.
 
         with bridgedb.Storage.getDB() as db:
diff --git a/lib/bridgedb/EmailServer.py b/lib/bridgedb/EmailServer.py
index f9f43bb..e1aa57e 100644
--- a/lib/bridgedb/EmailServer.py
+++ b/lib/bridgedb/EmailServer.py
@@ -26,7 +26,6 @@ from twisted.mail import smtp
 
 from zope.interface import implements
 
-from bridgedb.Dist import BadEmail, TooSoonEmail, IgnoreEmail
 from bridgedb import Dist
 from bridgedb import I18n
 from bridgedb import safelog
@@ -34,6 +33,10 @@ from bridgedb.Filters import filterBridgesByIP6
 from bridgedb.Filters import filterBridgesByIP4
 from bridgedb.Filters import filterBridgesByTransport
 from bridgedb.Filters import filterBridgesByNotBlockedIn
+from bridgedb.parse import addr
+from bridgedb.parse.addr import BadEmail
+from bridgedb.parse.addr import UnsupportedDomain
+from bridgedb.parse.addr import canonicalizeEmailDomain
 
 
 class MailFile:
@@ -97,23 +100,24 @@ def getMailResponse(lines, ctx):
     lang = getLocaleFromPlusAddr(clientToaddr)
     t = I18n.getLang(lang)
 
+    canon = ctx.cfg.EMAIL_DOMAIN_MAP
+    for domain, rule in ctx.cfg.EMAIL_DOMAIN_RULES.items():
+        if domain not in canon.keys():
+            canon[domain] = domain
+    for domain in ctx.cfg.EMAIL_DOMAINS:
+        canon[domain] = domain
+
     try:
-        _, addrdomain = Dist.extractAddrSpec(clientAddr.lower())
-    except BadEmail:
-        logging.info("Ignoring bad address on incoming email.")
+        _, clientDomain = addr.extractEmailAddress(clientAddr.lower())
+        canonical = canonicalizeEmailDomain(clientDomain, canon)
+    except UnsupportedDomain as error:
+        logging.warn(error)
         return None, None
-
-    if not addrdomain:
-        logging.info("Couldn't parse domain from %r" % clientAddr)
-
-    if addrdomain and ctx.cfg.EMAIL_DOMAIN_MAP:
-        addrdomain = ctx.cfg.EMAIL_DOMAIN_MAP.get(addrdomain, addrdomain)
-
-    if addrdomain not in ctx.cfg.EMAIL_DOMAINS:
-        logging.warn("Unrecognized email domain %r", addrdomain)
+    except BadEmail as error:
+        logging.warn(error)
         return None, None
 
-    rules = ctx.cfg.EMAIL_DOMAIN_RULES.get(addrdomain, [])
+    rules = ctx.cfg.EMAIL_DOMAIN_RULES.get(canonical, [])
 
     if 'dkim' in rules:
         # getheader() returns the last of a given kind of header; we want
@@ -123,8 +127,8 @@ def getMailResponse(lines, ctx):
         if dkimHeaders:
             dkimHeader = dkimHeaders[0]
         if not dkimHeader.startswith("pass"):
-            logging.info("Got a bad dkim header (%r) on an incoming mail; "
-                         "rejecting it.", dkimHeader)
+            logging.info("Rejecting bad DKIM header on incoming email: %r "
+                         % dkimHeader)
             return None, None
 
     # Was the magic string included
@@ -186,17 +190,16 @@ def getMailResponse(lines, ctx):
             bridgeFilterRules=bridgeFilterRules)
 
     # Handle rate limited email
-    except TooSoonEmail as err:
+    except Dist.TooSoonEmail as err:
         logging.info("Got a mail too frequently; warning '%s': %s."
                      % (clientAddr, err))
 
-        # Compose a warning email
         # MAX_EMAIL_RATE is in seconds, convert to hours
         body = buildSpamWarningTemplate(t) % (Dist.MAX_EMAIL_RATE / 3600)
         return composeEmail(ctx.fromAddr, clientAddr, subject, body, msgID,
                 gpgContext=ctx.gpgContext)
 
-    except IgnoreEmail as err:
+    except Dist.IgnoreEmail as err:
         logging.info("Got a mail too frequently; ignoring '%s': %s."
                      % (clientAddr, err))
         return None, None
diff --git a/lib/bridgedb/Tests.py b/lib/bridgedb/Tests.py
index 72dfe5e..4147549 100644
--- a/lib/bridgedb/Tests.py
+++ b/lib/bridgedb/Tests.py
@@ -232,10 +232,11 @@ class EmailBridgeDistTests(unittest.TestCase):
 
     def testUnsupportedDomain(self):
         db = self.db
-        self.assertRaises(bridgedb.Dist.UnsupportedDomain,
-                bridgedb.Dist.normalizeEmail, 'bad@email.com',
-                {'example.com':'example.com'},
-                {'example.com':[]}) 
+        self.assertRaises(bridgedb.parse.addr.UnsupportedDomain,
+                          bridgedb.parse.addr.normalizeEmail,
+                          'bad@email.com',
+                          {'example.com':'example.com'},
+                          {'example.com':[]})
 
 class IPBridgeDistTests(unittest.TestCase):
     def dumbAreaMapper(self, ip):
diff --git a/lib/bridgedb/parse/addr.py b/lib/bridgedb/parse/addr.py
index 455b953..f34f416 100644
--- a/lib/bridgedb/parse/addr.py
+++ b/lib/bridgedb/parse/addr.py
@@ -13,19 +13,22 @@
 
 ** Module Overview: **
 
-..
+::
   parse
    ||_ parse.addr
-   |   |_ isIPAddress - Check if an arbitrary string is an IP address.
-   |   |_ isIPv4 - Check if an arbitrary string is an IPv4 address.
-   |   |_ isIPv6 - Check if an arbitrary string is an IPv6 address.
-   |   \_ isValidIP - Check that an IP address is valid.
+   |    | |_ extractEmailAddress - Validate a :rfc:2822 email address.
+   |    | |_ isIPAddress - Check if an arbitrary string is an IP address.
+   |    | |_ isIPv4 - Check if an arbitrary string is an IPv4 address.
+   |    | |_ isIPv6 - Check if an arbitrary string is an IPv6 address.
+   |    | \_ isValidIP - Check that an IP address is valid.
+   |    |
+   |    |_ :class:`PortList` - A container class for validated port ranges.
    |
-   |__ :mod:`bridgedbparse.headers`
+   |__ :mod:`bridgedb.parse.headers`
    |__ :mod:`bridgedb.parse.options`
    \__ :mod:`bridgedb.parse.versions`
 
-..
+::
 
 Private IP Address Ranges:
 ''''''''''''''''''''''''''
@@ -147,12 +150,119 @@ from __future__ import print_function
 from __future__ import unicode_literals
 
 import logging
+import re
+
 import ipaddr
 
 
+#: These are the special characters which RFC2822 allows within email addresses:
+#ASPECIAL = '!#$%&*+-/=?^_`{|}~' + "\\\'"
+#: These are the ones we're pretty sure we can handle right:
+ASPECIAL = '-_+/=_~'
+ACHAR = r'[\w%s]' % "".join("\\%s" % c for c in ASPECIAL)
+DOTATOM = r'%s+(?:\.%s+)*' % (ACHAR, ACHAR)
+DOMAIN = r'\w+(?:\.\w+)*'
+ADDRSPEC = r'(%s)\@(%s)' % (DOTATOM, DOMAIN)
+SPACE_PAT = re.compile(r'\s+')
+#: A compiled regex which matches RFC2822 email address strings:
+ADDRSPEC_PAT = re.compile(ADDRSPEC)
+
+
+class BadEmail(Exception):
+    """Exception raised when we get a bad email address."""
+    def __init__(self, msg, email):
+        Exception.__init__(self, msg)
+        self.email = email
+
 class InvalidPort(ValueError):
     """Raised when a given port number is invalid."""
 
+class UnsupportedDomain(ValueError):
+    """Raised when we get an email address from an unsupported domain."""
+
+
+def canonicalizeEmailDomain(domain, domainmap):
+    """Decide if an email was sent from a permitted domain.
+
+    :param str domain: The domain portion of an email address to validate. It
+        will be checked that it is one of the domains allowed to email
+        requests for bridges to the
+        :class:`~bridgedb.Dist.EmailBasedDistributor`.
+    :param dict domainmap: A map of permitted alternate domains (in lowercase)
+        to their canonical domain names (in lowercase). This can be configured
+        with the ``EMAIL_DOMAIN_MAP`` option in ``bridgedb.conf``, for
+        example::
+            EMAIL_DOMAIN_MAP = {'mail.google.com': 'gmail.com',
+                                'googlemail.com': 'gmail.com'}
+    :raises UnsupportedDomain: if the domain portion of the email address is
+        not within the map of alternate to canonical allowed domain names.
+    :rtype: str
+    :returns: The canonical domain name for the email address.
+    """
+    permitted = None
+
+    try:
+        permitted = domainmap.get(domain)
+    except AttributeError:
+        logging.debug("Got non-dict for 'domainmap' parameter: %r" % domainmap)
+
+    if not permitted:
+        raise UnsupportedDomain("Domain not permitted: %s" % domain)
+
+    return permitted
+
+def extractEmailAddress(emailaddr):
+    """Given an email address, obtained, for example, via a ``From:`` or
+    ``Sender:`` email header, try to extract and parse (according to
+    :rfc:2822) the username and domain portions. Returns ``(username,
+    domain)`` on success; raises BadEmail on failure.
+
+    We only allow the following form::
+        ADDRSPEC := LOCAL_PART "@" DOMAIN
+        LOCAL_PART := DOTATOM
+        DOMAIN := DOTATOM
+
+    In particular, we are disallowing: obs-local-part, obs-domain, comment,
+    and obs-FWS. Other forms exist, but none of the incoming services we
+    recognize support them.
+
+    :param emailaddr: An email address to validate.
+    :raises BadEmail: if the **emailaddr** couldn't be validated or parsed.
+    :rtype: tuple
+    :returns: A tuple of the validated email address, containing the mail
+        username and the domain::
+            (LOCALPART, DOMAIN)
+    """
+    orig = emailaddr
+
+    try:
+        addr = SPACE_PAT.sub(' ', emailaddr).strip()
+    except TypeError as error:
+        logging.debug(error)
+        raise BadEmail("Can't extract address from object type %r!"
+                       % type(orig), orig)
+
+    # Only works on usual-form addresses; raises BadEmail on weird
+    # address form.  That's okay, since we'll only get those when
+    # people are trying to fool us.
+    if '<' in addr:
+        # Take the _last_ index of <, so that we don't need to bother
+        # with quoting tricks.
+        idx = addr.rindex('<')
+        addr = addr[idx:]
+        m = re.search(r'<([^>]*)>', addr)
+        if m is None:
+            raise BadEmail("Couldn't extract address spec", orig)
+        addr = m.group(1)
+
+    # At this point, addr holds a putative addr-spec.
+    addr = addr.replace(" ", "")
+    m = ADDRSPEC_PAT.match(addr)
+    if not m:
+        raise BadEmail("Bad address spec format", orig)
+
+    localpart, domain = m.groups()
+    return localpart, domain
 
 def isIPAddress(ip, compressed=True):
     """Check if an arbitrary string is an IP address, and that it's valid.
@@ -275,6 +385,53 @@ def isValidIP(ip):
         return False
     return True
 
+def normalizeEmail(emailaddr, domainmap, domainrules, ignorePlus=True):
+    """Normalise an email address according to the processing rules for its
+    canonical originating domain.
+
+    The email address, **emailaddr**, will be parsed and validated, and then
+    checked that it originated from one of the domains allowed to email
+    requests for bridges to the :class:`~bridgedb.Dist.EmailBasedDistributor`
+    via the :func:`canonicalizeEmailDomain` function.
+
+    :param str emailaddr: An email address to normalise.
+    :param dict domainmap: A map of permitted alternate domains (in lowercase)
+        to their canonical domain names (in lowercase). This can be configured
+        with the ``EMAIL_DOMAIN_MAP`` option in ``bridgedb.conf``, for
+        example::
+            EMAIL_DOMAIN_MAP = {'mail.google.com': 'gmail.com',
+                                'googlemail.com': 'gmail.com'}
+    :param dict domainrules: A mapping of canonical permitted domain names to
+        a list of rules which should be applied to processing them, for
+        example::
+            EMAIL_DOMAIN_RULES = {'gmail.com': ["ignore_dots", "dkim"]}
+        Currently, ``"ignore_dots"`` means that all ``"."`` characters will be
+        removed from the local part of the validated email address.
+    :param bool ignorePlus: If ``True``, assume that
+        ``blackhole+kerr@torproject.org`` is an alias for
+        ``blackhole@torproject.org``, and remove everything after the first
+        ``'+'`` character.
+    :raises BadEmail: if the email address could not be parsed or validated.
+    :rtype: str
+    :returns: The validated, normalised email address, if it was from a
+        permitted domain. Otherwise, returns an empty string.
+    """
+    emailaddr = emailaddr.lower()
+    localpart, domain = extractEmailAddress(emailaddr)
+    canonical = canonicalizeEmailDomain(domain, domainmap)
+
+    if ignorePlus:
+        idx = localpart.find('+')
+        if idx >= 0:
+            localpart = localpart[:idx]
+
+    rules = domainrules.get(canonical, [])
+    if 'ignore_dots' in rules:
+        localpart = localpart.replace(".", "")
+
+    normalized = "%s@%s" % (localpart, domain)
+    return normalized
+
 
 class PortList(object):
     """A container class for validated port ranges.
diff --git a/lib/bridgedb/test/test_EmailServer.py b/lib/bridgedb/test/test_EmailServer.py
index f828be7..8521762 100644
--- a/lib/bridgedb/test/test_EmailServer.py
+++ b/lib/bridgedb/test/test_EmailServer.py
@@ -20,10 +20,10 @@ from io import StringIO
 import copy
 
 from bridgedb import EmailServer
-from bridgedb.Dist import BadEmail
 from bridgedb.Dist import EmailBasedDistributor
 from bridgedb.EmailServer import MailContext
 from bridgedb.Time import NoSchedule
+from bridgedb.parse.addr import BadEmail
 from bridgedb.persistent import Conf
 from bridgedb.test.util import fileCheckDecorator
 





More information about the tor-commits mailing list