commit dfe81deffb272e4585af141ad58ece7d68e352f8 Author: Isis Lovecruft <isis@torproject.org> Date: Tue Apr 8 21:37:11 2014 +0000
Move email address parsers from bridgedb.Dist → bridgedb.parse.addr. --- lib/bridgedb/Dist.py | 117 +++------------------- lib/bridgedb/EmailServer.py | 41 ++++---- lib/bridgedb/Tests.py | 9 +- lib/bridgedb/parse/addr.py | 171 +++++++++++++++++++++++++++++++-- lib/bridgedb/test/test_EmailServer.py | 2 +- 5 files changed, 208 insertions(+), 132 deletions(-)
diff --git a/lib/bridgedb/Dist.py b/lib/bridgedb/Dist.py index 5b5a602..4e0a1aa 100644 --- a/lib/bridgedb/Dist.py +++ b/lib/bridgedb/Dist.py @@ -28,9 +28,20 @@ from bridgedb.Filters import filterAssignBridgesToRing from bridgedb.Filters import filterBridgesByRules from bridgedb.Filters import filterBridgesByIP4 from bridgedb.Filters import filterBridgesByIP6 +from bridgedb.parse import addr +from bridgedb.parse.addr import UnsupportedDomain from bridgedb.safelog import logSafely
+MAX_EMAIL_RATE = 3*3600 + +class IgnoreEmail(addr.BadEmail): + """Raised when we get requests from this address after rate warning.""" + +class TooSoonEmail(addr.BadEmail): + """Raised when we got a request from this address too recently.""" + + def uniformMap(ip): """Map an IP to an arbitrary 'area' string, such that any two /24 addresses get the same string. @@ -322,103 +333,6 @@ class IPBasedDistributor(Distributor): def dumpAssignments(self, f, description=""): self.splitter.dumpAssignments(f, description)
- -# These characters are the ones that RFC2822 allows. -#ASPECIAL = '!#$%&*+-/=?^_`{|}~' -#ASPECIAL += "\'" -# These are the ones we're pretty sure we can handle right. -ASPECIAL = '-_+/=_~' - -ACHAR = r'[\w%s]' % "".join("\%s"%c for c in ASPECIAL) -DOTATOM = r'%s+(?:.%s+)*' % (ACHAR,ACHAR) -DOMAIN = r'\w+(?:.\w+)*' -ADDRSPEC = r'(%s)@(%s)' % (DOTATOM, DOMAIN) - -SPACE_PAT = re.compile(r'\s+') -ADDRSPEC_PAT = re.compile(ADDRSPEC) - -MAX_EMAIL_RATE = 3*3600 - -class BadEmail(Exception): - """Exception raised when we get a bad email address.""" - def __init__(self, msg, email): - Exception.__init__(self, msg) - self.email = email - -class UnsupportedDomain(BadEmail): - """Exception raised when we get an email address from a domain we - don't know.""" - -class TooSoonEmail(BadEmail): - """Raised when we got a request from this address too recently.""" - -class IgnoreEmail(BadEmail): - """Raised when we get requests from this address after rate warning.""" - -def extractAddrSpec(addr): - """Given an email From line, try to extract and parse the addrspec - portion. Returns localpart,domain on success; raises BadEmail - on failure. - """ - orig_addr = addr - addr = SPACE_PAT.sub(' ', addr) - addr = addr.strip() - # Only works on usual-form addresses; raises BadEmail on weird - # address form. That's okay, since we'll only get those when - # people are trying to fool us. - if '<' in addr: - # Take the _last_ index of <, so that we don't need to bother - # with quoting tricks. - idx = addr.rindex('<') - addr = addr[idx:] - m = re.search(r'<([^>]*)>', addr) - if m is None: - raise BadEmail("Couldn't extract address spec", orig_addr) - addr = m.group(1) - - # At this point, addr holds a putative addr-spec. 
We only allow the - # following form: - # addr-spec = local-part "@" domain - # local-part = dot-atom - # domain = dot-atom - # - # In particular, we are disallowing: obs-local-part, obs-domain, - # comment, obs-FWS, - # - # Other forms exist, but none of the incoming services we recognize - # support them. - addr = addr.replace(" ", "") - m = ADDRSPEC_PAT.match(addr) - if not m: - raise BadEmail("Bad address spec format", orig_addr) - localpart, domain = m.groups() - return localpart, domain - -def normalizeEmail(addr, domainmap, domainrules): - """Given the contents of a from line, and a map of supported email - domains (in lowercase), raise BadEmail or return a normalized - email address. - """ - addr = addr.lower() - localpart, domain = extractAddrSpec(addr) - if domainmap is not None: - domain = domainmap.get(domain, None) - if domain is None: - raise UnsupportedDomain("Domain not supported", addr) - - #XXXX Do these rules also hold for Yahoo? - - # addr+foo@ is an alias for addr@ - idx = localpart.find('+') - if idx >= 0: - localpart = localpart[:idx] - rules = domainrules.get(domain, []) - if 'ignore_dots' in rules: - # j.doe@ is the same as jdoe@. - localpart = localpart.replace(".", "") - - return "%s@%s"%(localpart, domain) - class EmailBasedDistributor(Distributor): """Object that hands out bridges based on the email address of an incoming request and the current time period. @@ -475,12 +389,13 @@ class EmailBasedDistributor(Distributor): bridgeFilterRules=[] now = time.time() try: - emailaddress = normalizeEmail(emailaddress, self.domainmap, - self.domainrules) - except BadEmail as err: + emailaddress = addr.normalizeEmail(emailaddress, self.domainmap, + self.domainrules) + except addr.BadEmail as err: logging.warn(err) return [] - if emailaddress is None: + + if not emailaddress: return [] #XXXX raise an exception.
with bridgedb.Storage.getDB() as db: diff --git a/lib/bridgedb/EmailServer.py b/lib/bridgedb/EmailServer.py index f9f43bb..e1aa57e 100644 --- a/lib/bridgedb/EmailServer.py +++ b/lib/bridgedb/EmailServer.py @@ -26,7 +26,6 @@ from twisted.mail import smtp
from zope.interface import implements
-from bridgedb.Dist import BadEmail, TooSoonEmail, IgnoreEmail from bridgedb import Dist from bridgedb import I18n from bridgedb import safelog @@ -34,6 +33,10 @@ from bridgedb.Filters import filterBridgesByIP6 from bridgedb.Filters import filterBridgesByIP4 from bridgedb.Filters import filterBridgesByTransport from bridgedb.Filters import filterBridgesByNotBlockedIn +from bridgedb.parse import addr +from bridgedb.parse.addr import BadEmail +from bridgedb.parse.addr import UnsupportedDomain +from bridgedb.parse.addr import canonicalizeEmailDomain
class MailFile: @@ -97,23 +100,24 @@ def getMailResponse(lines, ctx): lang = getLocaleFromPlusAddr(clientToaddr) t = I18n.getLang(lang)
+ canon = ctx.cfg.EMAIL_DOMAIN_MAP + for domain, rule in ctx.cfg.EMAIL_DOMAIN_RULES.items(): + if domain not in canon.keys(): + canon[domain] = domain + for domain in ctx.cfg.EMAIL_DOMAINS: + canon[domain] = domain + try: - _, addrdomain = Dist.extractAddrSpec(clientAddr.lower()) - except BadEmail: - logging.info("Ignoring bad address on incoming email.") + _, clientDomain = addr.extractEmailAddress(clientAddr.lower()) + canonical = canonicalizeEmailDomain(clientDomain, canon) + except UnsupportedDomain as error: + logging.warn(error) return None, None - - if not addrdomain: - logging.info("Couldn't parse domain from %r" % clientAddr) - - if addrdomain and ctx.cfg.EMAIL_DOMAIN_MAP: - addrdomain = ctx.cfg.EMAIL_DOMAIN_MAP.get(addrdomain, addrdomain) - - if addrdomain not in ctx.cfg.EMAIL_DOMAINS: - logging.warn("Unrecognized email domain %r", addrdomain) + except BadEmail as error: + logging.warn(error) return None, None
- rules = ctx.cfg.EMAIL_DOMAIN_RULES.get(addrdomain, []) + rules = ctx.cfg.EMAIL_DOMAIN_RULES.get(canonical, [])
if 'dkim' in rules: # getheader() returns the last of a given kind of header; we want @@ -123,8 +127,8 @@ def getMailResponse(lines, ctx): if dkimHeaders: dkimHeader = dkimHeaders[0] if not dkimHeader.startswith("pass"): - logging.info("Got a bad dkim header (%r) on an incoming mail; " - "rejecting it.", dkimHeader) + logging.info("Rejecting bad DKIM header on incoming email: %r " + % dkimHeader) return None, None
# Was the magic string included @@ -186,17 +190,16 @@ def getMailResponse(lines, ctx): bridgeFilterRules=bridgeFilterRules)
# Handle rate limited email - except TooSoonEmail as err: + except Dist.TooSoonEmail as err: logging.info("Got a mail too frequently; warning '%s': %s." % (clientAddr, err))
- # Compose a warning email # MAX_EMAIL_RATE is in seconds, convert to hours body = buildSpamWarningTemplate(t) % (Dist.MAX_EMAIL_RATE / 3600) return composeEmail(ctx.fromAddr, clientAddr, subject, body, msgID, gpgContext=ctx.gpgContext)
- except IgnoreEmail as err: + except Dist.IgnoreEmail as err: logging.info("Got a mail too frequently; ignoring '%s': %s." % (clientAddr, err)) return None, None diff --git a/lib/bridgedb/Tests.py b/lib/bridgedb/Tests.py index 72dfe5e..4147549 100644 --- a/lib/bridgedb/Tests.py +++ b/lib/bridgedb/Tests.py @@ -232,10 +232,11 @@ class EmailBridgeDistTests(unittest.TestCase):
def testUnsupportedDomain(self): db = self.db - self.assertRaises(bridgedb.Dist.UnsupportedDomain, - bridgedb.Dist.normalizeEmail, 'bad@email.com', - {'example.com':'example.com'}, - {'example.com':[]}) + self.assertRaises(bridgedb.parse.addr.UnsupportedDomain, + bridgedb.parse.addr.normalizeEmail, + 'bad@email.com', + {'example.com':'example.com'}, + {'example.com':[]})
class IPBridgeDistTests(unittest.TestCase): def dumbAreaMapper(self, ip): diff --git a/lib/bridgedb/parse/addr.py b/lib/bridgedb/parse/addr.py index 455b953..f34f416 100644 --- a/lib/bridgedb/parse/addr.py +++ b/lib/bridgedb/parse/addr.py @@ -13,19 +13,22 @@
** Module Overview: **
-.. +:: parse ||_ parse.addr - | |_ isIPAddress - Check if an arbitrary string is an IP address. - | |_ isIPv4 - Check if an arbitrary string is an IPv4 address. - | |_ isIPv6 - Check if an arbitrary string is an IPv6 address. - | _ isValidIP - Check that an IP address is valid. + | | |_ extractEmailAddress - Validate a :rfc:2822 email address. + | | |_ isIPAddress - Check if an arbitrary string is an IP address. + | | |_ isIPv4 - Check if an arbitrary string is an IPv4 address. + | | |_ isIPv6 - Check if an arbitrary string is an IPv6 address. + | | _ isValidIP - Check that an IP address is valid. + | | + | |_ :class:`PortList` - A container class for validated port ranges. | - |__ :mod:`bridgedbparse.headers` + |__ :mod:`bridgedb.parse.headers` |__ :mod:`bridgedb.parse.options` __ :mod:`bridgedb.parse.versions`
-.. +::
Private IP Address Ranges: '''''''''''''''''''''''''' @@ -147,12 +150,119 @@ from __future__ import print_function from __future__ import unicode_literals
import logging +import re + import ipaddr
+#: These are the special characters which RFC2822 allows within email addresses: +#ASPECIAL = '!#$%&*+-/=?^_`{|}~' + "\'" +#: These are the ones we're pretty sure we can handle right: +ASPECIAL = '-_+/=_~' +ACHAR = r'[\w%s]' % "".join("\%s" % c for c in ASPECIAL) +DOTATOM = r'%s+(?:.%s+)*' % (ACHAR, ACHAR) +DOMAIN = r'\w+(?:.\w+)*' +ADDRSPEC = r'(%s)@(%s)' % (DOTATOM, DOMAIN) +SPACE_PAT = re.compile(r'\s+') +#: A compiled regex with matches RFC2822 email address strings: +ADDRSPEC_PAT = re.compile(ADDRSPEC) + + +class BadEmail(Exception): + """Exception raised when we get a bad email address.""" + def __init__(self, msg, email): + Exception.__init__(self, msg) + self.email = email + class InvalidPort(ValueError): """Raised when a given port number is invalid."""
+class UnsupportedDomain(ValueError): + """Raised when we get an email address from an unsupported domain.""" + + +def canonicalizeEmailDomain(domain, domainmap): + """Decide if an email was sent from a permitted domain. + + :param str domain: The domain portion of an email address to validate. It + will be checked that it is one of the domains allowed to email + requests for bridges to the + :class:`~bridgedb.Dist.EmailBasedDistributor`. + :param dict domainmap: A map of permitted alternate domains (in lowercase) + to their canonical domain names (in lowercase). This can be configured + with the ``EMAIL_DOMAIN_MAP`` option in ``bridgedb.conf``, for + example:: + EMAIL_DOMAIN_MAP = {'mail.google.com': 'gmail.com', + 'googlemail.com': 'gmail.com'} + :raises UnsupportedDomain: if the domain portion of the email address is + not within the map of alternate to canonical allowed domain names. + :rtype: str + :returns: The canonical domain name for the email address. + """ + permitted = None + + try: + permitted = domainmap.get(domain) + except AttributeError: + logging.debug("Got non-dict for 'domainmap' parameter: %r" % domainmap) + + if not permitted: + raise UnsupportedDomain("Domain not permitted: %s" % domain) + + return permitted + +def extractEmailAddress(emailaddr): + """Given an email address, obtained, for example, via a ``From:`` or + ``Sender:`` email header, try to extract and parse (according to + :rfc:2822) the username and domain portions. Returns ``(username, + domain)`` on success; raises BadEmail on failure. + + We only allow the following form:: + ADDRSPEC := LOCAL_PART "@" DOMAIN + LOCAL_PART := DOTATOM + DOMAIN := DOTATOM + + In particular, we are disallowing: obs-local-part, obs-domain, comment, + and obs-FWS. Other forms exist, but none of the incoming services we + recognize support them. + + :param emailaddr: An email address to validate. + :raises BadEmail: if the **emailaddr** couldn't be validated or parsed. 
+ :rtype: tuple + :returns: A tuple of the validated email address, containing the mail + username and the domain:: + (LOCALPART, DOMAIN) + """ + orig = emailaddr + + try: + addr = SPACE_PAT.sub(' ', emailaddr).strip() + except TypeError as error: + logging.debug(error) + raise BadEmail("Can't extract address from object type %r!" + % type(orig), orig) + + # Only works on usual-form addresses; raises BadEmail on weird + # address form. That's okay, since we'll only get those when + # people are trying to fool us. + if '<' in addr: + # Take the _last_ index of <, so that we don't need to bother + # with quoting tricks. + idx = addr.rindex('<') + addr = addr[idx:] + m = re.search(r'<([^>]*)>', addr) + if m is None: + raise BadEmail("Couldn't extract address spec", orig) + addr = m.group(1) + + # At this point, addr holds a putative addr-spec. + addr = addr.replace(" ", "") + m = ADDRSPEC_PAT.match(addr) + if not m: + raise BadEmail("Bad address spec format", orig) + + localpart, domain = m.groups() + return localpart, domain
def isIPAddress(ip, compressed=True): """Check if an arbitrary string is an IP address, and that it's valid. @@ -275,6 +385,53 @@ def isValidIP(ip): return False return True
+def normalizeEmail(emailaddr, domainmap, domainrules, ignorePlus=True): + """Normalise an email address according to the processing rules for its + canonical originating domain. + + The email address, **emailaddr**, will be parsed and validated, and then + checked that it originated from one of the domains allowed to email + requests for bridges to the :class:`~bridgedb.Dist.EmailBasedDistributor` + via the :func:`canonicaliseEmailDomain` function. + + :param str emailaddr: An email address to normalise. + :param dict domainmap: A map of permitted alternate domains (in lowercase) + to their canonical domain names (in lowercase). This can be configured + with the ``EMAIL_DOMAIN_MAP`` option in ``bridgedb.conf``, for + example:: + EMAIL_DOMAIN_MAP = {'mail.google.com': 'gmail.com', + 'googlemail.com': 'gmail.com'} + :param dict domainrules: A mapping of canonical permitted domain names to + a list of rules which should be applied to processing them, for + example:: + EMAIL_DOMAIN_RULES = {'gmail.com': ["ignore_dots", "dkim"] + Currently, ``"ignore_dots"`` means that all ``"."`` characters will be + removed from the local part of the validated email address. + :param bool ignorePlus: If ``True``, assume that + ``blackhole+kerr@torproject.org`` is an alias for + ``blackhole@torproject.org``, and remove everything after the first + ``'+'`` character. + :raises BadEmail: if the email address could not be parsed or validated. + :rtype: str + :returns: The validated, normalised email address, if it was from a + permitted domain. Otherwise, returns an empty string. 
+ """ + emailaddr = emailaddr.lower() + localpart, domain = extractEmailAddress(emailaddr) + canonical = canonicalizeEmailDomain(domain, domainmap) + + if ignorePlus: + idx = localpart.find('+') + if idx >= 0: + localpart = localpart[:idx] + + rules = domainrules.get(canonical, []) + if 'ignore_dots' in rules: + localpart = localpart.replace(".", "") + + normalized = "%s@%s" % (localpart, domain) + return normalized +
class PortList(object): """A container class for validated port ranges. diff --git a/lib/bridgedb/test/test_EmailServer.py b/lib/bridgedb/test/test_EmailServer.py index f828be7..8521762 100644 --- a/lib/bridgedb/test/test_EmailServer.py +++ b/lib/bridgedb/test/test_EmailServer.py @@ -20,10 +20,10 @@ from io import StringIO import copy
from bridgedb import EmailServer -from bridgedb.Dist import BadEmail from bridgedb.Dist import EmailBasedDistributor from bridgedb.EmailServer import MailContext from bridgedb.Time import NoSchedule +from bridgedb.parse.addr import BadEmail from bridgedb.persistent import Conf from bridgedb.test.util import fileCheckDecorator