commit 8cdbbebdc458ff162e936ad5b979a38f433a01b5
Author: Philipp Winter <phw@nymity.ch>
Date:   Tue Apr 21 11:10:27 2020 -0700

Don't assume a line can be decoded as UTF-8.
For example, we may be dealing with text that's cp1252-encoded. BridgeDB gets a lot of spam like that:
b"Subject: Ich m\xf6chte Sie treffen" b'Subject: Wei\xdft du, wie man ein M\xe4dchen anmacht?'
Instead of trying really hard to figure out what encoding we're dealing with, we simply ignore characters we cannot decode.
---
 bridgedb/distributors/email/server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bridgedb/distributors/email/server.py b/bridgedb/distributors/email/server.py
index f18a5ad..54a8231 100644
--- a/bridgedb/distributors/email/server.py
+++ b/bridgedb/distributors/email/server.py
@@ -212,7 +212,7 @@ class SMTPMessage(object):
         if self.nBytes > self.context.maximumSize:
             self.ignoring = True
         else:
-            self.lines.append(line.decode('utf-8') if isinstance(line, bytes) else line)
+            self.lines.append(line.decode('utf-8', 'ignore') if isinstance(line, bytes) else line)
         if not safelog.safe_logging:
             try:
                 ln = line.rstrip("\r\n").encode('utf-8', 'replace')
tor-commits@lists.torproject.org