commit 7ceb25e306a5af456c4a4ba1f1f5b2a72d6eb77c Author: Philipp Winter <phw@nymity.ch> Date: Wed Aug 14 15:00:59 2019 -0700
Support handing out decoy bridges to bots.
This patch makes it possible to identify bots by inspecting HTTP request headers. A CSV file, specified by BLACKLISTED_REQUEST_HEADERS_FILE, maps each request header name to a regular expression that is matched against that header's value, e.g.:
    Accept-Language,[Kk]lingon
    User-Agent,Spa+ce
    ...
If one of these regular expressions matches the corresponding header in a client's request, the request probably came from a bot. This patch also makes it possible to respond to bot requests with a decoy bridge, e.g., to study what the owners of the bot intend to do with the bridge. Decoy bridges are configured in the CSV file specified by DECOY_BRIDGES_FILE. The file maps a transport type combined with an IP version (e.g., "obfs4v4") to a bridge line, e.g.:
    vanillav4,1.2.3.4:1234 FINGERPRINT
    obfs4v4,obfs4 1.2.3.4:1234 FINGERPRINT ARGS
    ...
This fixes https://bugs.torproject.org/31252 --- CHANGELOG | 9 +++ bridgedb.conf | 19 ++++++ bridgedb/antibot.py | 123 ++++++++++++++++++++++++++++++++++ bridgedb/distributors/https/server.py | 10 +++ bridgedb/distributors/moat/server.py | 6 ++ bridgedb/main.py | 6 ++ bridgedb/test/test_antibot.py | 108 +++++++++++++++++++++++++++++ 7 files changed, 281 insertions(+)
diff --git a/CHANGELOG b/CHANGELOG index 32e6fe5..03390d6 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -16,6 +16,15 @@ Changes in version 0.8.0 - YYYY-MM-DD Use stem instead of leekspin to create test descriptors. We now don't need to depend on leekspin anymore.
+ * FIXES #31252 https://bugs.torproject.org/31252 + Add an anti-bot mechanism that allows us to detect bots by matching + HTTP request headers against blacklisted patterns. For example, bots may + have their Accept-Language set to "Klingon". Blacklisted patterns are + configured in BLACKLISTED_REQUEST_HEADERS_FILE. When BridgeDB detects + a bot request, it can answer the request with a decoy bridge that's + only handed out to bots. Decoy bridges are configured in + DECOY_BRIDGES_FILE. + Changes in version 0.7.1 - 2019-06-07
* FIXES #28496 https://bugs.torproject.org/28496 diff --git a/bridgedb.conf b/bridgedb.conf index ba43bb6..a0e00a8 100644 --- a/bridgedb.conf +++ b/bridgedb.conf @@ -301,6 +301,25 @@ PROBING_RESISTANT_TRANSPORTS = ['scramblesuit', 'obfs4'] # menu). DEFAULT_TRANSPORT = 'obfs4'
+# HTTP headers that suggest that a request was issued by a bot. The CSV +# file must have the following format: +# <HEADER>,<REGEXP> +# ... +# For example: +# Accept-Language,[Kk]lingon +BLACKLISTED_REQUEST_HEADERS_FILE="blacklisted-request-headers.csv" + +# Decoy bridges that we are handing out to bots that we detected using the +# regular expressions in BLACKLISTED_REQUEST_HEADERS_FILE. The CSV file must +# have the following format: +# <TRANSPORT>v<IP_VERSION>,<BRIDGE_LINE> +# ... +# For example: +# vanillav4,1.2.3.4:1234 0123456789ABCDEF0123456789ABCDEF01234567 +# vanillav6,[::1]:1234 0123456789ABCDEF0123456789ABCDEF01234567 +# obfs4v4,obfs4 1.2.3.4:1234 public-key=... node-id=... iat-mode=... +DECOY_BRIDGES_FILE="decoy-bridges.csv" + #------------------------------- # Moat Distribution Options \ #------------------------------------------------------------------------------ diff --git a/bridgedb/antibot.py b/bridgedb/antibot.py new file mode 100644 index 0000000..e724c68 --- /dev/null +++ b/bridgedb/antibot.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 ; test-case-name: bridgedb.test.test_antibot ; -*- +# _____________________________________________________________________________ +# +# This file is part of BridgeDB, a Tor bridge distribution system. +# +# :authors: please see included AUTHORS file +# :copyright: (c) 2019, The Tor Project, Inc. +# (c) 2019, Philipp Winter +# :license: see LICENSE for licensing information +# _____________________________________________________________________________ + +"""Functions for dealing with bot requests.""" + +import re +import logging + +# Maps transport types and IP version (e.g., "obfs4v4", "vanillav4", or +# "vanillav6") to bridge lines (e.g., "1.2.3.4:1234 ..."). +DECOY_BRIDGES = {} + +# Maps HTTP request headers (e.g., "Accept-Language") to regular expressions +# that suggest that the request was issued by a bot (e.g., "[Kk]lingon").
+BLACKLISTED_REQUEST_HEADERS = {} + + +def _loadCSV(filename): + """Load and return the content of the given CSV file. + + :param str filename: The filename to read. + :rtype: dict + :returns: A dictionary mapping keys (first column) to values (second + column). + """ + + csv = dict() + try: + with open(filename) as fh: + for line in fh.readlines(): + if line.count(",") != 1: + logging.warning("Line must have exactly one comma: %s" % + line) + continue + key, value = line.split(",") + csv[key.strip()] = value.strip() + except IOError as err: + logging.warning("I/O error while reading from file %s: %s" % + (filename, err)) + + return csv + + +def loadBlacklistedRequestHeaders(filename): + """Load and globally set a dictionary of blacklisted request headers. + + :param str filename: The filename to read. + """ + + content = _loadCSV(filename) + blacklisted = dict() + # Turn dictionary values into compiled regular expressions. + for header, regexp in content.items(): + try: + blacklisted[header] = re.compile(regexp) + except Exception as err: + logging.warning("Skipping regexp %s because we couldn't compile " + "it: %s" % (regexp, err)) + + global BLACKLISTED_REQUEST_HEADERS + BLACKLISTED_REQUEST_HEADERS = blacklisted + + +def loadDecoyBridges(filename): + """Load and globally set a dictionary of decoy bridges. + + :param str filename: The filename to read. + """ + + d = _loadCSV(filename) + # Turn our bridge lines (which are strings) into lists. + decoyBridges = {ttype: [line] for ttype, line in d.items()} + + global DECOY_BRIDGES + DECOY_BRIDGES = decoyBridges + + +def getDecoyBridge(transport, ipVersion): + """Return a decoy bridge or, if none is available, None. + + :param str transport: The desired transport, e.g., "vanilla" or "obfs4". + :param int ipVersion: The IP version, which must be either 4 or 6. + :rtype: list + :returns: Return a list of bridge lines or, if we don't have any, None. 
+ """ + + if ipVersion not in [4, 6]: + return None + + logging.info("Returning IPv%d decoy bridge for transport %s." % + (ipVersion, transport)) + return DECOY_BRIDGES.get("%sv%d" % (transport, ipVersion), None) + + +def isRequestFromBot(request): + """Determine if the given request is coming from a bot. + + :type request: :api:`twisted.web.http.Request` + :param request: A ``Request`` object whose HTTP headers are checked + against the regular expressions in BLACKLISTED_REQUEST_HEADERS. + :rtype: bool + :returns: True if the request is coming from a bot and False otherwise. + """ + + for header, badRegexp in BLACKLISTED_REQUEST_HEADERS.items(): + value = request.getHeader(header) + if value is None: + continue + + if badRegexp.search(value) is not None: + logging.info("Found bot request. Headers: %s" % + request.requestHeaders) + return True + + return False diff --git a/bridgedb/distributors/https/server.py b/bridgedb/distributors/https/server.py index 732f8bf..e5df7da 100644 --- a/bridgedb/distributors/https/server.py +++ b/bridgedb/distributors/https/server.py @@ -53,6 +53,7 @@ from bridgedb import strings from bridgedb import translations from bridgedb import txrecaptcha from bridgedb import metrics +from bridgedb import antibot from bridgedb.distributors.common.http import setFQDN from bridgedb.distributors.common.http import getFQDN from bridgedb.distributors.common.http import getClientIP @@ -916,6 +917,15 @@ class BridgesResource(CustomErrorHandlingResource, CSPResource): bridgeLines = [replaceControlChars(bridge.getBridgeLine( bridgeRequest, self.includeFingerprints)) for bridge in bridges]
+ if antibot.isRequestFromBot(request): + transports = bridgeRequest.transports + # Return either a decoy bridge or no bridge. + if len(transports) > 2: + logging.warning("More than one transport requested") + return self.renderAnswer(request) + ttype = "vanilla" if len(transports) == 0 else transports[0] + return self.renderAnswer(request, antibot.getDecoyBridge(ttype, bridgeRequest.ipVersion)) + return self.renderAnswer(request, bridgeLines)
def getResponseFormat(self, request): diff --git a/bridgedb/distributors/moat/server.py b/bridgedb/distributors/moat/server.py index 73d2423..10096e7 100644 --- a/bridgedb/distributors/moat/server.py +++ b/bridgedb/distributors/moat/server.py @@ -41,6 +41,7 @@ from twisted.web.server import Site from bridgedb import metrics from bridgedb import captcha from bridgedb import crypto +from bridgedb import antibot from bridgedb.distributors.common.http import setFQDN from bridgedb.distributors.common.http import getFQDN from bridgedb.distributors.common.http import getClientIP @@ -735,6 +736,11 @@ class CaptchaCheckResource(CaptchaResource): logging.warn(("Not enough bridges of the type specified to " "fulfill the following request: %s") % bridgeRequest)
+ if antibot.isRequestFromBot(request): + ttype = transport or "vanilla" + bridgeLines = antibot.getDecoyBridge(ttype, + bridgeRequest.ipVersion) + # If we have no bridges at all to give to the client, then # return a JSON API 404 error. if not bridgeLines: diff --git a/bridgedb/main.py b/bridgedb/main.py index 5d9b0c6..94f4921 100644 --- a/bridgedb/main.py +++ b/bridgedb/main.py @@ -26,6 +26,7 @@ from bridgedb import proxy from bridgedb import runner from bridgedb import util from bridgedb import metrics +from bridgedb import antibot from bridgedb.bridges import MalformedBridgeInfo from bridgedb.bridges import MissingServerDescriptorDigest from bridgedb.bridges import ServerDescriptorDigestMismatch @@ -417,6 +418,11 @@ def run(options, reactor=reactor): proxy.loadProxiesFromFile(proxyfile, proxies, removeStale=True) metrics.setProxies(proxies)
+ logging.info("Reloading blacklisted request headers...") + antibot.loadBlacklistedRequestHeaders(config.BLACKLISTED_REQUEST_HEADERS_FILE) + logging.info("Reloading decoy bridges...") + antibot.loadDecoyBridges(config.DECOY_BRIDGES_FILE) + logging.info("Reparsing bridge descriptors...") (hashring, emailDistributorTmp, diff --git a/bridgedb/test/test_antibot.py b/bridgedb/test/test_antibot.py new file mode 100644 index 0000000..1cda86a --- /dev/null +++ b/bridgedb/test/test_antibot.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 ; test-case-name: bridgedb.test.test_antibot ; -*- +# _____________________________________________________________________________ +# +# This file is part of BridgeDB, a Tor bridge distribution system. +# +# :authors: please see included AUTHORS file +# :copyright: (c) 2019, The Tor Project, Inc. +# (c) 2019, Philipp Winter +# :license: see LICENSE for licensing information +# _____________________________________________________________________________ + +"""Tests for :mod:`bridgedb.antibot`.""" + +import os +import tempfile + +from twisted.trial import unittest +from twisted.web.test.requesthelper import DummyRequest + +from bridgedb import antibot + + +class AntiBot(unittest.TestCase): + """Unittests for :mod:`bridgedb.antibot`.""" + + def write_file(self, content): + """ + Write the given content to a temporary file. + + We're responsible for deleting the file once we're done.
+ """ + fd, filename = tempfile.mkstemp(prefix="bridgedb") + fh = os.fdopen(fd, "w") + fh.write(content) + fh.close() + return filename + + def test_load_csv(self): + """Load a valid CSV file.""" + content = "foo,bar\nbar,foo\n" + filename = self.write_file(content) + + csv = antibot._loadCSV(filename) + self.assertEqual(csv["foo"], "bar") + self.assertEqual(csv["bar"], "foo") + + os.unlink(filename) + + def test_load_invalid_csv(self): + """Load an invalid CSV file that has two commas in one line.""" + content = "foo,bar,bad\nbar,foo\n" + filename = self.write_file(content) + + csv = antibot._loadCSV(filename) + self.assertEqual(len(csv), 1) + + os.unlink(filename) + + def test_load_blacklisted_headers(self): + """Load valid blacklisted request headers.""" + content = "accept-language,[Kk]lingon" + filename = self.write_file(content) + + antibot.loadBlacklistedRequestHeaders(filename) + + request = DummyRequest(['']) + verdict = antibot.isRequestFromBot(request) + self.assertFalse(verdict) + + request.requestHeaders.setRawHeaders("accept-language", + ["i speak kllingon"]) + antibot.loadBlacklistedRequestHeaders(filename) + verdict = antibot.isRequestFromBot(request) + self.assertFalse(verdict) + + request.requestHeaders.setRawHeaders("accept-language", + ["i speak klingon"]) + antibot.loadBlacklistedRequestHeaders(filename) + verdict = antibot.isRequestFromBot(request) + self.assertTrue(verdict) + + os.unlink(filename) + + def test_load_invalid_blacklisted_headers(self): + """Load invalid blacklisted request headers with a broken regexp.""" + content = "accept-language,[Klingon\nuser-agent,foo*" + filename = self.write_file(content) + + antibot.loadBlacklistedRequestHeaders(filename) + self.assertEqual(len(antibot.BLACKLISTED_REQUEST_HEADERS), 1) + + os.unlink(filename) + + def test_load_decoy_bridges(self): + """Load decoy bridges.""" + obfs4_line = "obfs4 1.2.3.4:1234 FINGERPRINT FOO BAR" + vanilla_line = "1.2.3.4:1234 FINGERPRINT" + + content = 
"vanillav4,%s\nobfs4v4,%s" % (vanilla_line, obfs4_line) + filename = self.write_file(content) + + antibot.loadDecoyBridges(filename) + self.assertEqual(antibot.getDecoyBridge("obfs4", 4), [obfs4_line]) + self.assertEqual(antibot.getDecoyBridge("vanilla", 4), [vanilla_line]) + self.assertEqual(antibot.getDecoyBridge("vanilla", 6), None) + self.assertEqual(antibot.getDecoyBridge("vanilla", 7), None) + + os.unlink(filename)