[tor-commits] [bridgedb/master] Remove the idea of "categories". Proxies constitute another cluster.

isis at torproject.org isis at torproject.org
Sat Jul 25 19:26:21 UTC 2015


commit 9ba68b455a8adeaef34785ecccbe7ac5a5d5ac26
Author: Isis Lovecruft <isis at torproject.org>
Date:   Sat Apr 11 01:10:05 2015 +0000

    Remove the idea of "categories". Proxies constitute another cluster.
    
    This completely removes the half-baked idea of "IP categories" from
    BridgeDB's codebase.  An IPBasedDistributor now holds a bunch of proxies
    (in one single bridgedb.proxy.ProxySet).  These proxies have their own
    "cluster".  End of story.  No more messing around with both "categories"
    and "clusters", because nobody ever knew what either of those were
    anyway, much less why they were different.
    
     * CHANGE IPBasedDistributor.categories to be a single
       bridgedb.proxy.ProxySet, rather than a list of lists.  ProxySets have
       "tags" for each proxy within them, and can efficiently separate
       proxies based upon tag(s), so there is no longer any need to have a
       list of lists to keep the proxies separate.  Besides, in the end, we
       treat all clients coming from any proxy the same, regardless of
       whether the proxy is a Tor Exit relay or some other known open proxy.
    
     * RENAME IPBasedDistributor.categories → IPBasedDistributor.proxies,
       because no one ever understood what an "IP category" was.
    
     * CHANGE IPBasedDistributor.numberOfClusters to always include the
       "cluster" for clients coming from proxies, if
       IPBasedDistributor.proxies is configured.
    
     * ADD IPBasedDistributor.proxyCluster, which is an integer pointing to
       the cluster number for the proxies cluster.  If there is no cluster
       for proxies, it is set to 0.  Otherwise, it points to the last
       cluster.
    
     * CHANGE the numbering of clusters in IPBasedDistributor to start from
       1 instead of 0.  While this might make iteration over all of the
       clusters increasingly subject to fencepost errors, it still makes the
       code feel more intuitive, i.e. "The first cluster is 1 out of 5 total
       clusters, and the last cluster is 5/5" rather than "The first cluster
       is number 0 out of 5 total clusters, and the last 4/5".
    
     * CHANGE a couple test cases which expected
       IPBasedDistributor.categories to exist.
    
     * UPDATE documentation pertaining to all of the above, as well as the
       documentation on (sub)hashring structure.
---
 lib/bridgedb/Dist.py              |  172 +++++++++++++++++++------------------
 lib/bridgedb/Main.py              |   41 +++------
 lib/bridgedb/test/legacy_Tests.py |   11 ++-
 lib/bridgedb/test/test_Main.py    |    4 +-
 4 files changed, 107 insertions(+), 121 deletions(-)

diff --git a/lib/bridgedb/Dist.py b/lib/bridgedb/Dist.py
index 2f63574..ab7ae6e 100644
--- a/lib/bridgedb/Dist.py
+++ b/lib/bridgedb/Dist.py
@@ -20,6 +20,7 @@ import time
 import bridgedb.Bridges
 import bridgedb.Storage
 
+from bridgedb import proxy
 from bridgedb.crypto import getHMAC
 from bridgedb.crypto import getHMACFunc
 from bridgedb.Filters import filterAssignBridgesToRing
@@ -134,7 +135,9 @@ class IPBasedDistributor(Distributor):
         hashrings, one for each area in the ``areaMapper``. Every inserted
         bridge will go into one of these rings, and every area is associated
         with one.
-    :ivar categories: DOCDOC See :param:`proxySets`.
+    :type proxies: :class:`~bridgedb.proxies.ProxySet`
+    :ivar proxies: All known proxies, which we treat differently. See
+        :param:`proxies`.
     :type splitter: :class:`bridgedb.Bridges.FixedBridgeSplitter`
     :ivar splitter: A hashring that assigns bridges to subrings with fixed
         proportions. Used to assign bridges into the subrings of this
@@ -142,7 +145,7 @@ class IPBasedDistributor(Distributor):
     """
 
     def __init__(self, areaMapper, numberOfClusters, key,
-                 proxySets=None, answerParameters=None):
+                 proxies=None, answerParameters=None):
         """Create a Distributor that decides which bridges to distribute based
         upon the client's IP address and the current time.
 
@@ -160,27 +163,33 @@ class IPBasedDistributor(Distributor):
         :param bytes key: The master HMAC key for this distributor. All added
             bridges are HMACed with this key in order to place them into the
             hashrings.
-        :type proxySets: iterable or None
-        :param proxySets: DOCDOC
+        :type proxies: :class:`~bridgedb.proxy.ProxySet`
+        :param proxies: A :class:`bridgedb.proxy.ProxySet` containing known
+            Tor Exit relays and other known proxies.  These will constitute
+            the extra cluster, and any client requesting bridges from one of
+            these **proxies** will be distributed bridges from a separate
+            subhashring that is specific to Tor/proxy users.
         :type answerParameters: :class:`bridgedb.Bridges.BridgeRingParameters`
         :param answerParameters: A mechanism for ensuring that the set of
             bridges that this distributor answers a client with fit certain
             parameters, i.e. that an answer has "at least two obfsproxy
             bridges" or "at least one bridge on port 443", etc.
         """
+        self.rings = []
         self.areaMapper = areaMapper
-        self.numberOfClusters = numberOfClusters
         self.answerParameters = answerParameters
+        self.numberOfClusters = numberOfClusters
 
-        if not proxySets:
-            proxySets = []
-        if not answerParameters:
-            answerParameters = []
-        self.rings = []
+        if proxies:
+            logging.info("Added known proxies to HTTPS distributor...")
+            self.proxies = proxies
+            self.numberOfClusters += 1
+            self.proxyCluster = self.numberOfClusters
+        else:
+            logging.warn("No known proxies were added to HTTPS distributor!")
+            self.proxies = proxy.ProxySet()
+            self.proxyCluster = 0
 
-        self.categories = []
-        for c in proxySets:
-            self.categories.append(c)
 
         key2 = getHMAC(key, "Assign-Bridges-To-Rings")
         key3 = getHMAC(key, "Order-Areas-In-Rings")
@@ -193,7 +202,7 @@ class IPBasedDistributor(Distributor):
         #
         # XXX Why is the "extra room" hardcoded to be 5? Shouldn't it be some
         #     fraction of the number of clusters/categories? --isis
-        ring_cache_size  = self.numberOfClusters + len(proxySets) + 5
+        ring_cache_size = self.numberOfClusters + 5
         self.splitter = bridgedb.Bridges.FilteredBridgeSplitter(
             key2, max_cached_rings=ring_cache_size)
         logging.debug("Added splitter %s to IPBasedDistributor."
@@ -207,63 +216,59 @@ class IPBasedDistributor(Distributor):
 
         The hashring structure for this distributor is influenced by the
         ``N_IP_CLUSTERS`` configuration option, as well as the number of
-        ``PROXY_LIST_FILES``.  Essentially, :data:`numberOfClusters` is set to the
-        specified ``N_IP_CLUSTERS``.  The ``PROXY_LIST_FILES`` (plus the
-        :class:`bridgedb.proxy.ProxySet` for the Tor Exit list downloaded into
-        memory with :script:`get-tor-exits`) are stored in :data:`categories`.
-
-        The number of subhashrings which this :class:`Distributor` has active
-        in its hashring is then the :data:`numberOfClusters` plus the number of
-        :data:`categories`.
+        ``PROXY_LIST_FILES``.
+
+        Essentially, :data:`numberOfClusters` is set to the specified
+        ``N_IP_CLUSTERS``.  All of the ``PROXY_LIST_FILES``, plus the list of
+        Tor Exit relays (downloaded into memory with :script:`get-tor-exits`),
+        are stored in :data:`proxies`, and the latter is added as an
+        additional cluster (such that :data:`numberOfClusters` becomes
+        ``N_IP_CLUSTERS + 1``).  The number of subhashrings which this
+        :class:`Distributor` has active in its hashring is then
+        :data:`numberOfClusters`, where the last cluster is reserved for all
+        :data:`proxies`.
 
         As an example, if BridgeDB was configured with ``N_IP_CLUSTERS=4`` and
         ``PROXY_LIST_FILES=["open-socks-proxies.txt"]``, then the total number
-        of subhashrings is six — four for the "clusters", and two
-        "categories": one for everything contained within the
-        ``"open-socks-proxies.txt"`` file and the other for the downloaded
-        list of Tor Exits.  Thus, the resulting hashring-subhashring structure
+        of subhashrings is five — four for the "clusters", and one for the
+        :data:`proxies`. Thus, the resulting hashring-subhashring structure
         would look like:
 
-        +------------------+---------------------------------------------------+-------------------------+
-        |                  |               Directly connecting users           | Tor / known proxy users |
-        +------------------+------------+------------+------------+------------+------------+------------+
-        | Clusters /       | Cluster-1  | Cluster-2  | Cluster-3  | Cluster-4  | Cat-1      | Cat-2      |
-        | Categories       |            |            |            |            |            |            |
-        +==================+============+============+============+============+============+============+
-        | Subhashrings     |            |            |            |            |            |            |
-        | (total, assigned)| (6,0)      | (6,1)      | (6,2)      | (6,3)      | (6,4)      | (6,5)      |
-        +------------------+------------+------------+------------+------------+------------+------------+
-        | Filtered         | (6,0)-IPv4 | (6,1)-IPv4 | (6,2)-IPv4 | (6,3)-IPv4 | (6,4)-IPv4 | (6,5)-IPv4 |
-        | Subhashrings     |            |            |            |            |            |            |
-        | bBy requested    +------------+------------+------------+------------+------------+------------+
-        | bridge type)     | (6,0)-IPv6 | (6,1)-IPv6 | (6,2)-IPv6 | (6,3)-IPv6 | (6,4)-IPv6 | (6,5)-IPv6 |
-        |                  |            |            |            |            |            |            |
-        +------------------+------------+------------+------------+------------+------------+------------+
+        +------------------+---------------------------------------------------+--------------
+        |                  |               Directly connecting users           | Tor / known |
+        |                  |                                                   | proxy users |
+        +------------------+------------+------------+------------+------------+-------------+
+        | Clusters         | Cluster-1  | Cluster-2  | Cluster-3  | Cluster-4  | Cluster-5   |
+        +==================+============+============+============+============+=============+
+        | Subhashrings     |            |            |            |            |             |
+        | (total, assigned)| (5,1)      | (5,2)      | (5,3)      | (5,4)      | (5,5)       |
+        +------------------+------------+------------+------------+------------+-------------+
+        | Filtered         | (5,1)-IPv4 | (5,2)-IPv4 | (5,3)-IPv4 | (5,4)-IPv4 | (5,5)-IPv4  |
+        | Subhashrings     |            |            |            |            |             |
+        | bBy requested    +------------+------------+------------+------------+-------------+
+        | bridge type)     | (5,1)-IPv6 | (5,2)-IPv6 | (5,3)-IPv6 | (5,4)-IPv6 | (5,5)-IPv6  |
+        |                  |            |            |            |            |             |
+        +------------------+------------+------------+------------+------------+-------------+
 
         The "filtered subhashrings" are essentially filtered copies of their
         respective subhashring, such that they only contain bridges which
-        support IPv4 or IPv6, respectively.  (I have no idea of the relation
-        between ``(6,0)-IPv4`` and ``(6,0)-IPv6``, including whether or not
-        their contents are disjoint. I didn't design this shit, I'm just
-        redesigning it.)
+        support IPv4 or IPv6, respectively.  Additionally, the contents of
+        ``(5,1)-IPv4`` and ``(5,1)-IPv6`` sets are *not* disjoint.
 
-        Thus, in this example, we end up with **12 total subhashrings**.
+        Thus, in this example, we end up with **10 total subhashrings**.
         """
         logging.info("Prepopulating %s distributor hashrings..." % self.name)
 
         for filterFn in [filterBridgesByIP4, filterBridgesByIP6]:
-            # XXX Distributors should have a "totalClusters" property in order
-            # to avoid reusing this unclear construct all over the place.  (Or
-            # just get rid of the idea of "categories".)
-            for cluster in range(self.numberOfClusters + len(self.categories)):
+            for cluster in range(1, self.numberOfClusters):
                 filters = self._buildHashringFilters([filterFn,], cluster)
                 key1 = getHMAC(self.splitter.key, "Order-Bridges-In-Ring-%d" % cluster)
                 ring = bridgedb.Bridges.BridgeRing(key1, self.answerParameters)
                 # For consistency with previous implementation of this method,
-                # only set the "name" for "clusters" which are in this
-                # distributor's categories:
-                if cluster >= self.numberOfClusters:
-                    ring.setName('{0} Ring'.format(self.name))
+                # only set the "name" for "clusters" which are for this
+                # distributor's proxies:
+                if cluster == self.proxyCluster:
+                    ring.setName('{0} Proxy Ring'.format(self.name))
                 self.splitter.addRing(ring, filters,
                                       filterBridgesByRules(filters),
                                       populate_from=self.splitter.bridges)
@@ -273,8 +278,8 @@ class IPBasedDistributor(Distributor):
         self.splitter.insert(bridge)
 
     def _buildHashringFilters(self, previousFilters, clientCluster):
-        totalRings = self.numberOfClusters + len(self.categories)
-        g = filterAssignBridgesToRing(self.splitter.hmac, totalRings, clientCluster)
+        g = filterAssignBridgesToRing(self.splitter.hmac,
+                                      self.numberOfClusters, clientCluster)
         previousFilters.append(g)
         return frozenset(previousFilters)
 
@@ -301,41 +306,38 @@ class IPBasedDistributor(Distributor):
             logging.warn("Bailing! Splitter has zero bridges!")
             return []
 
-        # The cluster the client should draw bridges from:
-        clientCluster = self.numberOfClusters
-        # First, check if the client's IP is one of the known proxies in one
-        # of our :data:`catagories`:
-        for category in self.categories:
-            if bridgeRequest.client in category:
-                # The tag is a tag applied to a proxy IP address when it is
-                # added to the bridgedb.proxy.ProxySet. For Tor Exit relays,
-                # the default is 'exit_relay'. For other proxies loaded from
-                # the PROXY_LIST_FILES config option, the default tag is the
-                # full filename that the IP address originally came from.
-                tag = category.getTag(bridgeRequest.client)
-                logging.info("Client was from known proxy (tag: %s): %s" %
-                             (tag, bridgeRequest.client))
-                # Cluster Tor/proxy users into four groups.  This means that
-                # no matter how many different Tor Exits or proxies a client
-                # uses, the most they can ever get is four different sets of
-                # bridge lines (per period).
-                group = (int(ipaddr.IPAddress(bridgeRequest.client)) % 4) + 1
-                area = "known-proxy-group-%d" % group
-                break
-            clientCluster += 1
-        # If the client wasn't using Tor or any other known proxy, select the
-        # client's cluster number based upon the /16 of the client's IP
-        # address:
+        # First, check if the client's IP is one of the known :data:`proxies`:
+        if bridgeRequest.client in self.proxies:
+            cluster = self.proxyCluster
+            # The tag is a tag applied to a proxy IP address when it is added
+            # to the bridgedb.proxy.ProxySet. For Tor Exit relays, the default
+            # is 'exit_relay'. For other proxies loaded from the
+            # PROXY_LIST_FILES config option, the default tag is the full
+            # filename that the IP address originally came from.
+            tag = self.proxies.getTag(bridgeRequest.client)
+            logging.info("Client was from known proxy (tag: %s): %s" %
+                         (tag, bridgeRequest.client))
+            # Place Tor/proxy users into four groups.  This means that no
+            # matter how many different Tor Exits or proxies a client
+            # uses, the most they can ever get is four different sets of
+            # bridge lines (per period).
+            group = (int(ipaddr.IPAddress(bridgeRequest.client)) % 4) + 1
+            area = "known-proxy-group-%d" % group
+        # If the client wasn't using a proxy, select the client's cluster
+        # based upon the client's area (i.e. the /16 of the client's IP
+        # address):
         else:
             # Areas (i.e. /16s) are grouped into the number of rings specified
             # by the N_IP_CLUSTERS configuration option.
             area = self.areaMapper(bridgeRequest.client)
-            logging.debug("IP mapped to area:\t%s" % area)
-            clientCluster = int(self.areaClusterHmac(area)[:8], 16) % self.numberOfClusters
+            cluster = (int(self.areaClusterHmac(area)[:8], 16)
+                       % (self.numberOfClusters - 1))
 
         pos = self.areaOrderHmac("<%s>%s" % (interval, area))
-        filters = self._buildHashringFilters(bridgeRequest.filters, clientCluster)
+        filters = self._buildHashringFilters(bridgeRequest.filters, cluster)
 
+        logging.debug("Assigned client to cluster %d/%d" %
+                      (cluster, self.numberOfClusters))
         logging.debug("Assigned client hashring position based on: <%s>%s" %
                       (interval, area))
         logging.debug("Bridges in splitter:\t%d" % len(self.splitter))
@@ -349,7 +351,7 @@ class IPBasedDistributor(Distributor):
         # Otherwise, construct a new hashring and populate it:
         else:
             logging.debug("Cache miss %s" % filters)
-            key1 = getHMAC(self.splitter.key, "Order-Bridges-In-Ring-%d" % clientCluster)
+            key1 = getHMAC(self.splitter.key, "Order-Bridges-In-Ring-%d" % cluster)
             ring = bridgedb.Bridges.BridgeRing(key1, self.answerParameters)
             self.splitter.addRing(ring, filters, filterBridgesByRules(filters),
                                   populate_from=self.splitter.bridges)
diff --git a/lib/bridgedb/Main.py b/lib/bridgedb/Main.py
index c2e44ca..23ed262 100644
--- a/lib/bridgedb/Main.py
+++ b/lib/bridgedb/Main.py
@@ -210,17 +210,11 @@ def createBridgeRings(cfg, proxyList, key):
     # As appropriate, create an IP-based distributor.
     if cfg.HTTPS_DIST and cfg.HTTPS_SHARE:
         logging.debug("Setting up HTTPS Distributor...")
-        categories = []
-        if proxyList:
-            logging.debug("Adding proxyList to HTTPS Distributor categories.")
-            categories.append(proxyList)
-        logging.debug("HTTPS Distributor categories: '%s'" % categories)
-
         ipDistributor = Dist.IPBasedDistributor(
             Dist.uniformMap,
             cfg.N_IP_CLUSTERS,
             crypto.getHMAC(key, "HTTPS-IP-Dist-Key"),
-            categories,
+            proxyList,
             answerParameters=ringParams)
         splitter.addRing(ipDistributor, "https", cfg.HTTPS_SHARE)
 
@@ -306,17 +300,12 @@ def run(options, reactor=reactor):
 
     # Load the master key, or create a new one.
     key = crypto.getKey(config.MASTER_KEY_FILE)
-
-    # Get a proxy list.
-    proxyList = proxy.ProxySet()
-    for proxyfile in config.PROXY_LIST_FILES:
-        logging.info("Loading proxies from: %s" % proxyfile)
-        proxy.loadProxiesFromFile(proxyfile, proxyList)
-
-    emailDistributor = ipDistributor = None
+    proxies = proxy.ProxySet()
+    emailDistributor = None
+    ipDistributor = None
 
     # Save our state
-    state.proxyList = proxyList
+    state.proxies = proxies
     state.key = key
     state.save()
 
@@ -363,26 +352,22 @@ def run(options, reactor=reactor):
         level = getattr(logging, level)
         logging.getLogger().setLevel(level)
 
-        logging.debug("Saving state again before reparsing descriptors...")
-        state.save()
-        logging.info("Reparsing bridge descriptors...")
+        logging.info("Reloading the list of open proxies...")
+        for proxyfile in cfg.PROXY_LIST_FILES:
+            logging.info("Loading proxies from: %s" % proxyfile)
+            proxy.loadProxiesFromFile(proxyfile, state.proxies, removeStale=True)
 
+        logging.info("Reparsing bridge descriptors...")
         (splitter,
          emailDistributorTmp,
-         ipDistributorTmp) = createBridgeRings(cfg, proxyList, key)
+         ipDistributorTmp) = createBridgeRings(cfg, state.proxies, key)
+        logging.info("Bridges loaded: %d" % len(splitter))
 
         # Initialize our DB.
         bridgedb.Storage.initializeDBLock()
         bridgedb.Storage.setDBFilename(cfg.DB_FILE + ".sqlite")
         load(state, splitter, clear=False)
 
-        state = persistent.load()
-        logging.info("Bridges loaded: %d" % len(splitter))
-
-        logging.debug("Replacing the list of open proxies...")
-        for proxyfile in cfg.PROXY_LIST_FILES:
-            proxy.loadProxiesFromFile(proxyfile, state.proxyList, removeStale=True)
-
         if emailDistributorTmp is not None:
             emailDistributorTmp.prepopulateRings() # create default rings
             logging.info("Bridges allotted for %s distribution: %d"
@@ -462,7 +447,7 @@ def run(options, reactor=reactor):
         if config.TASKS['GET_TOR_EXIT_LIST']:
             tasks['GET_TOR_EXIT_LIST'] = task.LoopingCall(
                 proxy.downloadTorExits,
-                proxyList,
+                state.proxies,
                 config.SERVER_PUBLIC_EXTERNAL_IP)
 
         # Schedule all configured repeating tasks:
diff --git a/lib/bridgedb/test/legacy_Tests.py b/lib/bridgedb/test/legacy_Tests.py
index 97134e4..7d7d236 100644
--- a/lib/bridgedb/test/legacy_Tests.py
+++ b/lib/bridgedb/test/legacy_Tests.py
@@ -143,8 +143,8 @@ simpleDesc = "router Unnamed %s %s 0 9030\n"\
 orAddress = "or-address %s:%s\n"
 
 
-class RhymesWith255Category:
-    def contains(self, ip):
+class RhymesWith255ProxySet:
+    def __contains__(self, ip):
         return ip.endswith(".255")
 
 class EmailBridgeDistTests(unittest.TestCase):
@@ -194,15 +194,14 @@ class IPBridgeDistTests(unittest.TestCase):
         n2 = d.getBridges("1.2.3.4", "x", 2)
         self.assertEquals(n, n2)
 
-    def testDistWithCategories(self):
+    def testDistWithProxies(self):
         d = bridgedb.Dist.IPBasedDistributor(self.dumbAreaMapper, 3, "Foo",
-                                             [RhymesWith255Category()])
-        assert len(d.categories) == 1
+                                             [RhymesWith255ProxySet()])
         for _ in xrange(256):
             d.insert(fakeBridge())
 
         for _ in xrange(256):
-            # Make sure that the categories do not overlap
+            # Make sure that the ProxySets do not overlap
             f = lambda: ".".join([str(random.randrange(1,255)) for _ in xrange(4)])
             g = lambda: ".".join([str(random.randrange(1,255)) for _ in xrange(3)] + ['255'])
             n = d.getBridges(g(), "x", 10)
diff --git a/lib/bridgedb/test/test_Main.py b/lib/bridgedb/test/test_Main.py
index 9c8b68c..43e008c 100644
--- a/lib/bridgedb/test/test_Main.py
+++ b/lib/bridgedb/test/test_Main.py
@@ -278,8 +278,8 @@ class MainTests(unittest.TestCase):
         # Should have an IPBasedDistributor ring, an EmailDistributor ring,
         # and an UnallocatedHolder ring:
         self.assertEqual(len(splitter.ringsByName.keys()), 3)
-        self.assertGreater(len(httpsDist.categories), 0)
-        self.assertItemsEqual(exitRelays, httpsDist.categories[-1])
+        self.assertGreater(len(httpsDist.proxies), 0)
+        self.assertItemsEqual(exitRelays, httpsDist.proxies)
 
     def test_Main_createBridgeRings_no_https_dist(self):
         """When HTTPS_DIST=False, Main.createBridgeRings() should add only





More information about the tor-commits mailing list