[or-cvs] r14318: When we remove old routers, use Bloom filters rather than a digestmap-based set (in tor/trunk: . src/common src/or)

nickm at seul.org nickm at seul.org
Tue Apr 8 17:06:42 UTC 2008


Author: nickm
Date: 2008-04-08 13:06:41 -0400 (Tue, 08 Apr 2008)
New Revision: 14318

Modified:
   tor/trunk/
   tor/trunk/ChangeLog
   tor/trunk/src/common/container.c
   tor/trunk/src/common/container.h
   tor/trunk/src/or/routerlist.c
   tor/trunk/src/or/test.c
Log:
 r19233 at catbus:  nickm | 2008-04-08 13:06:34 -0400
 When we remove old routers, use Bloom filters rather than a digestmap-based set in order to tell which ones we absolutely need to keep.  This will save us roughly a kazillion little short-lived allocations for hash table entries.



Property changes on: tor/trunk
___________________________________________________________________
 svk:merge ticket from /tor/trunk [r19233] on 8246c3cf-6607-4228-993b-4d95d33730f1

Modified: tor/trunk/ChangeLog
===================================================================
--- tor/trunk/ChangeLog	2008-04-08 17:06:38 UTC (rev 14317)
+++ tor/trunk/ChangeLog	2008-04-08 17:06:41 UTC (rev 14318)
@@ -51,6 +51,10 @@
     - New --hush command-line option similar to --quiet.  While --quiet
       disables all logging to the console on startup, --hush limits the
       output to messages of warning and error severity.
+    - Use a Bloom filter rather than a digest-based set to track which
+      descriptors we need to keep around when we're cleaning out old
+      router descriptors.  This speeds up the computation significantly, and
+      may reduce fragmentation.
 
   o Code simplifications and refactoring:
     - Refactor code using connection_ap_handshake_attach_circuit() to

Modified: tor/trunk/src/common/container.c
===================================================================
--- tor/trunk/src/common/container.c	2008-04-08 17:06:38 UTC (rev 14317)
+++ tor/trunk/src/common/container.c	2008-04-08 17:06:41 UTC (rev 14318)
@@ -1192,3 +1192,23 @@
 IMPLEMENT_ORDER_FUNC(find_nth_uint32, uint32_t)
 IMPLEMENT_ORDER_FUNC(find_nth_long, long)
 
+/** Return a newly allocated digestset_t, sized to hold a total of
+ * <b>max_elements</b> digests with a reasonably low false-positive rate. */
+digestset_t *
+digestset_new(int max_elements)
+{
+  int n_bits = 1u << (tor_log2(max_elements)+5); /* 32 * 2^tor_log2(max_elements): a power of two */
+  digestset_t *r = tor_malloc(sizeof(digestset_t));
+  r->mask = n_bits - 1; /* digestset_add()/digestset_isin() AND their hash values with this */
+  r->ba = bitarray_init_zero(n_bits); /* presumably starts with every bit clear -- confirm */
+  return r;
+}
+
+/** Free all storage held in <b>set</b>: its bit array, then the struct. */
+void
+digestset_free(digestset_t *set)
+{
+  bitarray_free(set->ba); /* set is dereferenced here, so it must not be NULL */
+  tor_free(set);
+}
+

Modified: tor/trunk/src/common/container.h
===================================================================
--- tor/trunk/src/common/container.h	2008-04-08 17:06:38 UTC (rev 14317)
+++ tor/trunk/src/common/container.h	2008-04-08 17:06:41 UTC (rev 14318)
@@ -564,6 +564,49 @@
   return b[bit >> BITARRAY_SHIFT] & (1u << (bit & BITARRAY_MASK));
 }
 
+/** A set of digests, implemented as a Bloom filter (see digestset_add() and digestset_isin()). */
+typedef struct {
+  int mask; /* One less than the number of bits in <b>ba</b>; always one less
+             * than a power of two, so (x & mask) is a valid bit index. */
+  bitarray_t *ba; /* The bit array that stores the Bloom filter's bits. */
+} digestset_t;
+
+#define BIT(n) ((n) & set->mask) /* Reduce n to a bit index; relies on a local named set. */
+/** Add the digest <b>digest</b> to <b>set</b>.  Reads the first 16 bytes of <b>digest</b> and sets up to four bits. */
+static INLINE void
+digestset_add(digestset_t *set, const char *digest)
+{
+  const uint32_t *p = (const uint32_t *)digest; /* NOTE(review): assumes digest is aligned for uint32_t access -- confirm */
+  const uint32_t d1 = p[0] + (p[1]>>16); /* each value: one word plus the high 16 bits of the next */
+  const uint32_t d2 = p[1] + (p[2]>>16);
+  const uint32_t d3 = p[2] + (p[3]>>16);
+  const uint32_t d4 = p[3] + (p[0]>>16); /* the last one wraps around to word 0 */
+  bitarray_set(set->ba, BIT(d1));
+  bitarray_set(set->ba, BIT(d2));
+  bitarray_set(set->ba, BIT(d3));
+  bitarray_set(set->ba, BIT(d4));
+}
+
+/** Return nonzero if every filter bit derived from <b>digest</b> is set in <b>set</b>, as is the case once <b>digest</b> has been added.
+ * Otherwise return zero; a digest that was never added can still match (a false positive). */
+static INLINE int
+digestset_isin(const digestset_t *set, const char *digest)
+{
+  const uint32_t *p = (const uint32_t *)digest; /* same index derivation as digestset_add() */
+  const uint32_t d1 = p[0] + (p[1]>>16);
+  const uint32_t d2 = p[1] + (p[2]>>16);
+  const uint32_t d3 = p[2] + (p[3]>>16);
+  const uint32_t d4 = p[3] + (p[0]>>16);
+  return bitarray_is_set(set->ba, BIT(d1)) &&
+         bitarray_is_set(set->ba, BIT(d2)) &&
+         bitarray_is_set(set->ba, BIT(d3)) &&
+         bitarray_is_set(set->ba, BIT(d4));
+}
+#undef BIT
+
+digestset_t *digestset_new(int max_elements);
+void digestset_free(digestset_t* set);
+
 /* These functions, given an <b>array</b> of <b>n_elements</b>, return the
  * <b>nth</b> lowest element. <b>nth</b>=0 gives the lowest element;
  * <b>n_elements</b>-1 gives the highest; and (<b>n_elements</b>-1) / 2 gives

Modified: tor/trunk/src/or/routerlist.c
===================================================================
--- tor/trunk/src/or/routerlist.c	2008-04-08 17:06:38 UTC (rev 14317)
+++ tor/trunk/src/or/routerlist.c	2008-04-08 17:06:41 UTC (rev 14318)
@@ -2957,7 +2957,7 @@
 static void
 routerlist_remove_old_cached_routers_with_id(time_t now,
                                              time_t cutoff, int lo, int hi,
-                                             digestmap_t *retain)
+                                             digestset_t *retain)
 {
   int i, n = hi-lo+1;
   unsigned n_extra, n_rmv = 0;
@@ -2974,10 +2974,9 @@
     tor_assert(!memcmp(ident, r->identity_digest, DIGEST_LEN));
   }
 #endif
-
   /* Check whether we need to do anything at all. */
   {
-    int mdpr = directory_caches_dir_info(get_options()) ? 5 : 2;
+    int mdpr = directory_caches_dir_info(get_options()) ? 2 : 1;
     if (n <= mdpr)
       return;
     n_extra = n - mdpr;
@@ -2993,7 +2992,7 @@
     signed_descriptor_t *r_next;
     lifespans[i-lo].idx = i;
     if (r->last_listed_as_valid_until >= now ||
-        (retain && digestmap_get(retain, r->signed_descriptor_digest))) {
+        (retain && digestset_isin(retain, r->signed_descriptor_digest))) {
       must_keep[i-lo] = 1;
     }
     if (i < hi) {
@@ -3049,10 +3048,11 @@
   time_t cutoff;
   routerinfo_t *router;
   signed_descriptor_t *sd;
-  digestmap_t *retain;
+  digestset_t *retain;
   int caches = directory_caches_dir_info(get_options());
   const networkstatus_t *consensus = networkstatus_get_latest_consensus();
   const smartlist_t *networkstatus_v2_list = networkstatus_get_v2_list();
+  int n_expected_retain = 0;
 
   trusted_dirs_remove_old_certs();
 
@@ -3061,7 +3061,18 @@
 
   // routerlist_assert_ok(routerlist);
 
-  retain = digestmap_new();
+  n_expected_retain = smartlist_len(consensus->routerstatus_list);
+  if (caches &&
+      networkstatus_v2_list && smartlist_len(networkstatus_v2_list)) {
+    SMARTLIST_FOREACH(networkstatus_v2_list, networkstatus_v2_t *, ns,
+                      n_expected_retain += smartlist_len(ns->entries));
+    /*XXXX021 too much magic. */
+    n_expected_retain /= (smartlist_len(networkstatus_v2_list)/2+1);
+  }
+  //log_notice(LD_DIR,"n_expected_retain=%d",n_expected_retain);
+
+  retain = digestset_new(n_expected_retain);
+
   cutoff = now - OLD_ROUTER_DESC_MAX_AGE;
   /* Build a list of all the descriptors that _anybody_ lists. */
   if (caches) {
@@ -3077,7 +3088,7 @@
        * system will obsolete this whole thing in 0.2.0.x. */
       SMARTLIST_FOREACH(ns->entries, routerstatus_t *, rs,
         if (rs->published_on >= cutoff)
-          digestmap_set(retain, rs->descriptor_digest, (void*)1));
+          digestset_add(retain, rs->descriptor_digest));
     });
   }
 
@@ -3085,13 +3096,13 @@
   if (consensus) {
     SMARTLIST_FOREACH(consensus->routerstatus_list, routerstatus_t *, rs,
         if (rs->published_on >= cutoff)
-          digestmap_set(retain, rs->descriptor_digest, (void*)1));
+          digestset_add(retain, rs->descriptor_digest));
   }
 
-  /* If we have a bunch of networkstatuses, we should consider pruning current
-   * routers that are too old and that nobody recommends.  (If we don't have
-   * enough networkstatuses, then we should get more before we decide to kill
-   * routers.) */
+  /* If we have nearly as many networkstatuses as we want, we should consider
+   * pruning current routers that are too old and that nobody recommends.  (If
+   * we don't have enough networkstatuses, then we should get more before we
+   * decide to kill routers.) */
   if (!caches ||
       smartlist_len(networkstatus_v2_list) > get_n_v2_authorities() / 2) {
     cutoff = now - ROUTER_MAX_AGE;
@@ -3100,7 +3111,8 @@
       router = smartlist_get(routerlist->routers, i);
       if (router->cache_info.published_on <= cutoff &&
           router->cache_info.last_listed_as_valid_until < now &&
-          !digestmap_get(retain,router->cache_info.signed_descriptor_digest)) {
+          !digestset_isin(retain,
+                          router->cache_info.signed_descriptor_digest)) {
         /* Too old: remove it.  (If we're a cache, just move it into
          * old_routers.) */
         log_info(LD_DIR,
@@ -3120,7 +3132,7 @@
     sd = smartlist_get(routerlist->old_routers, i);
     if (sd->published_on <= cutoff &&
         sd->last_listed_as_valid_until < now &&
-        !digestmap_get(retain, sd->signed_descriptor_digest)) {
+        !digestset_isin(retain, sd->signed_descriptor_digest)) {
       /* Too old.  Remove it. */
       routerlist_remove_old(routerlist, sd, i--);
     }
@@ -3128,11 +3140,9 @@
 
   //routerlist_assert_ok(routerlist);
 
-  log_info(LD_DIR, "We have %d live routers and %d old router descriptors. "
-           "At most %d must be retained because of networkstatuses.",
+  log_info(LD_DIR, "We have %d live routers and %d old router descriptors.",
            smartlist_len(routerlist->routers),
-           smartlist_len(routerlist->old_routers),
-           digestmap_size(retain));
+           smartlist_len(routerlist->old_routers));
 
   /* Now we might have to look at routerlist->old_routers for extraneous
    * members. (We'd keep all the members if we could, but we need to save
@@ -3141,9 +3151,10 @@
    * total number doesn't approach max_descriptors_per_router()*len(router).
    */
   if (smartlist_len(routerlist->old_routers) <
-      smartlist_len(routerlist->routers) * (caches?4:2))
+      smartlist_len(routerlist->routers))
     goto done;
 
+  /* Sort by identity, then fix indices. */
   smartlist_sort(routerlist->old_routers, _compare_old_routers_by_identity);
   /* Fix indices. */
   for (i = 0; i < smartlist_len(routerlist->old_routers); ++i) {
@@ -3171,7 +3182,7 @@
   //routerlist_assert_ok(routerlist);
 
  done:
-  digestmap_free(retain, NULL);
+  digestset_free(retain);
 }
 
 /** We just added a new set of descriptors. Take whatever extra steps

Modified: tor/trunk/src/or/test.c
===================================================================
--- tor/trunk/src/or/test.c	2008-04-08 17:06:38 UTC (rev 14317)
+++ tor/trunk/src/or/test.c	2008-04-08 17:06:41 UTC (rev 14318)
@@ -1922,6 +1922,40 @@
   bitarray_free(ba);
 }
 
+static void
+test_util_digestset(void)
+{
+  smartlist_t *included = smartlist_create(); /* heap copies of the digests that get added */
+  char d[DIGEST_LEN];
+  int i;
+  int ok = 1;
+  int false_positives = 0;
+  digestset_t *set;
+  /* Plan: an empty set matches nothing, every added digest matches, and fresh digests rarely match. */
+  for (i = 0; i < 1000; ++i) {
+    crypto_rand(d, DIGEST_LEN);
+    smartlist_add(included, tor_memdup(d, DIGEST_LEN));
+  }
+  set = digestset_new(1000); /* sized for exactly the number of digests added below */
+  SMARTLIST_FOREACH(included, const char *, cp,
+                    if (digestset_isin(set, cp))
+                      ok = 0);
+  test_assert(ok); /* nothing may be reported present before anything is added */
+  SMARTLIST_FOREACH(included, const char *, cp,
+                    digestset_add(set, cp));
+  SMARTLIST_FOREACH(included, const char *, cp,
+                    if (!digestset_isin(set, cp))
+                      ok = 0);
+  test_assert(ok); /* every added digest must be reported present */
+  for (i = 0; i < 1000; ++i) {
+    crypto_rand(d, DIGEST_LEN);
+    if (digestset_isin(set, d))
+      ++false_positives;
+  }
+  test_assert(false_positives < 50); /* Under 5% of the 1000 probes; should be far lower. */
+  digestset_free(set); /* NOTE(review): included and its tor_memdup() copies are not freed in this function */
+}
+
 /* stop threads running at once. */
 static tor_mutex_t *_thread_test_mutex = NULL;
 /* make sure that threads have to run at the same time. */
@@ -3362,6 +3396,69 @@
 }
 
 static void
+bench_dmap(void)
+{
+  smartlist_t *sl = smartlist_create(); /* digests that are inserted into both containers */
+  smartlist_t *sl2 = smartlist_create(); /* digests that are only looked up, never inserted */
+  struct timeval start, end, pt2, pt3, pt4;
+  const int iters = 10000;
+  const int elts = 4000;
+  const int fpostests = 1000000;
+  char d[20];
+  int i,n=0, fp = 0;
+  digestmap_t *dm = digestmap_new();
+  digestset_t *ds = digestset_new(elts);
+  /* Benchmark: compare digestmap_set/get with digestset_add/isin on elts digests over iters passes; prints to stdout. */
+  for (i = 0; i < elts; ++i) {
+    crypto_rand(d, 20);
+    smartlist_add(sl, tor_memdup(d, 20));
+  }
+  for (i = 0; i < elts; ++i) {
+    crypto_rand(d, 20);
+    smartlist_add(sl2, tor_memdup(d, 20));
+  }
+  printf("nbits=%d\n", ds->mask+1);
+  /* Four timed phases follow: digestmap_set, digestmap_get, digestset_add, digestset_isin. */
+  tor_gettimeofday(&start);
+  for (i = 0; i < iters; ++i) {
+    SMARTLIST_FOREACH(sl, const char *, cp, digestmap_set(dm, cp, (void*)1));
+  }
+  tor_gettimeofday(&pt2);
+  for (i = 0; i < iters; ++i) {
+    SMARTLIST_FOREACH(sl, const char *, cp, digestmap_get(dm, cp));
+    SMARTLIST_FOREACH(sl2, const char *, cp, digestmap_get(dm, cp));
+  }
+  tor_gettimeofday(&pt3);
+  for (i = 0; i < iters; ++i) {
+    SMARTLIST_FOREACH(sl, const char *, cp, digestset_add(ds, cp));
+  }
+  tor_gettimeofday(&pt4);
+  for (i = 0; i < iters; ++i) {
+    SMARTLIST_FOREACH(sl, const char *, cp, n += digestset_isin(ds, cp));
+    SMARTLIST_FOREACH(sl2, const char *, cp, n += digestset_isin(ds, cp));
+  }
+  tor_gettimeofday(&end);
+  /* Untimed: count how often freshly generated digests test positive in ds. */
+  for (i = 0; i < fpostests; ++i) {
+    crypto_rand(d, 20);
+    if (digestset_isin(ds, d)) ++fp;
+  }
+  /* Report the four phase durations in order (tv_udiff() presumably yields microseconds -- confirm). */
+  printf("%ld\n",(unsigned long)tv_udiff(&start, &pt2)); /* NOTE(review): "%ld" is paired with an unsigned long argument; "%lu" would match */
+  printf("%ld\n",(unsigned long)tv_udiff(&pt2, &pt3));
+  printf("%ld\n",(unsigned long)tv_udiff(&pt3, &pt4));
+  printf("%ld\n",(unsigned long)tv_udiff(&pt4, &end));
+  printf("-- %d\n", n); /* sum of the digestset_isin() results accumulated in the timed loop */
+  printf("++ %f\n", fp/(double)fpostests); /* fraction of the fpostests probes that tested positive */
+  digestmap_free(dm, NULL);
+  digestset_free(ds);
+  SMARTLIST_FOREACH(sl, char *, cp, tor_free(cp));
+  SMARTLIST_FOREACH(sl2, char *, cp, tor_free(cp));
+  smartlist_free(sl);
+  smartlist_free(sl2);
+}
+
+static void
 test_util_mempool(void)
 {
   mp_pool_t *pool;
@@ -3850,6 +3947,7 @@
   SUBENT(util, datadir),
   SUBENT(util, smartlist),
   SUBENT(util, bitarray),
+  SUBENT(util, digestset),
   SUBENT(util, mempool),
   SUBENT(util, memarea),
   SUBENT(util, strmap),
@@ -3960,6 +4058,11 @@
     return 0;
   }
 
+  if (0) {
+    bench_dmap();
+    return 0;
+  }
+
   atexit(remove_directory);
 
   printf("Running Tor unit tests on %s\n", get_uname());



More information about the tor-commits mailing list