[tor-commits] [tor/master] Refactor bloom filter logic not to be digest-specific.

nickm at torproject.org nickm at torproject.org
Tue Jun 26 21:01:03 UTC 2018


commit bf89278c799e718a3b6c1ea139a6069db054ac09
Author: Nick Mathewson <nickm at torproject.org>
Date:   Tue Jun 26 13:18:23 2018 -0400

    Refactor bloom filter logic not to be digest-specific.
    
    Now the address-set code and the digest-set code share the same
    backend.
    
    Closes ticket 26510
---
 changes/ticket26510           |  4 ++
 src/common/address_set.c      | 85 ++++++++-----------------------------------
 src/common/address_set.h      | 12 ++----
 src/lib/container/bloomfilt.c | 79 +++++++++++++++++++++++++++++++++++++---
 src/lib/container/bloomfilt.h | 71 ++++++++++++------------------------
 src/lib/crypt_ops/digestset.c | 58 +++++++++++++++++++++++++++++
 src/lib/crypt_ops/digestset.h | 32 ++++++++++++++++
 src/lib/crypt_ops/include.am  |  6 ++-
 src/or/entrynodes.c           |  2 +-
 src/or/routerlist.c           |  2 +-
 src/test/bench.c              |  4 +-
 src/test/test_containers.c    |  2 +-
 12 files changed, 219 insertions(+), 138 deletions(-)

diff --git a/changes/ticket26510 b/changes/ticket26510
new file mode 100644
index 000000000..f00457964
--- /dev/null
+++ b/changes/ticket26510
@@ -0,0 +1,4 @@
+  o Code simplification and refactoring:
+    - Unify our bloom filter logic. Previously we had two copies of this
+      code: one for routerlist filtering, and one for address set
+      calculations. Closes ticket 26510.
diff --git a/src/common/address_set.c b/src/common/address_set.c
index f4c5f581c..369d33e9a 100644
--- a/src/common/address_set.c
+++ b/src/common/address_set.c
@@ -14,35 +14,19 @@
 #include "common/address_set.h"
 #include "common/address.h"
 #include "common/compat.h"
-#include "lib/container/bitarray.h"
+#include "lib/container/bloomfilt.h"
 #include "lib/crypt_ops/crypto_rand.h"
 #include "common/util.h"
 #include "siphash.h"
 
-/** How many 64-bit siphash values to extract per address */
-#define N_HASHES 2
-/** How many bloom-filter bits we set per address. This is twice the N_HASHES
- * value, since we split the siphash output into two 32-bit values. */
-#define N_BITS_PER_ITEM (N_HASHES * 2)
-
-/* XXXX This code is largely duplicated with digestset_t.  We should merge
- * them together into a common bloom-filter implementation.  I'm keeping
- * them separate for now, though, since this module needs to be backported
- * all the way to 0.2.9.
- *
- * The main difference between digestset_t and this code is that we use
- * independent siphashes rather than messing around with bit-shifts.  The
- * approach here is probably more sound, and we should prefer it if&when we
- * unify the implementations.
- */
-
-struct address_set_t {
-  /** siphash keys to make N_HASHES independent hashes for each address. */
-  struct sipkey key[N_HASHES];
-  int mask; /**< One less than the number of bits in <b>ba</b>; always one less
-             * than a power of two. */
-  bitarray_t *ba; /**< A bit array to implement the Bloom filter. */
-};
+/* Wrap our hash function to have the signature that the bloom filter
+ * needs. */
+static uint64_t
+bloomfilt_addr_hash(const struct sipkey *key,
+                    const void *item)
+{
+  return tor_addr_keyed_hash(key, item);
+}
 
 /**
  * Allocate and return an address_set, suitable for holding up to
@@ -51,33 +35,11 @@ struct address_set_t {
 address_set_t *
 address_set_new(int max_addresses_guess)
 {
-  /* See digestset_new() for rationale on this equation. */
-  int n_bits = 1u << (tor_log2(max_addresses_guess)+5);
-
-  address_set_t *set = tor_malloc_zero(sizeof(address_set_t));
-  set->mask = n_bits - 1;
-  set->ba = bitarray_init_zero(n_bits);
-  crypto_rand((char*) set->key, sizeof(set->key));
-
-  return set;
-}
-
-/**
- * Release all storage associated with <b>set</b>.
- */
-void
-address_set_free(address_set_t *set)
-{
-  if (! set)
-    return;
-
-  bitarray_free(set->ba);
-  tor_free(set);
+  uint8_t k[BLOOMFILT_KEY_LEN];
+  crypto_rand((void*)k, sizeof(k));
+  return bloomfilt_new(max_addresses_guess, bloomfilt_addr_hash, k);
 }
 
-/** Yield the bit index corresponding to 'val' for set. */
-#define BIT(set, val) ((val) & (set)->mask)
-
 /**
  * Add <b>addr</b> to <b>set</b>.
  *
@@ -87,14 +49,7 @@ address_set_free(address_set_t *set)
 void
 address_set_add(address_set_t *set, const struct tor_addr_t *addr)
 {
-  int i;
-  for (i = 0; i < N_HASHES; ++i) {
-    uint64_t h = tor_addr_keyed_hash(&set->key[i], addr);
-    uint32_t high_bits = (uint32_t)(h >> 32);
-    uint32_t low_bits = (uint32_t)(h);
-    bitarray_set(set->ba, BIT(set, high_bits));
-    bitarray_set(set->ba, BIT(set, low_bits));
-  }
+  bloomfilt_add(set, addr);
 }
 
 /** As address_set_add(), but take an ipv4 address in host order. */
@@ -111,18 +66,8 @@ address_set_add_ipv4h(address_set_t *set, uint32_t addr)
  * return false if <b>addr</b> is not a member of set.)
  */
 int
-address_set_probably_contains(address_set_t *set,
+address_set_probably_contains(const address_set_t *set,
                               const struct tor_addr_t *addr)
 {
-  int i, matches = 0;
-  for (i = 0; i < N_HASHES; ++i) {
-    uint64_t h = tor_addr_keyed_hash(&set->key[i], addr);
-    uint32_t high_bits = (uint32_t)(h >> 32);
-    uint32_t low_bits = (uint32_t)(h);
-    // Note that !! is necessary here, since bitarray_is_set does not
-    // necessarily return 1 on true.
-    matches += !! bitarray_is_set(set->ba, BIT(set, high_bits));
-    matches += !! bitarray_is_set(set->ba, BIT(set, low_bits));
-  }
-  return matches == N_BITS_PER_ITEM;
+  return bloomfilt_probably_contains(set, addr);
 }
diff --git a/src/common/address_set.h b/src/common/address_set.h
index cfee89cfb..2efa1cb03 100644
--- a/src/common/address_set.h
+++ b/src/common/address_set.h
@@ -4,10 +4,6 @@
 /**
  * \file address_set.h
  * \brief Types to handle sets of addresses.
- *
- * This module was first written on a semi-emergency basis to improve the
- * robustness of the anti-DoS module.  As such, it's written in a pretty
- * conservative way, and should be susceptible to improvement later on.
  **/
 
 #ifndef TOR_ADDRESS_SET_H
@@ -15,21 +11,21 @@
 
 #include "orconfig.h"
 #include "lib/cc/torint.h"
+#include "lib/container/bloomfilt.h"
 
 /**
  * An address_set_t represents a set of tor_addr_t values. The implementation
  * is probabilistic: false negatives cannot occur but false positives are
  * possible.
  */
-typedef struct address_set_t address_set_t;
+typedef struct bloomfilt_t address_set_t;
 struct tor_addr_t;
 
 address_set_t *address_set_new(int max_addresses_guess);
-void address_set_free(address_set_t *set);
+#define address_set_free(set) bloomfilt_free(set)
 void address_set_add(address_set_t *set, const struct tor_addr_t *addr);
 void address_set_add_ipv4h(address_set_t *set, uint32_t addr);
-int address_set_probably_contains(address_set_t *set,
+int address_set_probably_contains(const address_set_t *set,
                                   const struct tor_addr_t *addr);
 
 #endif
-
diff --git a/src/lib/container/bloomfilt.c b/src/lib/container/bloomfilt.c
index 2133c280a..1cab817e1 100644
--- a/src/lib/container/bloomfilt.c
+++ b/src/lib/container/bloomfilt.c
@@ -9,14 +9,75 @@
  **/
 
 #include <stdlib.h>
+
 #include "lib/malloc/util_malloc.h"
 #include "lib/container/bloomfilt.h"
 #include "lib/intmath/bits.h"
+#include "lib/log/util_bug.h"
+#include "siphash.h"
+
+/** How many bloom-filter bits we set per address. This is twice the
+ * BLOOMFILT_N_HASHES value, since we split the siphash output into two 32-bit
+ * values. */
+#define N_BITS_PER_ITEM (BLOOMFILT_N_HASHES * 2)
+
+struct bloomfilt_t {
+  /** siphash keys to make BLOOMFILT_N_HASHES independent hashes for each
+   * items. */
+  struct sipkey key[BLOOMFILT_N_HASHES];
+  bloomfilt_hash_fn hashfn; /**< Function used to generate hashes */
+  uint32_t mask; /**< One less than the number of bits in <b>ba</b>; always
+                  * one less than a power of two. */
+  bitarray_t *ba; /**< A bit array to implement the Bloom filter. */
+};
+
+#define BIT(set, n) ((n) & (set)->mask)
+
+/** Add the element <b>item</b> to <b>set</b>. */
+void
+bloomfilt_add(bloomfilt_t *set,
+              const void *item)
+{
+  int i;
+  for (i = 0; i < BLOOMFILT_N_HASHES; ++i) {
+    uint64_t h = set->hashfn(&set->key[i], item);
+    uint32_t high_bits = (uint32_t)(h >> 32);
+    uint32_t low_bits = (uint32_t)(h);
+    bitarray_set(set->ba, BIT(set, high_bits));
+    bitarray_set(set->ba, BIT(set, low_bits));
+  }
+}
 
-/** Return a newly allocated digestset_t, optimized to hold a total of
- * <b>max_elements</b> digests with a reasonably low false positive weight. */
-digestset_t *
-digestset_new(int max_elements)
+/** If <b>item</b> is in <b>set</b>, return nonzero.  Otherwise,
+ * <em>probably</em> return zero. */
+int
+bloomfilt_probably_contains(const bloomfilt_t *set,
+                            const void *item)
+{
+  int i, matches = 0;
+  for (i = 0; i < BLOOMFILT_N_HASHES; ++i) {
+    uint64_t h = set->hashfn(&set->key[i], item);
+    uint32_t high_bits = (uint32_t)(h >> 32);
+    uint32_t low_bits = (uint32_t)(h);
+    // Note that !! is necessary here, since bitarray_is_set does not
+    // necessarily return 1 on true.
+    matches += !! bitarray_is_set(set->ba, BIT(set, high_bits));
+    matches += !! bitarray_is_set(set->ba, BIT(set, low_bits));
+  }
+  return matches == N_BITS_PER_ITEM;
+}
+
+/** Return a newly allocated bloomfilt_t, optimized to hold a total of
+ * <b>max_elements</b> elements with a reasonably low false positive weight.
+ *
+ * Uses the siphash-based function <b>hashfn</b> to compute hard-to-collide
+ * functions of the items, and the key material <b>random_key</b> to
+ * key the hash.  There must be BLOOMFILT_KEY_LEN bytes in the supplied key.
+ **/
+bloomfilt_t *
+bloomfilt_new(int max_elements,
+              bloomfilt_hash_fn hashfn,
+              const uint8_t *random_key)
 {
   /* The probability of false positives is about P=(1 - exp(-kn/m))^k, where k
    * is the number of hash functions per entry, m is the bits in the array,
@@ -29,15 +90,21 @@ digestset_new(int max_elements)
    * conserve CPU, and k==13 is pretty big.
    */
   int n_bits = 1u << (tor_log2(max_elements)+5);
-  digestset_t *r = tor_malloc(sizeof(digestset_t));
+  bloomfilt_t *r = tor_malloc(sizeof(bloomfilt_t));
   r->mask = n_bits - 1;
   r->ba = bitarray_init_zero(n_bits);
+
+  tor_assert(sizeof(r->key) == BLOOMFILT_KEY_LEN);
+  memcpy(r->key, random_key, sizeof(r->key));
+
+  r->hashfn = hashfn;
+
   return r;
 }
 
 /** Free all storage held in <b>set</b>. */
 void
-digestset_free_(digestset_t *set)
+bloomfilt_free_(bloomfilt_t *set)
 {
   if (!set)
     return;
diff --git a/src/lib/container/bloomfilt.h b/src/lib/container/bloomfilt.h
index adcdb10fc..577acf5e4 100644
--- a/src/lib/container/bloomfilt.h
+++ b/src/lib/container/bloomfilt.h
@@ -9,50 +9,27 @@
 #include "orconfig.h"
 #include "lib/cc/torint.h"
 #include "lib/container/bitarray.h"
-#include "siphash.h"
-
-/** A set of digests, implemented as a Bloom filter. */
-typedef struct {
-  int mask; /**< One less than the number of bits in <b>ba</b>; always one less
-             * than a power of two. */
-  bitarray_t *ba; /**< A bit array to implement the Bloom filter. */
-} digestset_t;
-
-#define BIT(n) ((n) & set->mask)
-/** Add the digest <b>digest</b> to <b>set</b>. */
-static inline void
-digestset_add(digestset_t *set, const char *digest)
-{
-  const uint64_t x = siphash24g(digest, 20);
-  const uint32_t d1 = (uint32_t) x;
-  const uint32_t d2 = (uint32_t)( (x>>16) + x);
-  const uint32_t d3 = (uint32_t)( (x>>32) + x);
-  const uint32_t d4 = (uint32_t)( (x>>48) + x);
-  bitarray_set(set->ba, BIT(d1));
-  bitarray_set(set->ba, BIT(d2));
-  bitarray_set(set->ba, BIT(d3));
-  bitarray_set(set->ba, BIT(d4));
-}
-
-/** If <b>digest</b> is in <b>set</b>, return nonzero.  Otherwise,
- * <em>probably</em> return zero. */
-static inline int
-digestset_contains(const digestset_t *set, const char *digest)
-{
-  const uint64_t x = siphash24g(digest, 20);
-  const uint32_t d1 = (uint32_t) x;
-  const uint32_t d2 = (uint32_t)( (x>>16) + x);
-  const uint32_t d3 = (uint32_t)( (x>>32) + x);
-  const uint32_t d4 = (uint32_t)( (x>>48) + x);
-  return bitarray_is_set(set->ba, BIT(d1)) &&
-         bitarray_is_set(set->ba, BIT(d2)) &&
-         bitarray_is_set(set->ba, BIT(d3)) &&
-         bitarray_is_set(set->ba, BIT(d4));
-}
-#undef BIT
-
-digestset_t *digestset_new(int max_elements);
-void digestset_free_(digestset_t* set);
-#define digestset_free(set) FREE_AND_NULL(digestset_t, digestset_free_, (set))
-
-#endif /* !defined(TOR_CONTAINER_H) */
+
+/** A set of elements, implemented as a Bloom filter. */
+typedef struct bloomfilt_t bloomfilt_t;
+
+/** How many 64-bit siphash values to extract per item. */
+#define BLOOMFILT_N_HASHES 2
+
+/** How much key material do we need to randomize hashes? */
+#define BLOOMFILT_KEY_LEN (BLOOMFILT_N_HASHES * 16)
+
+struct sipkey;
+typedef uint64_t (*bloomfilt_hash_fn)(const struct sipkey *key,
+                                      const void *item);
+
+void bloomfilt_add(bloomfilt_t *set, const void *item);
+int bloomfilt_probably_contains(const bloomfilt_t *set, const void *item);
+
+bloomfilt_t *bloomfilt_new(int max_elements,
+                           bloomfilt_hash_fn hashfn,
+                           const uint8_t *random_key);
+void bloomfilt_free_(bloomfilt_t* set);
+#define bloomfilt_free(set) FREE_AND_NULL(bloomfilt_t, bloomfilt_free_, (set))
+
+#endif /* !defined(TOR_BLOOMFILT_H) */
diff --git a/src/lib/crypt_ops/digestset.c b/src/lib/crypt_ops/digestset.c
new file mode 100644
index 000000000..89dd377a9
--- /dev/null
+++ b/src/lib/crypt_ops/digestset.c
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018-2018, The Tor Project, Inc. */
+/* See LICENSE for licensing information */
+
+/**
+ * \file digestset.c
+ * \brief Implementation for a set of digests
+ **/
+
+#include "orconfig.h"
+#include "lib/container/bloomfilt.h"
+#include "lib/crypt_ops/crypto_rand.h"
+#include "lib/defs/digest_sizes.h"
+#include "lib/crypt_ops/digestset.h"
+#include "siphash.h"
+
+/* Wrap our hash function to have the signature that the bloom filter
+ * needs. */
+static uint64_t
+bloomfilt_digest_hash(const struct sipkey *key,
+                      const void *item)
+{
+  return siphash24(item, DIGEST_LEN, key);
+}
+
+/**
+ * Allocate and return an digestset, suitable for holding up to
+ * <b>max_guess</b> distinct values.
+ */
+digestset_t *
+digestset_new(int max_guess)
+{
+  uint8_t k[BLOOMFILT_KEY_LEN];
+  crypto_rand((void*)k, sizeof(k));
+  return bloomfilt_new(max_guess, bloomfilt_digest_hash, k);
+}
+
+/**
+ * Add <b>digest</b> to <b>set</b>.
+ *
+ * All future queries for <b>digest</b> in set will return true. Removing
+ * items is not possible.
+ */
+void
+digestset_add(digestset_t *set, const char *digest)
+{
+  bloomfilt_add(set, digest);
+}
+
+/**
+ * Return true if <b>digest</b> is a member of <b>set</b>.  (And probably,
+ * return false if <b>digest</b> is not a member of set.)
+ */
+int
+digestset_probably_contains(const digestset_t *set,
+                            const char *digest)
+{
+  return bloomfilt_probably_contains(set, digest);
+}
diff --git a/src/lib/crypt_ops/digestset.h b/src/lib/crypt_ops/digestset.h
new file mode 100644
index 000000000..3b5d62c31
--- /dev/null
+++ b/src/lib/crypt_ops/digestset.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2018, The Tor Project, Inc. */
+/* See LICENSE for licensing information */
+
+/**
+ * \file digestset.h
+ * \brief Types to handle sets of digests, based on bloom filters.
+ **/
+
+#ifndef TOR_DIGESTSET_H
+#define TOR_DIGESTSET_H
+
+#include "orconfig.h"
+#include "lib/cc/torint.h"
+#include "lib/container/bloomfilt.h"
+
+/**
+ * An digestset_t represents a set of 20-byte digest values. The
+ * implementation is probabilistic: false negatives cannot occur but false
+ * positives are possible.
+ */
+typedef struct bloomfilt_t digestset_t;
+
+digestset_t *digestset_new(int max_addresses_guess);
+#define digestset_free(set) bloomfilt_free(set)
+void digestset_add(digestset_t *set, const char *addr);
+int digestset_probably_contains(const digestset_t *set,
+                                const char *addr);
+
+// XXXX to remove.
+#define digestset_contains digestset_probably_contains
+
+#endif
diff --git a/src/lib/crypt_ops/include.am b/src/lib/crypt_ops/include.am
index b881c689d..1b88b880d 100644
--- a/src/lib/crypt_ops/include.am
+++ b/src/lib/crypt_ops/include.am
@@ -19,7 +19,8 @@ src_lib_libtor_crypt_ops_a_SOURCES =			\
 	src/lib/crypt_ops/crypto_rand.c			\
 	src/lib/crypt_ops/crypto_rsa.c			\
 	src/lib/crypt_ops/crypto_s2k.c			\
-	src/lib/crypt_ops/crypto_util.c
+	src/lib/crypt_ops/crypto_util.c                 \
+	src/lib/crypt_ops/digestset.c
 
 src_lib_libtor_crypt_ops_testing_a_SOURCES = \
 	$(src_lib_libtor_crypt_ops_a_SOURCES)
@@ -41,4 +42,5 @@ noinst_HEADERS +=					\
 	src/lib/crypt_ops/crypto_rand.h			\
 	src/lib/crypt_ops/crypto_rsa.h			\
 	src/lib/crypt_ops/crypto_s2k.h			\
-	src/lib/crypt_ops/crypto_util.h
+	src/lib/crypt_ops/crypto_util.h                 \
+	src/lib/crypt_ops/digestset.h
diff --git a/src/or/entrynodes.c b/src/or/entrynodes.c
index 3c63d3c1c..9d27d80a8 100644
--- a/src/or/entrynodes.c
+++ b/src/or/entrynodes.c
@@ -142,7 +142,7 @@
 #include "or/node_st.h"
 #include "or/origin_circuit_st.h"
 
-#include "lib/container/bloomfilt.h"
+#include "lib/crypt_ops/digestset.h"
 
 /** A list of existing guard selection contexts. */
 static smartlist_t *guard_contexts = NULL;
diff --git a/src/or/routerlist.c b/src/or/routerlist.c
index 5af09fad2..a7dda68a5 100644
--- a/src/or/routerlist.c
+++ b/src/or/routerlist.c
@@ -137,7 +137,7 @@
 #include "or/routerlist_st.h"
 #include "or/vote_routerstatus_st.h"
 
-#include "lib/container/bloomfilt.h"
+#include "lib/crypt_ops/digestset.h"
 
 // #define DEBUG_ROUTERLIST
 
diff --git a/src/test/bench.c b/src/test/bench.c
index 3cfdd0ae4..aa4af703f 100644
--- a/src/test/bench.c
+++ b/src/test/bench.c
@@ -29,7 +29,7 @@
 #include "or/cell_st.h"
 #include "or/or_circuit_st.h"
 
-#include "lib/container/bloomfilt.h"
+#include "lib/crypt_ops/digestset.h"
 
 #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_PROCESS_CPUTIME_ID)
 static uint64_t nanostart;
@@ -376,7 +376,7 @@ bench_dmap(void)
     crypto_rand(d, 20);
     smartlist_add(sl2, tor_memdup(d, 20));
   }
-  printf("nbits=%d\n", ds->mask+1);
+  //printf("nbits=%d\n", ds->mask+1);
 
   reset_perftime();
 
diff --git a/src/test/test_containers.c b/src/test/test_containers.c
index f45082be0..acb7543b3 100644
--- a/src/test/test_containers.c
+++ b/src/test/test_containers.c
@@ -10,8 +10,8 @@
 #include "test/test.h"
 
 #include "lib/container/bitarray.h"
-#include "lib/container/bloomfilt.h"
 #include "lib/container/order.h"
+#include "lib/crypt_ops/digestset.h"
 
 /** Helper: return a tristate based on comparing the strings in *<b>a</b> and
  * *<b>b</b>. */





More information about the tor-commits mailing list