[or-cvs] [tor/master] Implement the pareto fitting and timeout calculating bits.

arma at seul.org arma at seul.org
Thu Sep 17 01:46:21 UTC 2009


Author: Mike Perry <mikeperry-git at fscked.org>
Date: Thu, 27 Aug 2009 01:46:06 -0700
Subject: Implement the pareto fitting and timeout calculating bits.
Commit: 04414830fe199d80bddf67c64e17d32d54a385e4

---
 src/or/Makefile.am    |    4 +-
 src/or/circuitbuild.c |  285 +++++++++++++++++++++++++++++++++++++------------
 src/or/circuituse.c   |    2 +
 src/or/config.c       |    7 +-
 src/or/or.h           |   28 ++++-
 5 files changed, 244 insertions(+), 82 deletions(-)

diff --git a/src/or/Makefile.am b/src/or/Makefile.am
index 7d6c9eb..e9916d5 100644
--- a/src/or/Makefile.am
+++ b/src/or/Makefile.am
@@ -41,14 +41,14 @@ AM_CPPFLAGS = -DSHARE_DATADIR="\"$(datadir)\"" \
 tor_LDFLAGS = @TOR_LDFLAGS_zlib@ @TOR_LDFLAGS_openssl@ @TOR_LDFLAGS_libevent@
 tor_LDADD = ../common/libor.a ../common/libor-crypto.a \
 	../common/libor-event.a \
-	-lz -levent -lssl -lcrypto @TOR_LIB_WS32@ @TOR_LIB_GDI@
+	-lz -lm -levent -lssl -lcrypto @TOR_LIB_WS32@ @TOR_LIB_GDI@
 test_SOURCES = $(COMMON_SRC) test_data.c test.c
 
 test_LDFLAGS = @TOR_LDFLAGS_zlib@ @TOR_LDFLAGS_openssl@ \
         @TOR_LDFLAGS_libevent@
 test_LDADD = ../common/libor.a ../common/libor-crypto.a \
 	../common/libor-event.a \
-	-lz -levent -lssl -lcrypto @TOR_LIB_WS32@ @TOR_LIB_GDI@
+	-lz -lm -levent -lssl -lcrypto @TOR_LIB_WS32@ @TOR_LIB_GDI@
 
 noinst_HEADERS = or.h eventdns.h eventdns_tor.h micro-revision.i
 
diff --git a/src/or/circuitbuild.c b/src/or/circuitbuild.c
index 7334308..582567b 100644
--- a/src/or/circuitbuild.c
+++ b/src/or/circuitbuild.c
@@ -9,14 +9,23 @@
  * \brief The actual details of building circuits.
  **/
 
+#include <math.h>
+
+long int lround(double x);
+
+double
+ln(double x)
+{
+  return log(x);
+}
+#undef log
+
 #include "or.h"
+#include "crypto.h"
 
 /********* START VARIABLES **********/
 /** Global list of circuit build times */
-// XXX: Make this a smartlist..
-uint16_t circuit_build_times[NCIRCUITS_TO_OBSERVE];
-int build_times_idx = 0;
-int total_build_times = 0;
+circuit_build_times_t circ_times;
 
 /** A global list of all circuits at this hop. */
 extern circuit_t *global_circuitlist;
@@ -65,25 +74,24 @@ static int onion_append_hop(crypt_path_t **head_ptr, extend_info_t *choice);
 static void entry_guards_changed(void);
 static time_t start_of_month(time_t when);
 
-static int circuit_build_times_add_time(time_t time);
-/** circuit_build_times is a circular array, so loop around when
- *  array is full
+/**
+ * circuit_build_times is a circular array, so loop around when
+ * array is full
  *
  * time units are milliseconds
  */
-static
 int
-circuit_build_times_add_time(long time)
+circuit_build_times_add_time(circuit_build_times_t *cbt, long time)
 {
-  if(time > UINT16_MAX) {
+  if (time > UINT16_MAX) {
     log_notice(LD_CIRC,
-       "Circuit build time of %dms exceeds max. Capping at 65536ms", time);
+       "Circuit build time of %ldms exceeds max. Capping at 65536ms", time);
     time = UINT16_MAX;
   }
-  circuit_build_times[build_times_idx] = time;
-  build_times_idx = (build_times_idx + 1) % NCIRCUITS_TO_OBSERVE;
-  if(total_build_times + 1 < NCIRCUITS_TO_OBSERVE)
-    total_build_times++;
+  cbt->circuit_build_times[cbt->build_times_idx] = time;
+  cbt->build_times_idx = (cbt->build_times_idx + 1) % NCIRCUITS_TO_OBSERVE;
+  if (cbt->total_build_times + 1 < NCIRCUITS_TO_OBSERVE)
+    cbt->total_build_times++;
 
   return 0;
 }
@@ -91,15 +99,16 @@ circuit_build_times_add_time(long time)
 /**
  * Calculate histogram
  */
-void
-circuit_build_times_create_histogram(uint16_t * histogram)
+static void
+circuit_build_times_create_histogram(circuit_build_times_t *cbt,
+                                     uint16_t *histogram)
 {
- int i, c;
-   // calculate histogram
-  for(i = 0; i < NCIRCUITS_TO_OBSERVE; i++) {
-    if(circuit_build_times[i] == 0) continue; /* 0 <-> uninitialized */
+  int i, c;
+  // calculate histogram
+  for (i = 0; i < NCIRCUITS_TO_OBSERVE; i++) {
+    if (cbt->circuit_build_times[i] == 0) continue; /* 0 <-> uninitialized */
 
-    c = (circuit_build_times[i] / BUILDTIME_BIN_WIDTH);
+    c = (cbt->circuit_build_times[i] / BUILDTIME_BIN_WIDTH);
     histogram[c]++;
   }
 }
@@ -107,26 +116,29 @@ circuit_build_times_create_histogram(uint16_t * histogram)
 /**
  * Find maximum circuit build time
  */
-uint16_t
-circuit_build_times_max()
+static uint16_t
+circuit_build_times_max(circuit_build_times_t *cbt)
 {
   int i = 0, max_build_time = 0;
-  for( i = 0; i < NCIRCUITS_TO_OBSERVE; i++) {
-    if(circuit_build_times[i] > max_build_time)
-      max_build_time = circuit_build_times[i];
+  for (i = 0; i < NCIRCUITS_TO_OBSERVE; i++) {
+    if (cbt->circuit_build_times[i] > max_build_time)
+      max_build_time = cbt->circuit_build_times[i];
   }
   return max_build_time;
 }
 
-uint16_t
-circuit_build_times_min()
+static uint16_t
+circuit_build_times_min(circuit_build_times_t *cbt)
 {
   int i = 0;
   uint16_t min_build_time = UINT16_MAX;
-  for( i = 0; i < NCIRCUITS_TO_OBSERVE; i++) {
-    if(circuit_build_times[i] && /* 0 <-> uninitialized */
-            circuit_build_times[i] < min_build_time)
-      min_build_time = circuit_build_times[i];
+  for (i = 0; i < NCIRCUITS_TO_OBSERVE; i++) {
+    if (cbt->circuit_build_times[i] && /* 0 <-> uninitialized */
+            cbt->circuit_build_times[i] < min_build_time)
+      min_build_time = cbt->circuit_build_times[i];
+  }
+  if (min_build_time == UINT16_MAX) {
+    log_warn(LD_CIRC, "No build times less than UIN16_MAX!");
   }
   return min_build_time;
 }
@@ -135,85 +147,218 @@ circuit_build_times_min()
  * output a histogram of current circuit build times
  */
 void
-circuit_build_times_update_state(or_state_t * state)
+circuit_build_times_update_state(circuit_build_times_t *cbt,
+                                 or_state_t *state)
 {
   uint16_t max_build_time = 0, *histogram;
   int i = 0, nbins = 0;
   config_line_t **next, *line;
 
-  max_build_time = circuit_build_times_max();
+  max_build_time = circuit_build_times_max(cbt);
   nbins = 1 + (max_build_time / BUILDTIME_BIN_WIDTH);
   histogram = tor_malloc_zero(nbins * sizeof(uint16_t));
 
-  circuit_build_times_create_histogram(histogram);
+  circuit_build_times_create_histogram(cbt, histogram);
   // write to state
   config_free_lines(state->BuildtimeHistogram);
   next = &state->BuildtimeHistogram;
   *next = NULL;
 
-  state->TotalBuildTimes = total_build_times;
+  state->TotalBuildTimes = cbt->total_build_times;
 
   // total build times?
-  for(i = 0; i < nbins; i++) {
-    if(histogram[i] == 0) continue; // compress the histogram by skipping the blanks
+  for (i = 0; i < nbins; i++) {
+    // compress the histogram by skipping the blanks
+    if (histogram[i] == 0) continue;
     *next = line = tor_malloc_zero(sizeof(config_line_t));
     line->key = tor_strdup("CircuitBuildTimeBin");
-    line->value = tor_malloc(20);
-    tor_snprintf(line->value, 20, "%d %d", i*BUILDTIME_BIN_WIDTH,
+    line->value = tor_malloc(25);
+    tor_snprintf(line->value, 25, "%d %d", i*BUILDTIME_BIN_WIDTH,
                  histogram[i]);
     next = &(line->next);
   }
-  if(!get_options()->AvoidDiskWrites)
+  if (!get_options()->AvoidDiskWrites)
     or_state_mark_dirty(get_or_state(), 0);
 
-  if(histogram) tor_free(histogram);
-}
-
-int
-find_next_available(int chosen)
-{// find index of next open slot in circuit_build_times
-  int idx = 0;
-  for(idx = (chosen + 1) % NCIRCUITS_TO_OBSERVE; idx < chosen;
-          idx = ((idx + 1 ) % NCIRCUITS_TO_OBSERVE)) {
-    if(circuit_build_times[idx] == 0) {
-      return idx;
-    }
-  }
-  return 0;
+  if (histogram) tor_free(histogram);
 }
 
 /** Load histogram from state */
 int
-circuit_build_times_parse_state(or_state_t *state, char **msg)
+circuit_build_times_parse_state(circuit_build_times_t *cbt,
+                                or_state_t *state, char **msg)
 {
+  int tot_values = 0, lines = 0;
   config_line_t *line;
   msg = NULL;
-  memset(circuit_build_times, 0, NCIRCUITS_TO_OBSERVE);
-  total_build_times = state->TotalBuildTimes;
+  memset(cbt->circuit_build_times, 0, NCIRCUITS_TO_OBSERVE);
+  cbt->total_build_times = state->TotalBuildTimes;
 
-  for(line = state->BuildtimeHistogram; line; line = line->next) {
+  for (line = state->BuildtimeHistogram; line; line = line->next) {
     smartlist_t * args = smartlist_create();
     smartlist_split_string(args, line->value, " ",
                                   SPLIT_SKIP_SPACE|SPLIT_IGNORE_BLANK, 0);
-    if(smartlist_len(args) < 2) {
+    if (smartlist_len(args) < 2) {
       *msg = tor_strdup("Unable to parse circuit build times: "
                           "Too few arguments to CircuitBuildTIme");
       break;
     } else {
-      uint16_t ms, count, i;
-      /* XXX: use tor_strtol */
-      ms = atol(smartlist_get(args,0));
-      count = atol(smartlist_get(args,1));
-      for(i = 0; i < count; i++) {
-        circuit_build_times_add_time(ms);
+      const char *ms_str = smartlist_get(args,0);
+      const char *count_str = smartlist_get(args,1);
+      uint32_t count, i;
+      uint16_t ms;
+      int ok;
+      ms = tor_parse_ulong(ms_str, 0, 0, UINT16_MAX, &ok, NULL);
+      if (!ok) {
+        *msg = tor_strdup("Unable to parse circuit build times: "
+                          "Unparsable bin number");
+        break;
+      }
+      count = tor_parse_ulong(count_str, 0, 0, UINT32_MAX, &ok, NULL);
+      if (!ok) {
+        *msg = tor_strdup("Unable to parse circuit build times: "
+                          "Unparsable bin count");
+        break;
+      }
+      lines++;
+      for (i = 0; i < count; i++) {
+        circuit_build_times_add_time(cbt, ms);
+        tot_values++;
       }
     }
   }
+  log_info(LD_CIRC, "Loaded %d values from %d lines in circuit time histogram",
+           tot_values, lines);
+  circuit_build_times_set_timeout(cbt);
   return (msg ? -1 : 0);
 }
 
+static void
+circuit_build_times_update_alpha(circuit_build_times_t *cbt)
+{
+  uint16_t *x=cbt->circuit_build_times;
+  double a = 0;
+  int n=0,i=0;
 
+  /* http://en.wikipedia.org/wiki/Pareto_distribution#Parameter_estimation */
+  cbt->Xm = circuit_build_times_min(cbt);
+
+  for (i=0; i< NCIRCUITS_TO_OBSERVE; i++) {
+    if (!x[i]) continue;
+    a += ln(x[i]);
+    n++;
+  }
+  tor_assert(n==cbt->total_build_times);
+  a -= n*ln(cbt->Xm);
+  a = n/a;
+
+  cbt->alpha = a;
+}
 
+/**
+ * This is the Pareto Quantile Function. It calculates the point x
+ * in the distribution such that F(x) < quantile (ie quantile*100%
+ * of the mass of the density function is below x on the curve).
+ *
+ * We use it to calculate the timeout and also synthetic values of
+ * time for circuits that timeout before completion.
+ *
+ * See http://en.wikipedia.org/wiki/Quantile_function,
+ * http://en.wikipedia.org/wiki/Inverse_transform_sampling and
+ * http://en.wikipedia.org/wiki/Pareto_distribution#Parameter_estimation
+ * That's right. I'll cite wikipedia all day long.
+ */
+static double
+circuit_build_times_calculate_timeout(circuit_build_times_t *cbt,
+                                      double quantile)
+{
+  return cbt->Xm/pow(1.0-quantile,1.0/cbt->alpha);
+}
+
+static void
+circuit_build_times_add_timeout_worker(circuit_build_times_t *cbt)
+{
+  /* Generate 0.8-1.0... */
+  uint64_t r = crypto_rand_uint64(UINT64_MAX-1);
+  double u = BUILDTIMEOUT_QUANTILE_CUTOFF +
+             ((1.0-BUILDTIMEOUT_QUANTILE_CUTOFF)*r)/(1.0*UINT64_MAX);
+
+  long gentime = lround(circuit_build_times_calculate_timeout(cbt, u));
+
+  if (gentime < get_options()->CircuitBuildTimeout*1000) {
+    log_warn(LD_CIRC,
+      "Generated a synthetic timeout LESS than the current timeout: %ld vs %d",
+      gentime, get_options()->CircuitBuildTimeout*1000);
+    tor_assert(gentime >= get_options()->CircuitBuildTimeout*1000);
+  } else if (gentime > UINT16_MAX) {
+    gentime = UINT16_MAX;
+    log_info(LD_CIRC,
+      "Generated a synthetic timeout LESS than the current timeout: %ld vs %d",
+      gentime, get_options()->CircuitBuildTimeout*1000);
+  } else {
+    log_info(LD_CIRC, "Generated synthetic time %ld for timeout",
+             gentime);
+  }
+
+  circuit_build_times_add_time(cbt, gentime);
+}
+
+/**
+ * Store a timeout as a synthetic value
+ */
+void
+circuit_build_times_add_timeout(circuit_build_times_t *cbt)
+{
+  if (cbt->total_build_times < MIN_CIRCUITS_TO_OBSERVE) {
+    /* Store a timeout before we have enough data as special */
+    cbt->pre_timeouts++;
+    return;
+  }
+
+  /* Store a timeout as a random position on this curve */
+  if (cbt->pre_timeouts && get_options()->CircuitBuildTimeout != 60) {
+    cbt->Xm = circuit_build_times_min(cbt);
+    // Use current timeout to get an estimate on alpha
+    // Q(u) = Xm/((1-u)^(1/a))
+    // Q(0.8) = Xm/((1-0.8))^(1/a)) = CircBuildTimeout
+    // CircBuildTimeout = Xm/((1-0.8))^(1/a))
+    // CircBuildTimeout = Xm*((1-0.8))^(-1/a))
+    // ln(CircBuildTimeout) = ln(Xm)+ln(((1-0.8)))*(-1/a)
+    // -ln(1-0.8)/(ln(CircBuildTimeout)-ln(Xm))=a
+    cbt->alpha = -ln(1-BUILDTIMEOUT_QUANTILE_CUTOFF)/
+             (ln(get_options()->CircuitBuildTimeout)-ln(cbt->Xm));
+    while (cbt->pre_timeouts-- != 0) {
+      circuit_build_times_add_timeout_worker(cbt);
+    }
+  }
+
+  cbt->pre_timeouts = 0;
+  circuit_build_times_add_timeout_worker(cbt);
+}
+
+void
+circuit_build_times_set_timeout(circuit_build_times_t *cbt)
+{
+  double timeout;
+
+  if (cbt->total_build_times < MIN_CIRCUITS_TO_OBSERVE) {
+    log_info(LD_CIRC,
+            "Not enough circuits yet to calculate a new build timeout."
+            " Need %d more.",
+            MIN_CIRCUITS_TO_OBSERVE-cbt->total_build_times);
+    return;
+  }
+
+  circuit_build_times_update_alpha(cbt);
+  timeout = circuit_build_times_calculate_timeout(cbt,
+                                BUILDTIMEOUT_QUANTILE_CUTOFF);
+
+  get_options()->CircuitBuildTimeout = lround(timeout/1000.0);
+
+  log_info(LD_CIRC,
+           "Set circuit build timeout to %d based on %d circuit times",
+           get_options()->CircuitBuildTimeout, cbt->total_build_times);
+}
 
 /** Iterate over values of circ_id, starting from conn-\>next_circ_id,
  * and with the high bit specified by conn-\>circ_id_type, until we get
@@ -800,9 +945,9 @@ circuit_send_next_onion_skin(origin_circuit_t *circ)
       tor_gettimeofday(&end);
       /* done building the circuit. whew. */
       circuit_set_state(TO_CIRCUIT(circ), CIRCUIT_STATE_OPEN);
-      circuit_build_times_add_time(tor_mdiff(&circ->_base.timestamp_created,
-                  &end));
-      circuit_build_times_recompute();
+      circuit_build_times_add_time(&circ_times,
+                  tv_mdiff(&circ->_base.highres_created, &end));
+      circuit_build_times_set_timeout(&circ_times);
       log_info(LD_CIRC,"circuit built!");
       circuit_reset_failure_count(0);
       if (circ->build_state->onehop_tunnel)
diff --git a/src/or/circuituse.c b/src/or/circuituse.c
index ee2d0bb..e93d28d 100644
--- a/src/or/circuituse.c
+++ b/src/or/circuituse.c
@@ -358,6 +358,8 @@ circuit_expire_building(time_t now)
 
     circuit_log_path(LOG_INFO,LD_CIRC,TO_ORIGIN_CIRCUIT(victim));
     circuit_mark_for_close(victim, END_CIRC_REASON_TIMEOUT);
+    circuit_build_times_add_timeout(&circ_times);
+    circuit_build_times_set_timeout(&circ_times);
   }
 }
 
diff --git a/src/or/config.c b/src/or/config.c
index 0345ca7..39a4ac1 100644
--- a/src/or/config.c
+++ b/src/or/config.c
@@ -413,7 +413,6 @@ static config_var_t _state_vars[] = {
   VAR("CircuitBuildTimeBin",      LINELIST_S,  BuildtimeHistogram,     NULL),
   VAR("BuildtimeHistogram",      LINELIST_V,  BuildtimeHistogram,     NULL),
 
-
   { NULL, CONFIG_TYPE_OBSOLETE, 0, NULL }
 };
 
@@ -2923,7 +2922,7 @@ compute_publishserverdescriptor(or_options_t *options)
 /** Lowest allowable value for CircuitBuildTimeout; values too low will
  * increase network load because of failing connections being retried, and
  * might prevent users from connecting to the network at all. */
-#define MIN_CIRCUIT_BUILD_TIMEOUT 30
+#define MIN_CIRCUIT_BUILD_TIMEOUT 5
 
 /** Lowest allowable value for MaxCircuitDirtiness; if this is too low, Tor
  * will generate too many circuits and potentially overload the network. */
@@ -5070,7 +5069,7 @@ or_state_set(or_state_t *new_state)
     tor_free(err);
   }
 
-  if(circuit_build_times_parse_state(global_state, &err) < 0)   {
+  if (circuit_build_times_parse_state(&circ_times, global_state, &err) < 0)   {
     log_warn(LD_GENERAL,"%s",err);
     tor_free(err);
 
@@ -5208,7 +5207,7 @@ or_state_save(time_t now)
    * to avoid redundant writes. */
   entry_guards_update_state(global_state);
   rep_hist_update_state(global_state);
-  circuit_build_times_update_state(global_state);
+  circuit_build_times_update_state(&circ_times, global_state);
   if (accounting_is_enabled(get_options()))
     accounting_run_housekeeping(now);
 
diff --git a/src/or/or.h b/src/or/or.h
index be54ab4..5e94a56 100644
--- a/src/or/or.h
+++ b/src/or/or.h
@@ -1890,6 +1890,8 @@ typedef struct crypt_path_t {
 #define NCIRCUITS_TO_OBSERVE 10000 /* approx 3 weeks worth of circuits */
 #define BUILDTIME_BIN_WIDTH 50
 
+/* TODO: This should be moved to the consensus */
+#define BUILDTIMEOUT_QUANTILE_CUTOFF 0.8
 
 /** Information used to build a circuit. */
 typedef struct {
@@ -1984,7 +1986,7 @@ typedef struct circuit_t {
   time_t timestamp_created; /**< When was this circuit created? */
   time_t timestamp_dirty; /**< When the circuit was first used, or 0 if the
                            * circuit is clean. */
-  struct timeval highres_created; /**< When exactly was this circuit created? */
+  struct timeval highres_created; /**< When exactly was the circuit created? */
 
   uint16_t marked_for_close; /**< Should we close this circuit at the end of
                               * the main loop? (If true, holds the line number
@@ -2695,7 +2697,6 @@ typedef struct {
   config_line_t * BuildtimeHistogram;
   uint16_t TotalBuildTimes;
 
-
   /** What version of Tor wrote this state file? */
   char *TorVersion;
 
@@ -2865,10 +2866,25 @@ void bridges_retry_all(void);
 
 void entry_guards_free_all(void);
 
-void circuit_build_times_update_state(or_state_t *state);
-int  circuit_build_times_parse_state(or_state_t *state, char **msg);
-
-
+/* Circuit Build Timeout "public" functions (I love C... No wait.) */
+typedef struct {
+  // XXX: Make this a smartlist..
+  uint16_t circuit_build_times[NCIRCUITS_TO_OBSERVE];
+  int build_times_idx;
+  int total_build_times;
+  int pre_timeouts;
+  uint16_t Xm;
+  double alpha;
+} circuit_build_times_t;
+
+extern circuit_build_times_t circ_times;
+void circuit_build_times_update_state(circuit_build_times_t *cbt,
+                                      or_state_t *state);
+int  circuit_build_times_parse_state(circuit_build_times_t *cbt,
+                                     or_state_t *state, char **msg);
+void circuit_build_times_add_timeout(circuit_build_times_t *cbt);
+void circuit_build_times_set_timeout(circuit_build_times_t *cbt);
+int circuit_build_times_add_time(circuit_build_times_t *cbt, long time);
 
 /********************************* circuitlist.c ***********************/
 
-- 
1.5.6.5




More information about the tor-commits mailing list