[tor-commits] [tor/master] Fix Xm mode calculation to properly average N=10 modes.

asn at torproject.org asn at torproject.org
Fri Feb 19 11:21:15 UTC 2021


commit ed9d60cb92efbf5bc2a3c22ade6ffee4e1c8116a
Author: Mike Perry <mikeperry-git at torproject.org>
Date:   Fri Nov 6 16:30:16 2020 -0600

    Fix Xm mode calculation to properly average N=10 modes.
    
    This is still fast enough. ~100usec on my laptop with 1000 build times.
---
 src/core/or/circuitstats.c | 61 +++++++++++++++++++---------------------------
 src/core/or/circuitstats.h |  2 +-
 2 files changed, 26 insertions(+), 37 deletions(-)

diff --git a/src/core/or/circuitstats.c b/src/core/or/circuitstats.c
index d92f35fdc2..57d45d2240 100644
--- a/src/core/or/circuitstats.c
+++ b/src/core/or/circuitstats.c
@@ -853,58 +853,49 @@ circuit_build_times_create_histogram(const circuit_build_times_t *cbt,
  * Return the Pareto start-of-curve parameter Xm.
  *
  * Because we are not a true Pareto curve, we compute this as the
- * weighted average of the N most frequent build time bins. N is either
- * 1 if we don't have enough circuit build time data collected, or
- * determined by the consensus parameter cbtnummodes (default 3).
+ * weighted average of the 10 most frequent build time bins. This
+ * heuristic allowed for the actual timeout rate to be closest
+ * to the chosen quantile cutoff, for quantiles 60-80%, out of
+ * many variant approaches (see #40157 for analysis).
  */
 static build_time_t
 circuit_build_times_get_xm(circuit_build_times_t *cbt)
 {
-  build_time_t i, nbins;
+  build_time_t nbins = 0;
   build_time_t *nth_max_bin;
-  int32_t bin_counts=0;
-  build_time_t ret = 0;
-  uint32_t *histogram = circuit_build_times_create_histogram(cbt, &nbins);
-  int n=0;
+  build_time_t xm_total = 0;
+  build_time_t Xm = 0;
+  int32_t xm_counts=0;
   int num_modes = circuit_build_times_default_num_xm_modes();
+  uint32_t *histogram = circuit_build_times_create_histogram(cbt, &nbins);
 
   tor_assert(nbins > 0);
   tor_assert(num_modes > 0);
 
-  // Only use one mode if < 1000 buildtimes. Not enough data
-  // for multiple.
-  if (cbt->total_build_times < CBT_NCIRCUITS_TO_OBSERVE)
-    num_modes = 1;
-
   nth_max_bin = tor_calloc(num_modes, sizeof(build_time_t));
 
-  /* Determine the N most common build times */
-  for (i = 0; i < nbins; i++) {
-    if (histogram[i] >= histogram[nth_max_bin[0]]) {
-      nth_max_bin[0] = i;
-    }
-
-    for (n = 1; n < num_modes; n++) {
-      if (histogram[i] >= histogram[nth_max_bin[n]] &&
-           (!histogram[nth_max_bin[n-1]]
-               || histogram[i] < histogram[nth_max_bin[n-1]])) {
+  /* Determine the N most common build times, by selecting the
+   * nth largest mode, counting it, and removing it from the histogram. */
+  for (int n = 0; n < num_modes; n++) {
+    /* Get nth mode */
+    for (build_time_t i = 0; i < nbins; i++) {
+      if (histogram[i] > histogram[nth_max_bin[n]]) {
         nth_max_bin[n] = i;
       }
     }
-  }
 
-  for (n = 0; n < num_modes; n++) {
-    bin_counts += histogram[nth_max_bin[n]];
-    ret += CBT_BIN_TO_MS(nth_max_bin[n])*histogram[nth_max_bin[n]];
-    log_info(LD_CIRC, "Xm mode #%d: %u %u", n, CBT_BIN_TO_MS(nth_max_bin[n]),
-             histogram[nth_max_bin[n]]);
+    /* Update average */
+    xm_counts += histogram[nth_max_bin[n]];
+    xm_total += CBT_BIN_TO_MS(nth_max_bin[n])*histogram[nth_max_bin[n]];
+
+    /* Prevent from re-counting this value */
+    histogram[nth_max_bin[n]] = 0;
   }
 
-  /* bin_counts can become zero if all of our last CBT_NCIRCUITS_TO_OBSERVE
+  /* xm_counts can become zero if all of our last CBT_NCIRCUITS_TO_OBSERVE
    * circuits were abandoned before they completed. This shouldn't happen,
    * though. We should have reset/re-learned a lower timeout first. */
-  if (bin_counts == 0) {
-    ret = 0;
+  if (xm_counts == 0) {
     log_warn(LD_CIRC,
                "No valid circuit build time data out of %d times, %u modes, "
                "have_timeout=%d, %lfms", cbt->total_build_times, num_modes,
@@ -912,15 +903,13 @@ circuit_build_times_get_xm(circuit_build_times_t *cbt)
     goto done;
   }
 
-  tor_assert(bin_counts > 0);
-
-  ret /= bin_counts;
+  Xm = xm_total / xm_counts;
 
  done:
   tor_free(histogram);
   tor_free(nth_max_bin);
 
-  return ret;
+  return Xm;
 }
 
 /**
diff --git a/src/core/or/circuitstats.h b/src/core/or/circuitstats.h
index 15fe23641e..864b3b5f43 100644
--- a/src/core/or/circuitstats.h
+++ b/src/core/or/circuitstats.h
@@ -58,7 +58,7 @@ void circuit_build_times_mark_circ_as_measurement_only(origin_circuit_t *circ);
 #define CBT_BIN_WIDTH ((build_time_t)10)
 
 /** Number of modes to use in the weighted-avg computation of Xm */
-#define CBT_DEFAULT_NUM_XM_MODES 3
+#define CBT_DEFAULT_NUM_XM_MODES 10
 #define CBT_MIN_NUM_XM_MODES 1
 #define CBT_MAX_NUM_XM_MODES 20
 





More information about the tor-commits mailing list