[tor-commits] [tor/main] relay: Overload state on DNS timeout is now X% over Y secs

dgoulet at torproject.org dgoulet at torproject.org
Wed Oct 20 15:28:23 UTC 2021


commit 7a8108ea87320da3008e65747baa43c1bbcf13c2
Author: David Goulet <dgoulet at torproject.org>
Date:   Wed Oct 20 09:59:04 2021 -0400

    relay: Overload state on DNS timeout is now X% over Y secs
    
    With this commit, we will only report a general overload state if we've
    seen more than X% of DNS timeout errors over Y seconds. Previous
    behavior was to report when a single timeout occured which is really too
    small of a threshold.
    
    The value X is a consensus parameters called
    "overload_dns_timeout_scale_percent" which is a scaled percentage
    (factor of 1000) so we can represent decimal points for X like 0.5% for
    instance. Its default is 1000 which ends up being 1%.
    
    The value Y is a consensus parameters called
    "overload_dns_timeout_period_secs" which is the time period for which
    will gather DNS errors and once over, we assess if that X% has been
    reached ultimately triggering a general overload signal.
    
    Closes #40491
    
    Signed-off-by: David Goulet <dgoulet at torproject.org>
---
 changes/ticket40491                  |   7 ++
 src/feature/nodelist/networkstatus.c |   2 +
 src/feature/stats/rephist.c          | 130 ++++++++++++++++++++++++++++++++++-
 src/feature/stats/rephist.h          |   2 +
 src/test/test_stats.c                |  78 +++++++++++++++++++++
 5 files changed, 218 insertions(+), 1 deletion(-)

diff --git a/changes/ticket40491 b/changes/ticket40491
new file mode 100644
index 0000000000..01c6c7d748
--- /dev/null
+++ b/changes/ticket40491
@@ -0,0 +1,7 @@
+  o Major bugfixes (relay, overload state):
+    - Report the general overload state for DNS timeout errors only if X% of all
+      DNS queries over Y seconds are errors. Before that, it only took 1 timeout
+      to report the overload state which was just too low of a threshold. The X
+      and Y values are 1% and 10 minutes respectively but they are also
+      controlled by consensus parameters. Fixes bug 40491; bugfix on
+      0.4.6.1-alpha.
diff --git a/src/feature/nodelist/networkstatus.c b/src/feature/nodelist/networkstatus.c
index 6867d8c98e..d57db4c415 100644
--- a/src/feature/nodelist/networkstatus.c
+++ b/src/feature/nodelist/networkstatus.c
@@ -105,6 +105,7 @@
 #include "feature/dirauth/vote_microdesc_hash_st.h"
 #include "feature/nodelist/vote_routerstatus_st.h"
 #include "feature/nodelist/routerstatus_st.h"
+#include "feature/stats/rephist.h"
 
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
@@ -1665,6 +1666,7 @@ notify_before_networkstatus_changes(const networkstatus_t *old_c,
   dos_consensus_has_changed(new_c);
   relay_consensus_has_changed(new_c);
   hs_dos_consensus_has_changed(new_c);
+  rep_hist_consensus_has_changed(new_c);
 }
 
 /* Called after a new consensus has been put in the global state. It is safe
diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c
index cb3ccdc91e..aef12e5628 100644
--- a/src/feature/stats/rephist.c
+++ b/src/feature/stats/rephist.c
@@ -219,6 +219,59 @@ static uint64_t stats_n_tcp_exhaustion = 0;
 
 /***** DNS statistics *****/
 
+/* We use a scale here so we can represent percentages with decimal points by
+ * scaling the value by this factor and so 0.5% becomes a value of 500.
+ * Default is 1% and thus min and max range is 0 to 100%. */
+#define OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE 1000.0
+#define OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT 1000
+#define OVERLOAD_DNS_TIMEOUT_PERCENT_MIN 0
+#define OVERLOAD_DNS_TIMEOUT_PERCENT_MAX 100000
+
+/** Consensus parameter: indicate what fraction of DNS timeout errors over the
+ * total number of DNS requests must be reached before we trigger a general
+ * overload signal .*/
+static double overload_dns_timeout_fraction =
+   OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT /
+   OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0;
+
+/* Number of seconds for the assessment period. Default is 10 minutes (600) and
+ * the min max range is within a 32bit value. */
+#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT (10 * 60)
+#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN 0
+#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX INT32_MAX
+
+/** Consensus parameter: Period, in seconds, over which we count the number of
+ * DNS requests and timeout errors. After that period, we assess if we trigger
+ * an overload or not. */
+static int32_t overload_dns_timeout_period_secs =
+  OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT;
+
+/** Overload DNS statistics. The information in this object is used to assess
+ * if, due to DNS errors, we should emit a general overload signal or not.
+ *
+ * NOTE: This structure is _not_ per DNS query type like the statistics below
+ * because of a libevent bug
+ * (https://github.com/libevent/libevent/issues/1219), on error, the type is
+ * not propagated up back to the user and so we need to keep our own stats for
+ * the overload signal. */
+typedef struct {
+  /** Total number of DNS request seen at an Exit. They might not all end
+   * successfully or might even be lost by tor. This counter is incremented
+   * right before the DNS request is initiated. */
+  uint64_t stats_n_request;
+
+  /** Total number of DNS timeout errors. */
+  uint64_t stats_n_error_timeout;
+
+  /** When is the next assessment time of the general overload for DNS errors.
+   * Once this time is reached, all stats are reset and this time is set to the
+   * next assessment time. */
+  time_t next_assessment_time;
+} overload_dns_stats_t;
+
+/** Keep track of the DNS requests for the general overload state. */
+static overload_dns_stats_t overload_dns_stats;
+
 /** Represents the statistics of DNS queries seen if it is an Exit. */
 typedef struct {
   /* Total number of DNS errors found in RFC 1035 (from 0 to 5 code). */
@@ -266,6 +319,60 @@ get_dns_stats_by_type(const int type)
   }
 }
 
+/** Assess the DNS timeout errors and if we have enough to trigger a general
+ * overload. */
+static void
+overload_general_dns_assessment(void)
+{
+  /* Initialize the time. Should be done once. */
+  if (overload_dns_stats.next_assessment_time == 0) {
+    goto reset;
+  }
+
+  /* Not the time yet. */
+  if (overload_dns_stats.next_assessment_time > approx_time()) {
+    return;
+  }
+
+  /* Lets see if we can signal a general overload. */
+  double fraction = (double) overload_dns_stats.stats_n_error_timeout /
+                    (double) overload_dns_stats.stats_n_request;
+  if (fraction >= overload_dns_timeout_fraction) {
+    log_notice(LD_HIST, "General overload -> DNS timeouts (%" PRIu64 ") "
+               "fraction %.4f%% is above threshold of %.4f%%",
+               overload_dns_stats.stats_n_error_timeout,
+               fraction * 100.0,
+               overload_dns_timeout_fraction * 100.0);
+    rep_hist_note_overload(OVERLOAD_GENERAL);
+  }
+
+ reset:
+  /* Reset counters for the next period. */
+  overload_dns_stats.stats_n_error_timeout = 0;
+  overload_dns_stats.stats_n_request = 0;
+  overload_dns_stats.next_assessment_time =
+    approx_time() + overload_dns_timeout_period_secs;
+}
+
+/** Called just before the consensus will be replaced. Update the consensus
+ * parameters in case they changed. */
+void
+rep_hist_consensus_has_changed(const networkstatus_t *ns)
+{
+  overload_dns_timeout_fraction =
+    networkstatus_get_param(ns, "overload_dns_timeout_scale_percent",
+                            OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT,
+                            OVERLOAD_DNS_TIMEOUT_PERCENT_MIN,
+                            OVERLOAD_DNS_TIMEOUT_PERCENT_MAX) /
+    OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0;
+
+  overload_dns_timeout_period_secs =
+    networkstatus_get_param(ns, "overload_dns_timeout_period_secs",
+                            OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT,
+                            OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN,
+                            OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX);
+}
+
 /** Return the DNS error count for the given libevent DNS type and error code.
  * The possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */
 uint64_t
@@ -320,10 +427,31 @@ rep_hist_get_n_dns_request(int type)
 }
 
 /** Note a DNS error for the given given libevent DNS record type and error
- * code. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */
+ * code. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA.
+ *
+ * NOTE: Libevent is _not_ returning the type in case of an error and so if
+ * error is anything but DNS_ERR_NONE, the type is not usable and set to 0.
+ *
+ * See: https://gitlab.torproject.org/tpo/core/tor/-/issues/40490 */
 void
 rep_hist_note_dns_error(int type, uint8_t error)
 {
+  /* Assess if we need to trigger a general overload with regards to the DNS
+   * errors or not. */
+  overload_general_dns_assessment();
+
+  /* Because of the libevent problem (see note in the function comment), we
+   * disregard the DNS query type and keep track of DNS timeout for the
+   * overload state. */
+  if (error == DNS_ERR_TIMEOUT) {
+    overload_dns_stats.stats_n_error_timeout++;
+  }
+  overload_dns_stats.stats_n_request++;
+
+  /* Again, the libevent bug (see function comment), for an error that is
+   * anything but DNS_ERR_NONE, the type is always 0 which means that we don't
+   * have a DNS stat object for it so this code will do nothing until libevent
+   * is fixed. */
   dns_stats_t *dns_stats = get_dns_stats_by_type(type);
   /* Unsupported DNS query type. */
   if (!dns_stats) {
diff --git a/src/feature/stats/rephist.h b/src/feature/stats/rephist.h
index 7f414de4c8..6918ed18c2 100644
--- a/src/feature/stats/rephist.h
+++ b/src/feature/stats/rephist.h
@@ -89,6 +89,8 @@ uint64_t rep_hist_get_n_dns_request(int type);
 void rep_hist_note_dns_request(int type);
 void rep_hist_note_dns_error(int type, uint8_t error);
 
+void rep_hist_consensus_has_changed(const networkstatus_t *ns);
+
 extern uint64_t rephist_total_alloc;
 extern uint32_t rephist_total_num;
 #ifdef TOR_UNIT_TESTS
diff --git a/src/test/test_stats.c b/src/test/test_stats.c
index 081ae22cd5..e85ad40699 100644
--- a/src/test/test_stats.c
+++ b/src/test/test_stats.c
@@ -51,6 +51,8 @@
 #include "feature/stats/bw_array_st.h"
 #include "feature/relay/router.h"
 
+#include <event2/dns.h>
+
 /** Run unit tests for some stats code. */
 static void
 test_stats(void *arg)
@@ -865,6 +867,81 @@ test_overload_stats(void *arg)
   tor_free(stats_str);
 }
 
+/** Test the overload stats logic. */
+static void
+test_overload_dns_timeout(void *arg)
+{
+  char *stats_str = NULL;
+  (void) arg;
+
+  /* Lets simulate a series of timeouts but below our default 1% threshold. */
+
+  for (int i = 0; i < 1000; i++) {
+    /* This should trigger 9 timeouts which is just below 1% (10) */
+    if (i > 0 && !(i % 100)) {
+      rep_hist_note_dns_error(0, DNS_ERR_TIMEOUT);
+    } else {
+      rep_hist_note_dns_error(0, DNS_ERR_NONE);
+    }
+  }
+
+  /* No overload yet. */
+  stats_str = rep_hist_get_overload_general_line();
+  tt_assert(!stats_str);
+
+  /* Move it 10 minutes in the future and see if we get a general overload. */
+  update_approx_time(approx_time() + (10 * 60));
+
+  /* This query should NOT trigger the general overload because we are below
+   * our default of 1%. */
+  rep_hist_note_dns_error(0, DNS_ERR_NONE);
+  stats_str = rep_hist_get_overload_general_line();
+  tt_assert(!stats_str);
+
+  /* We'll now go above our 1% threshold. */
+  for (int i = 0; i < 1000; i++) {
+    /* This should trigger 10 timeouts which is our threshold of 1% (10) */
+    if (!(i % 10)) {
+      rep_hist_note_dns_error(0, DNS_ERR_TIMEOUT);
+    } else {
+      rep_hist_note_dns_error(0, DNS_ERR_NONE);
+    }
+  }
+
+  /* Move it 10 minutes in the future and see if we get a general overload. */
+  update_approx_time(approx_time() + (10 * 60));
+
+  /* This query should trigger the general overload because we are above 1%. */
+  rep_hist_note_dns_error(0, DNS_ERR_NONE);
+  stats_str = rep_hist_get_overload_general_line();
+  tt_assert(stats_str);
+  tor_free(stats_str);
+
+  /* Move 72h in the future, we should NOT get an overload anymore. */
+  update_approx_time(approx_time() + (72 * 3600));
+
+  stats_str = rep_hist_get_overload_general_line();
+  tt_assert(!stats_str);
+
+  /* This query should NOT trigger the general overload. */
+  rep_hist_note_dns_error(0, DNS_ERR_TIMEOUT);
+  stats_str = rep_hist_get_overload_general_line();
+  tt_assert(!stats_str);
+
+  /* Move it 10 minutes in the future and see if we get a general overload. We
+   * have now 100% of requests timing out. */
+  update_approx_time(approx_time() + (10 * 60));
+
+  /* This query should trigger the general overload with 50% of timeouts. */
+  rep_hist_note_dns_error(0, DNS_ERR_NONE);
+  stats_str = rep_hist_get_overload_general_line();
+  tt_assert(stats_str);
+  tor_free(stats_str);
+
+ done:
+  tor_free(stats_str);
+}
+
 #define ENT(name)                                                       \
   { #name, test_ ## name , 0, NULL, NULL }
 #define FORK(name)                                                      \
@@ -881,6 +958,7 @@ struct testcase_t stats_tests[] = {
   FORK(rephist_v3_onions),
   FORK(load_stats_file),
   FORK(overload_stats),
+  FORK(overload_dns_timeout),
 
   END_OF_TESTCASES
 };





More information about the tor-commits mailing list