[tor-commits] [tor/release-0.4.6] relay: Don't make DNS timeout trigger an overload

ahf at torproject.org ahf at torproject.org
Wed Dec 15 12:47:55 UTC 2021


commit cda7acb35d40c505dc4d2c3b55d611faab189477
Author: David Goulet <dgoulet at torproject.org>
Date:   Mon Dec 13 10:22:29 2021 -0500

    relay: Don't make DNS timeout trigger an overload
    
    Tor has configure libevent to attempt up to 3 times a DNS query for a
    maximum of 5 seconds each. Once that 5 seconds has elapsed, it consider
    the query "Timed Out" but tor only gets a timeout if all 3 attempts have
    failed.
    
    For example, using Unbound, it has a much higher threshold of timeout.
    It is well defined in
    https://www.nlnetlabs.nl/documentation/unbound/info-timeout/ and has
    some complexity to it. But the gist is that if it times out, it will be
    much more than 5 seconds.
    
    And so the Tor DNS timeouts are more of a "UX issue" rather than a
    "network issue". For this reason, we are removing this metric from the
    overload general signal.
    
    See https://gitlab.torproject.org/tpo/network-health/team/-/issues/139
    for more information.
    
    Fixes #40527
    
    Signed-off-by: David Goulet <dgoulet at torproject.org>
---
 changes/ticket40527         |  5 +++
 src/feature/stats/rephist.c | 12 -------
 src/test/test_stats.c       | 82 ++-------------------------------------------
 3 files changed, 8 insertions(+), 91 deletions(-)

diff --git a/changes/ticket40527 b/changes/ticket40527
new file mode 100644
index 0000000000..631b3d4bb9
--- /dev/null
+++ b/changes/ticket40527
@@ -0,0 +1,5 @@
+  o Major bugfixes (relay, overload):
+    - Don't make Tor DNS timeout trigger an overload general state. These
+      timeouts are different from DNS server timeout. They have to be seen as
+      timeout related to UX and not because of a network problem. Fixes bug
+      40527; bugfix on 0.4.6.1-alpha.
diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c
index c3a281a8c2..2bfa14d326 100644
--- a/src/feature/stats/rephist.c
+++ b/src/feature/stats/rephist.c
@@ -283,18 +283,6 @@ overload_general_dns_assessment(void)
     return;
   }
 
-  /* Lets see if we can signal a general overload. */
-  double fraction = (double) overload_dns_stats.stats_n_error_timeout /
-                    (double) overload_dns_stats.stats_n_request;
-  if (fraction >= overload_dns_timeout_fraction) {
-    log_notice(LD_HIST, "General overload -> DNS timeouts (%" PRIu64 ") "
-               "fraction %.4f%% is above threshold of %.4f%%",
-               overload_dns_stats.stats_n_error_timeout,
-               fraction * 100.0,
-               overload_dns_timeout_fraction * 100.0);
-    rep_hist_note_overload(OVERLOAD_GENERAL);
-  }
-
  reset:
   /* Reset counters for the next period. */
   overload_dns_stats.stats_n_error_timeout = 0;
diff --git a/src/test/test_stats.c b/src/test/test_stats.c
index 3b9a192c75..af035ae54c 100644
--- a/src/test/test_stats.c
+++ b/src/test/test_stats.c
@@ -721,7 +721,7 @@ test_overload_stats(void *arg)
   stats_str = rep_hist_get_overload_stats_lines();
   tt_assert(!stats_str);
 
-  /* Note a DNS overload */
+  /* Note a overload */
   rep_hist_note_overload(OVERLOAD_GENERAL);
 
   /* Move the time forward one hour */
@@ -742,7 +742,7 @@ test_overload_stats(void *arg)
 
   /* Now the time should be 2002-01-07 00:00:00 */
 
-  /* Note a DNS overload */
+  /* Note a overload */
   rep_hist_note_overload(OVERLOAD_GENERAL);
 
   stats_str = rep_hist_get_overload_general_line();
@@ -760,7 +760,7 @@ test_overload_stats(void *arg)
   tt_str_op("overload-fd-exhausted 1 2002-01-07 00:00:00\n", OP_EQ, stats_str);
   tor_free(stats_str);
 
-  /* Move the time forward. Register DNS overload. See that the time changed */
+  /* Move the time forward. Register overload. See that the time changed */
   current_time += 3600*2;
   update_approx_time(current_time);
 
@@ -867,81 +867,6 @@ test_overload_stats(void *arg)
   tor_free(stats_str);
 }
 
-/** Test the overload stats logic. */
-static void
-test_overload_dns_timeout(void *arg)
-{
-  char *stats_str = NULL;
-  (void) arg;
-
-  /* Lets simulate a series of timeouts but below our default 1% threshold. */
-
-  for (int i = 0; i < 1000; i++) {
-    /* This should trigger 9 timeouts which is just below 1% (10) */
-    if (i > 0 && !(i % 100)) {
-      rep_hist_note_dns_query(0, DNS_ERR_TIMEOUT);
-    } else {
-      rep_hist_note_dns_query(0, DNS_ERR_NONE);
-    }
-  }
-
-  /* No overload yet. */
-  stats_str = rep_hist_get_overload_general_line();
-  tt_assert(!stats_str);
-
-  /* Move it 10 minutes in the future and see if we get a general overload. */
-  update_approx_time(approx_time() + (10 * 60));
-
-  /* This query should NOT trigger the general overload because we are below
-   * our default of 1%. */
-  rep_hist_note_dns_query(0, DNS_ERR_NONE);
-  stats_str = rep_hist_get_overload_general_line();
-  tt_assert(!stats_str);
-
-  /* We'll now go above our 1% threshold. */
-  for (int i = 0; i < 1000; i++) {
-    /* This should trigger 10 timeouts which is our threshold of 1% (10) */
-    if (!(i % 10)) {
-      rep_hist_note_dns_query(0, DNS_ERR_TIMEOUT);
-    } else {
-      rep_hist_note_dns_query(0, DNS_ERR_NONE);
-    }
-  }
-
-  /* Move it 10 minutes in the future and see if we get a general overload. */
-  update_approx_time(approx_time() + (10 * 60));
-
-  /* This query should trigger the general overload because we are above 1%. */
-  rep_hist_note_dns_query(0, DNS_ERR_NONE);
-  stats_str = rep_hist_get_overload_general_line();
-  tt_assert(stats_str);
-  tor_free(stats_str);
-
-  /* Move 72h in the future, we should NOT get an overload anymore. */
-  update_approx_time(approx_time() + (72 * 3600));
-
-  stats_str = rep_hist_get_overload_general_line();
-  tt_assert(!stats_str);
-
-  /* This query should NOT trigger the general overload. */
-  rep_hist_note_dns_query(0, DNS_ERR_TIMEOUT);
-  stats_str = rep_hist_get_overload_general_line();
-  tt_assert(!stats_str);
-
-  /* Move it 10 minutes in the future and see if we get a general overload. We
-   * have now 100% of requests timing out. */
-  update_approx_time(approx_time() + (10 * 60));
-
-  /* This query should trigger the general overload with 50% of timeouts. */
-  rep_hist_note_dns_query(0, DNS_ERR_NONE);
-  stats_str = rep_hist_get_overload_general_line();
-  tt_assert(stats_str);
-  tor_free(stats_str);
-
- done:
-  tor_free(stats_str);
-}
-
 #define ENT(name)                                                       \
   { #name, test_ ## name , 0, NULL, NULL }
 #define FORK(name)                                                      \
@@ -958,7 +883,6 @@ struct testcase_t stats_tests[] = {
   FORK(rephist_v3_onions),
   FORK(load_stats_file),
   FORK(overload_stats),
-  FORK(overload_dns_timeout),
 
   END_OF_TESTCASES
 };





More information about the tor-commits mailing list