commit 7a8108ea87320da3008e65747baa43c1bbcf13c2 Author: David Goulet dgoulet@torproject.org Date: Wed Oct 20 09:59:04 2021 -0400
relay: Overload state on DNS timeout is now X% over Y secs
With this commit, we will only report a general overload state if we've seen more than X% of DNS timeout errors over Y seconds. Previous behavior was to report when a single timeout occured which is really too small of a threshold.
The value X is a consensus parameters called "overload_dns_timeout_scale_percent" which is a scaled percentage (factor of 1000) so we can represent decimal points for X like 0.5% for instance. Its default is 1000 which ends up being 1%.
The value Y is a consensus parameters called "overload_dns_timeout_period_secs" which is the time period for which will gather DNS errors and once over, we assess if that X% has been reached ultimately triggering a general overload signal.
Closes #40491
Signed-off-by: David Goulet dgoulet@torproject.org --- changes/ticket40491 | 7 ++ src/feature/nodelist/networkstatus.c | 2 + src/feature/stats/rephist.c | 130 ++++++++++++++++++++++++++++++++++- src/feature/stats/rephist.h | 2 + src/test/test_stats.c | 78 +++++++++++++++++++++ 5 files changed, 218 insertions(+), 1 deletion(-)
diff --git a/changes/ticket40491 b/changes/ticket40491 new file mode 100644 index 0000000000..01c6c7d748 --- /dev/null +++ b/changes/ticket40491 @@ -0,0 +1,7 @@ + o Major bugfixes (relay, overload state): + - Report the general overload state for DNS timeout errors only if X% of all + DNS queries over Y seconds are errors. Before that, it only took 1 timeout + to report the overload state which was just too low of a threshold. The X + and Y values are 1% and 10 minutes respectively but they are also + controlled by consensus parameters. Fixes bug 40491; bugfix on + 0.4.6.1-alpha. diff --git a/src/feature/nodelist/networkstatus.c b/src/feature/nodelist/networkstatus.c index 6867d8c98e..d57db4c415 100644 --- a/src/feature/nodelist/networkstatus.c +++ b/src/feature/nodelist/networkstatus.c @@ -105,6 +105,7 @@ #include "feature/dirauth/vote_microdesc_hash_st.h" #include "feature/nodelist/vote_routerstatus_st.h" #include "feature/nodelist/routerstatus_st.h" +#include "feature/stats/rephist.h"
#ifdef HAVE_UNISTD_H #include <unistd.h> @@ -1665,6 +1666,7 @@ notify_before_networkstatus_changes(const networkstatus_t *old_c, dos_consensus_has_changed(new_c); relay_consensus_has_changed(new_c); hs_dos_consensus_has_changed(new_c); + rep_hist_consensus_has_changed(new_c); }
/* Called after a new consensus has been put in the global state. It is safe diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c index cb3ccdc91e..aef12e5628 100644 --- a/src/feature/stats/rephist.c +++ b/src/feature/stats/rephist.c @@ -219,6 +219,59 @@ static uint64_t stats_n_tcp_exhaustion = 0;
/***** DNS statistics *****/
+/* We use a scale here so we can represent percentages with decimal points by + * scaling the value by this factor and so 0.5% becomes a value of 500. + * Default is 1% and thus min and max range is 0 to 100%. */ +#define OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE 1000.0 +#define OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT 1000 +#define OVERLOAD_DNS_TIMEOUT_PERCENT_MIN 0 +#define OVERLOAD_DNS_TIMEOUT_PERCENT_MAX 100000 + +/** Consensus parameter: indicate what fraction of DNS timeout errors over the + * total number of DNS requests must be reached before we trigger a general + * overload signal .*/ +static double overload_dns_timeout_fraction = + OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT / + OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0; + +/* Number of seconds for the assessment period. Default is 10 minutes (600) and + * the min max range is within a 32bit value. */ +#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT (10 * 60) +#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN 0 +#define OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX INT32_MAX + +/** Consensus parameter: Period, in seconds, over which we count the number of + * DNS requests and timeout errors. After that period, we assess if we trigger + * an overload or not. */ +static int32_t overload_dns_timeout_period_secs = + OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT; + +/** Overload DNS statistics. The information in this object is used to assess + * if, due to DNS errors, we should emit a general overload signal or not. + * + * NOTE: This structure is _not_ per DNS query type like the statistics below + * because of a libevent bug + * (https://github.com/libevent/libevent/issues/1219), on error, the type is + * not propagated up back to the user and so we need to keep our own stats for + * the overload signal. */ +typedef struct { + /** Total number of DNS request seen at an Exit. They might not all end + * successfully or might even be lost by tor. This counter is incremented + * right before the DNS request is initiated. */ + uint64_t stats_n_request; + + /** Total number of DNS timeout errors. */ + uint64_t stats_n_error_timeout; + + /** When is the next assessment time of the general overload for DNS errors. + * Once this time is reached, all stats are reset and this time is set to the + * next assessment time. */ + time_t next_assessment_time; +} overload_dns_stats_t; + +/** Keep track of the DNS requests for the general overload state. */ +static overload_dns_stats_t overload_dns_stats; + /** Represents the statistics of DNS queries seen if it is an Exit. */ typedef struct { /* Total number of DNS errors found in RFC 1035 (from 0 to 5 code). */ @@ -266,6 +319,60 @@ get_dns_stats_by_type(const int type) } }
+/** Assess the DNS timeout errors and if we have enough to trigger a general + * overload. */ +static void +overload_general_dns_assessment(void) +{ + /* Initialize the time. Should be done once. */ + if (overload_dns_stats.next_assessment_time == 0) { + goto reset; + } + + /* Not the time yet. */ + if (overload_dns_stats.next_assessment_time > approx_time()) { + return; + } + + /* Lets see if we can signal a general overload. */ + double fraction = (double) overload_dns_stats.stats_n_error_timeout / + (double) overload_dns_stats.stats_n_request; + if (fraction >= overload_dns_timeout_fraction) { + log_notice(LD_HIST, "General overload -> DNS timeouts (%" PRIu64 ") " + "fraction %.4f%% is above threshold of %.4f%%", + overload_dns_stats.stats_n_error_timeout, + fraction * 100.0, + overload_dns_timeout_fraction * 100.0); + rep_hist_note_overload(OVERLOAD_GENERAL); + } + + reset: + /* Reset counters for the next period. */ + overload_dns_stats.stats_n_error_timeout = 0; + overload_dns_stats.stats_n_request = 0; + overload_dns_stats.next_assessment_time = + approx_time() + overload_dns_timeout_period_secs; +} + +/** Called just before the consensus will be replaced. Update the consensus + * parameters in case they changed. */ +void +rep_hist_consensus_has_changed(const networkstatus_t *ns) +{ + overload_dns_timeout_fraction = + networkstatus_get_param(ns, "overload_dns_timeout_scale_percent", + OVERLOAD_DNS_TIMEOUT_PERCENT_DEFAULT, + OVERLOAD_DNS_TIMEOUT_PERCENT_MIN, + OVERLOAD_DNS_TIMEOUT_PERCENT_MAX) / + OVERLOAD_DNS_TIMEOUT_PERCENT_SCALE / 100.0; + + overload_dns_timeout_period_secs = + networkstatus_get_param(ns, "overload_dns_timeout_period_secs", + OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_DEFAULT, + OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MIN, + OVERLOAD_DNS_TIMEOUT_PERIOD_SECS_MAX); +} + /** Return the DNS error count for the given libevent DNS type and error code. * The possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */ uint64_t @@ -320,10 +427,31 @@ rep_hist_get_n_dns_request(int type) }
/** Note a DNS error for the given given libevent DNS record type and error - * code. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. */ + * code. Possible types are: DNS_IPv4_A, DNS_PTR, DNS_IPv6_AAAA. + * + * NOTE: Libevent is _not_ returning the type in case of an error and so if + * error is anything but DNS_ERR_NONE, the type is not usable and set to 0. + * + * See: https://gitlab.torproject.org/tpo/core/tor/-/issues/40490 */ void rep_hist_note_dns_error(int type, uint8_t error) { + /* Assess if we need to trigger a general overload with regards to the DNS + * errors or not. */ + overload_general_dns_assessment(); + + /* Because of the libevent problem (see note in the function comment), we + * disregard the DNS query type and keep track of DNS timeout for the + * overload state. */ + if (error == DNS_ERR_TIMEOUT) { + overload_dns_stats.stats_n_error_timeout++; + } + overload_dns_stats.stats_n_request++; + + /* Again, the libevent bug (see function comment), for an error that is + * anything but DNS_ERR_NONE, the type is always 0 which means that we don't + * have a DNS stat object for it so this code will do nothing until libevent + * is fixed. */ dns_stats_t *dns_stats = get_dns_stats_by_type(type); /* Unsupported DNS query type. */ if (!dns_stats) { diff --git a/src/feature/stats/rephist.h b/src/feature/stats/rephist.h index 7f414de4c8..6918ed18c2 100644 --- a/src/feature/stats/rephist.h +++ b/src/feature/stats/rephist.h @@ -89,6 +89,8 @@ uint64_t rep_hist_get_n_dns_request(int type); void rep_hist_note_dns_request(int type); void rep_hist_note_dns_error(int type, uint8_t error);
+void rep_hist_consensus_has_changed(const networkstatus_t *ns); + extern uint64_t rephist_total_alloc; extern uint32_t rephist_total_num; #ifdef TOR_UNIT_TESTS diff --git a/src/test/test_stats.c b/src/test/test_stats.c index 081ae22cd5..e85ad40699 100644 --- a/src/test/test_stats.c +++ b/src/test/test_stats.c @@ -51,6 +51,8 @@ #include "feature/stats/bw_array_st.h" #include "feature/relay/router.h"
+#include <event2/dns.h> + /** Run unit tests for some stats code. */ static void test_stats(void *arg) @@ -865,6 +867,81 @@ test_overload_stats(void *arg) tor_free(stats_str); }
+/** Test the overload stats logic. */ +static void +test_overload_dns_timeout(void *arg) +{ + char *stats_str = NULL; + (void) arg; + + /* Lets simulate a series of timeouts but below our default 1% threshold. */ + + for (int i = 0; i < 1000; i++) { + /* This should trigger 9 timeouts which is just below 1% (10) */ + if (i > 0 && !(i % 100)) { + rep_hist_note_dns_error(0, DNS_ERR_TIMEOUT); + } else { + rep_hist_note_dns_error(0, DNS_ERR_NONE); + } + } + + /* No overload yet. */ + stats_str = rep_hist_get_overload_general_line(); + tt_assert(!stats_str); + + /* Move it 10 minutes in the future and see if we get a general overload. */ + update_approx_time(approx_time() + (10 * 60)); + + /* This query should NOT trigger the general overload because we are below + * our default of 1%. */ + rep_hist_note_dns_error(0, DNS_ERR_NONE); + stats_str = rep_hist_get_overload_general_line(); + tt_assert(!stats_str); + + /* We'll now go above our 1% threshold. */ + for (int i = 0; i < 1000; i++) { + /* This should trigger 10 timeouts which is our threshold of 1% (10) */ + if (!(i % 10)) { + rep_hist_note_dns_error(0, DNS_ERR_TIMEOUT); + } else { + rep_hist_note_dns_error(0, DNS_ERR_NONE); + } + } + + /* Move it 10 minutes in the future and see if we get a general overload. */ + update_approx_time(approx_time() + (10 * 60)); + + /* This query should trigger the general overload because we are above 1%. */ + rep_hist_note_dns_error(0, DNS_ERR_NONE); + stats_str = rep_hist_get_overload_general_line(); + tt_assert(stats_str); + tor_free(stats_str); + + /* Move 72h in the future, we should NOT get an overload anymore. */ + update_approx_time(approx_time() + (72 * 3600)); + + stats_str = rep_hist_get_overload_general_line(); + tt_assert(!stats_str); + + /* This query should NOT trigger the general overload. */ + rep_hist_note_dns_error(0, DNS_ERR_TIMEOUT); + stats_str = rep_hist_get_overload_general_line(); + tt_assert(!stats_str); + + /* Move it 10 minutes in the future and see if we get a general overload. We + * have now 100% of requests timing out. */ + update_approx_time(approx_time() + (10 * 60)); + + /* This query should trigger the general overload with 50% of timeouts. */ + rep_hist_note_dns_error(0, DNS_ERR_NONE); + stats_str = rep_hist_get_overload_general_line(); + tt_assert(stats_str); + tor_free(stats_str); + + done: + tor_free(stats_str); +} + #define ENT(name) \ { #name, test_ ## name , 0, NULL, NULL } #define FORK(name) \ @@ -881,6 +958,7 @@ struct testcase_t stats_tests[] = { FORK(rephist_v3_onions), FORK(load_stats_file), FORK(overload_stats), + FORK(overload_dns_timeout),
END_OF_TESTCASES };