commit 0a5ecb334298187a64f58382231245111130aa76
Author: George Kadianakis <desnacked(a)riseup.net>
Date: Tue Mar 9 15:36:40 2021 +0200
Implement backbone of overload statistics.
- Implement overload statistics structure.
- Implement function that keeps track of overload statistics.
- Implement function that writes overload statistics to descriptor.
- Unittest for the whole logic.
---
changes/bug40222 | 6 +++
src/app/config/config.c | 1 +
src/app/config/or_options_st.h | 3 ++
src/feature/relay/router.c | 6 +++
src/feature/stats/rephist.c | 116 +++++++++++++++++++++++++++++++++++++++++
src/feature/stats/rephist.h | 18 +++++++
src/test/test_stats.c | 109 ++++++++++++++++++++++++++++++++++++++
7 files changed, 259 insertions(+)
diff --git a/changes/bug40222 b/changes/bug40222
new file mode 100644
index 0000000000..8338ea4a7b
--- /dev/null
+++ b/changes/bug40222
@@ -0,0 +1,6 @@
+ o Major features (metrics):
+ - Introduce a new class of extra-info statistics that allows relays to
+ expose details of how overloaded they are. This information is controlled
+ using the OverloadStatistics torrc option, and it will be used to make
+ more informed decisions about the network's load balancing. Implements
+ proposal 328; closes ticket 40222.
diff --git a/src/app/config/config.c b/src/app/config/config.c
index 1ac460bac0..156e90a549 100644
--- a/src/app/config/config.c
+++ b/src/app/config/config.c
@@ -352,6 +352,7 @@ static const config_var_t option_vars_[] = {
V(CacheDirectoryGroupReadable, AUTOBOOL, "auto"),
V(CellStatistics, BOOL, "0"),
V(PaddingStatistics, BOOL, "1"),
+ V(OverloadStatistics, BOOL, "1"),
V(LearnCircuitBuildTimeout, BOOL, "1"),
V(CircuitBuildTimeout, INTERVAL, "0"),
OBSOLETE("CircuitIdleTimeout"),
diff --git a/src/app/config/or_options_st.h b/src/app/config/or_options_st.h
index 90302eae7b..689adbc71b 100644
--- a/src/app/config/or_options_st.h
+++ b/src/app/config/or_options_st.h
@@ -674,6 +674,9 @@ struct or_options_t {
/** If true, include statistics file contents in extra-info documents. */
int ExtraInfoStatistics;
+ /** If true, include overload statistics in extra-info documents. */
+ int OverloadStatistics;
+
/** If true, do not believe anybody who tells us that a domain resolves
* to an internal address, or that an internal address has a PTR mapping.
* Helps avoid some cross-site attacks. */
diff --git a/src/feature/relay/router.c b/src/feature/relay/router.c
index 73e8393761..88160bd1cb 100644
--- a/src/feature/relay/router.c
+++ b/src/feature/relay/router.c
@@ -3372,6 +3372,12 @@ extrainfo_dump_to_string_stats_helper(smartlist_t *chunks,
if (contents)
smartlist_add(chunks, contents);
}
+ if (options->OverloadStatistics) {
+ contents = rep_hist_get_overload_stats_lines();
+ if (contents) {
+ smartlist_add(chunks, contents);
+ }
+ }
/* bridge statistics */
if (should_record_bridge_info(options)) {
const char *bridge_stats = geoip_get_bridge_stats_extrainfo(now);
diff --git a/src/feature/stats/rephist.c b/src/feature/stats/rephist.c
index f7c9336bff..64bec000df 100644
--- a/src/feature/stats/rephist.c
+++ b/src/feature/stats/rephist.c
@@ -183,6 +183,122 @@ static time_t started_tracking_stability = 0;
/** Map from hex OR identity digest to or_history_t. */
static digestmap_t *history_map = NULL;
+/** Represents a state of overload stats.
+ *
+ * All the timestamps in this structure have already been rounded down to the
+ * nearest hour. */
+typedef struct {
+ /* When did we last experience a general overload? */
+ time_t overload_general_time;
+
+ /* When did we last experience a bandwidth-related overload? */
+ time_t overload_ratelimits_time;
+ /* How many times have we gone off the our read limits? */
+ uint64_t overload_read_count;
+ /* How many times have we gone off the our write limits? */
+ uint64_t overload_write_count;
+
+ /* When did we last experience a file descriptor exhaustion? */
+ time_t overload_fd_exhausted_time;
+ /* How many times have we experienced a file descriptor exhaustion? */
+ uint64_t overload_fd_exhausted;
+} overload_stats_t;
+
+/** Current state of overload stats */
+static overload_stats_t overload_stats;
+
+/** Return true if this overload happened within the last `n_hours`. */
+static bool
+overload_happened_recently(time_t overload_time, unsigned n_hours)
+{
+ /* An overload is relevant if it happened in the last 72 hours */
+ if (overload_time > approx_time() - 3600 * n_hours) {
+ return true;
+ }
+ return false;
+}
+
+/* The current version of the overload stats version */
+#define OVERLOAD_STATS_VERSION 1
+
+/** Returns an allocated string for extra-info documents for publishing
+ * overload statistics. */
+char *
+rep_hist_get_overload_stats_lines(void)
+{
+ char *result = NULL;
+ smartlist_t *chunks = smartlist_new();
+ char tbuf[ISO_TIME_LEN+1];
+
+ /* First encode the general overload */
+ if (overload_happened_recently(overload_stats.overload_general_time, 72)) {
+ format_iso_time(tbuf, overload_stats.overload_general_time);
+ smartlist_add_asprintf(chunks, "overload-general %d %s",
+ OVERLOAD_STATS_VERSION, tbuf);
+ }
+
+ /* Now do bandwidth-related overloads */
+ if (overload_happened_recently(overload_stats.overload_ratelimits_time,24)) {
+ const or_options_t *options = get_options();
+ format_iso_time(tbuf, overload_stats.overload_ratelimits_time);
+ smartlist_add_asprintf(chunks,
+ "overload-ratelimits %d %s %" PRIu64 " %" PRIu64
+ " %" PRIu64 " %" PRIu64,
+ OVERLOAD_STATS_VERSION, tbuf,
+ options->BandwidthRate, options->BandwidthBurst,
+ overload_stats.overload_read_count,
+ overload_stats.overload_write_count);
+ }
+
+ /* Finally file descriptor overloads */
+ if (overload_happened_recently(
+ overload_stats.overload_fd_exhausted_time, 72)) {
+ format_iso_time(tbuf, overload_stats.overload_fd_exhausted_time);
+ smartlist_add_asprintf(chunks, "overload-fd-exhausted %d %s",
+ OVERLOAD_STATS_VERSION, tbuf);
+ }
+
+ /* Bail early if we had nothing to write */
+ if (smartlist_len(chunks) == 0) {
+ goto done;
+ }
+
+ result = smartlist_join_strings(chunks, "\n", 0, NULL);
+
+ done:
+ SMARTLIST_FOREACH(chunks, char *, cp, tor_free(cp));
+ smartlist_free(chunks);
+ return result;
+}
+
+/** Round down the time in `a` to the beginning of the current hour */
+#define SET_TO_START_OF_HOUR(a) STMT_BEGIN \
+ (a) = approx_time() - (approx_time() % 3600); \
+STMT_END
+
+/** Note down an overload event of type `overload`. */
+void
+rep_hist_note_overload(overload_type_t overload)
+{
+ switch (overload) {
+ case OVERLOAD_GENERAL:
+ SET_TO_START_OF_HOUR(overload_stats.overload_general_time);
+ break;
+ case OVERLOAD_READ:
+ SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time);
+ overload_stats.overload_read_count++;
+ break;
+ case OVERLOAD_WRITE:
+ SET_TO_START_OF_HOUR(overload_stats.overload_ratelimits_time);
+ overload_stats.overload_write_count++;
+ break;
+ case OVERLOAD_FD_EXHAUSTED:
+ SET_TO_START_OF_HOUR(overload_stats.overload_fd_exhausted_time);
+ overload_stats.overload_fd_exhausted++;
+ break;
+ }
+}
+
/** Return the or_history_t for the OR with identity digest <b>id</b>,
* creating it if necessary. */
static or_history_t *
diff --git a/src/feature/stats/rephist.h b/src/feature/stats/rephist.h
index 7ecbfebfd7..45020d7967 100644
--- a/src/feature/stats/rephist.h
+++ b/src/feature/stats/rephist.h
@@ -140,6 +140,24 @@ void rep_hist_reset_padding_counts(void);
void rep_hist_prep_published_padding_counts(time_t now);
void rep_hist_padding_count_timers(uint64_t num_timers);
+/**
+ * Represents the various types of overload we keep track of and expose in our
+ * extra-info descriptor.
+*/
+typedef enum {
+ /* A general overload -- can have many different causes. */
+ OVERLOAD_GENERAL,
+ /* We went over our configured read rate/burst bandwidth limit */
+ OVERLOAD_READ,
+ /* We went over our configured write rate/burst bandwidth limit */
+ OVERLOAD_WRITE,
+ /* We exhausted the file descriptors in this system */
+ OVERLOAD_FD_EXHAUSTED,
+} overload_type_t;
+
+void rep_hist_note_overload(overload_type_t overload);
+char *rep_hist_get_overload_stats_lines(void);
+
#ifdef TOR_UNIT_TESTS
struct hs_v2_stats_t;
const struct hs_v2_stats_t *rep_hist_get_hs_v2_stats(void);
diff --git a/src/test/test_stats.c b/src/test/test_stats.c
index 617a36faba..1f14255e31 100644
--- a/src/test/test_stats.c
+++ b/src/test/test_stats.c
@@ -703,6 +703,114 @@ test_load_stats_file(void *arg)
tor_free(content);
}
+/** Test the overload stats logic. */
+static void
+test_overload_stats(void *arg)
+{
+ time_t current_time = 1010101010;
+ char *stats_str = NULL;
+ (void) arg;
+
+ /* Change time to 03-01-2002 23:36 UTC */
+ /* This should make the extrainfo timestamp be "2002-01-03 23:00:00" */
+ update_approx_time(current_time);
+
+ /* With an empty rephist we shouldn't get anything back */
+ stats_str = rep_hist_get_overload_stats_lines();
+ tt_assert(!stats_str);
+
+ /* Note a DNS overload */
+ rep_hist_note_overload(OVERLOAD_GENERAL);
+
+ /* Move the time forward one hour */
+ current_time += 3600;
+ update_approx_time(current_time);
+
+ /* Now check the string */
+ stats_str = rep_hist_get_overload_stats_lines();
+ tt_str_op("overload-general 1 2002-01-03 23:00:00", OP_EQ, stats_str);
+ tor_free(stats_str);
+
+ /* Move the time forward 72 hours: see that the line has disappeared. */
+ current_time += 3600*72;
+ update_approx_time(current_time);
+
+ stats_str = rep_hist_get_overload_stats_lines();
+ tt_assert(!stats_str);
+
+ /* Now the time should be 2002-01-07 00:00:00 */
+
+ /* Note a DNS overload */
+ rep_hist_note_overload(OVERLOAD_GENERAL);
+
+ stats_str = rep_hist_get_overload_stats_lines();
+ tt_str_op("overload-general 1 2002-01-07 00:00:00", OP_EQ, stats_str);
+ tor_free(stats_str);
+
+ /* Also note an fd exhaustion event */
+ rep_hist_note_overload(OVERLOAD_FD_EXHAUSTED);
+
+ stats_str = rep_hist_get_overload_stats_lines();
+ tt_str_op("overload-general 1 2002-01-07 00:00:00\n"
+ "overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str);
+ tor_free(stats_str);
+
+ /* Move the time forward. Register DNS overload. See that the time changed */
+ current_time += 3600*2;
+ update_approx_time(current_time);
+
+ rep_hist_note_overload(OVERLOAD_GENERAL);
+
+ stats_str = rep_hist_get_overload_stats_lines();
+ tt_str_op("overload-general 1 2002-01-07 02:00:00\n"
+ "overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str);
+ tor_free(stats_str);
+
+ /* Move the time forward. Register a bandwidth ratelimit event. See that the
+ string is added */
+ current_time += 3600*2;
+ update_approx_time(current_time);
+
+ /* Register the rate limit event */
+ rep_hist_note_overload(OVERLOAD_READ);
+ /* Also set some rate limiting values that should be reflected on the log */
+ get_options_mutable()->BandwidthRate = 1000;
+ get_options_mutable()->BandwidthBurst = 2000;
+
+ stats_str = rep_hist_get_overload_stats_lines();
+ tt_str_op("overload-general 1 2002-01-07 02:00:00\n"
+ "overload-ratelimits 1 2002-01-07 04:00:00 1000 2000 1 0\n"
+ "overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str);
+ tor_free(stats_str);
+
+ /* Move the time forward 24 hours: no rate limit line anymore. */
+ current_time += 3600*24;
+ update_approx_time(current_time);
+
+ stats_str = rep_hist_get_overload_stats_lines();
+ tt_str_op("overload-general 1 2002-01-07 02:00:00\n"
+ "overload-fd-exhausted 1 2002-01-07 00:00:00", OP_EQ, stats_str);
+ tor_free(stats_str);
+
+ /* Move the time forward 44 hours: no fd exhausted line anymore. */
+ current_time += 3600*44;
+ update_approx_time(current_time);
+
+ stats_str = rep_hist_get_overload_stats_lines();
+ tt_str_op("overload-general 1 2002-01-07 02:00:00", OP_EQ, stats_str);
+ tor_free(stats_str);
+
+ /* Move the time forward 2 hours: there is nothing left. */
+ current_time += 3600*2;
+ update_approx_time(current_time);
+
+ stats_str = rep_hist_get_overload_stats_lines();
+ tt_assert(!stats_str);
+
+ done:
+ tor_free(stats_str);
+}
+
#define ENT(name) \
{ #name, test_ ## name , 0, NULL, NULL }
#define FORK(name) \
@@ -718,6 +826,7 @@ struct testcase_t stats_tests[] = {
FORK(get_bandwidth_lines),
FORK(rephist_v3_onions),
FORK(load_stats_file),
+ FORK(overload_stats),
END_OF_TESTCASES
};