commit 9c2fa3498233bbb5f347a03188433c6c5f6de24f Author: David Goulet dgoulet@torproject.org Date: Thu Apr 15 14:23:47 2021 -0400
relay: Add the OOM invocation metrics
With this commit, a relay now emits metrics event on the MetricsPort related to the OOM invocation for:
- DNS cache - GeoIP database - Cell queues - HSDir caches
Everytime the OOM is invoked, the number of bytes is added to the metrics counter for that specific type of invocation.
Related to #40367
Signed-off-by: David Goulet dgoulet@torproject.org --- src/core/or/circuitlist.c | 12 ++++--- src/core/or/circuitlist.h | 2 +- src/core/or/relay.c | 22 +++++++++--- src/core/or/relay.h | 5 +++ src/feature/relay/relay_metrics.c | 74 ++++++++++++++++++++++++++++++++++++++- src/feature/relay/relay_metrics.h | 10 +++--- 6 files changed, 108 insertions(+), 17 deletions(-)
diff --git a/src/core/or/circuitlist.c b/src/core/or/circuitlist.c index 4f62284e29..46be358dec 100644 --- a/src/core/or/circuitlist.c +++ b/src/core/or/circuitlist.c @@ -2586,8 +2586,10 @@ conns_compare_by_buffer_age_(const void **a_, const void **b_)
/** We're out of memory for cells, having allocated <b>current_allocation</b> * bytes' worth. Kill the 'worst' circuits until we're under - * FRACTION_OF_DATA_TO_RETAIN_ON_OOM of our maximum usage. */ -void + * FRACTION_OF_DATA_TO_RETAIN_ON_OOM of our maximum usage. + * + * Return the number of bytes removed. */ +size_t circuits_handle_oom(size_t current_allocation) { smartlist_t *circlist; @@ -2613,12 +2615,11 @@ circuits_handle_oom(size_t current_allocation) tor_zstd_get_total_allocation(), tor_lzma_get_total_allocation(), hs_cache_get_total_allocation()); - { size_t mem_target = (size_t)(get_options()->MaxMemInQueues * FRACTION_OF_DATA_TO_RETAIN_ON_OOM); if (current_allocation <= mem_target) - return; + return 0; mem_to_recover = current_allocation - mem_target; }
@@ -2697,7 +2698,6 @@ circuits_handle_oom(size_t current_allocation) } SMARTLIST_FOREACH_END(circ);
done_recovering_mem: - log_notice(LD_GENERAL, "Removed %"TOR_PRIuSZ" bytes by killing %d circuits; " "%d circuits remain alive. Also killed %d non-linked directory " "connections.", @@ -2705,6 +2705,8 @@ circuits_handle_oom(size_t current_allocation) n_circuits_killed, smartlist_len(circlist) - n_circuits_killed, n_dirconns_killed); + + return mem_recovered; }
/** Verify that circuit <b>c</b> has all of its invariants diff --git a/src/core/or/circuitlist.h b/src/core/or/circuitlist.h index f5791d7c12..147e2cb2f8 100644 --- a/src/core/or/circuitlist.h +++ b/src/core/or/circuitlist.h @@ -232,7 +232,7 @@ int circuit_count_pending_on_channel(channel_t *chan);
MOCK_DECL(void, assert_circuit_ok,(const circuit_t *c)); void circuit_free_all(void); -void circuits_handle_oom(size_t current_allocation); +size_t circuits_handle_oom(size_t current_allocation);
void circuit_clear_testing_cell_stats(circuit_t *circ);
diff --git a/src/core/or/relay.c b/src/core/or/relay.c index 2248b2c180..7e1f1dc43d 100644 --- a/src/core/or/relay.c +++ b/src/core/or/relay.c @@ -2701,11 +2701,18 @@ cell_queues_get_total_allocation(void) /** The time at which we were last low on memory. */ static time_t last_time_under_memory_pressure = 0;
+/** Statistics on how many bytes were removed by the OOM per type. */ +uint64_t oom_stats_n_bytes_removed_dns = 0; +uint64_t oom_stats_n_bytes_removed_cell = 0; +uint64_t oom_stats_n_bytes_removed_geoip = 0; +uint64_t oom_stats_n_bytes_removed_hsdir = 0; + /** Check whether we've got too much space used for cells. If so, * call the OOM handler and return 1. Otherwise, return 0. */ STATIC int cell_queues_check_size(void) { + size_t removed = 0; time_t now = time(NULL); size_t alloc = cell_queues_get_total_allocation(); alloc += half_streams_get_total_allocation(); @@ -2730,20 +2737,27 @@ cell_queues_check_size(void) if (hs_cache_total > get_options()->MaxMemInQueues / 5) { const size_t bytes_to_remove = hs_cache_total - (size_t)(get_options()->MaxMemInQueues / 10); - alloc -= hs_cache_handle_oom(now, bytes_to_remove); + removed = hs_cache_handle_oom(now, bytes_to_remove); + oom_stats_n_bytes_removed_hsdir += removed; + alloc -= removed; } if (geoip_client_cache_total > get_options()->MaxMemInQueues / 5) { const size_t bytes_to_remove = geoip_client_cache_total - (size_t)(get_options()->MaxMemInQueues / 10); - alloc -= geoip_client_cache_handle_oom(now, bytes_to_remove); + removed = geoip_client_cache_handle_oom(now, bytes_to_remove); + oom_stats_n_bytes_removed_geoip += removed; + alloc -= removed; } if (dns_cache_total > get_options()->MaxMemInQueues / 5) { const size_t bytes_to_remove = dns_cache_total - (size_t)(get_options()->MaxMemInQueues / 10); - alloc -= dns_cache_handle_oom(now, bytes_to_remove); + removed = dns_cache_handle_oom(now, bytes_to_remove); + oom_stats_n_bytes_removed_dns += removed; + alloc -= removed; } - circuits_handle_oom(alloc); + removed = circuits_handle_oom(alloc); + oom_stats_n_bytes_removed_cell += removed; return 1; } } diff --git a/src/core/or/relay.h b/src/core/or/relay.h index 2f337d5d16..eac920f491 100644 --- a/src/core/or/relay.h +++ b/src/core/or/relay.h @@ -49,6 +49,11 @@ extern uint64_t stats_n_data_bytes_packaged; extern uint64_t stats_n_data_cells_received; extern uint64_t stats_n_data_bytes_received;
+extern uint64_t oom_stats_n_bytes_removed_dns; +extern uint64_t oom_stats_n_bytes_removed_cell; +extern uint64_t oom_stats_n_bytes_removed_geoip; +extern uint64_t oom_stats_n_bytes_removed_hsdir; + void dump_cell_pool_usage(int severity); size_t packed_cell_mem_cost(void);
diff --git a/src/feature/relay/relay_metrics.c b/src/feature/relay/relay_metrics.c index 3d392847e1..2686249548 100644 --- a/src/feature/relay/relay_metrics.c +++ b/src/feature/relay/relay_metrics.c @@ -10,6 +10,9 @@
#include "orconfig.h"
+#include "core/or/or.h" +#include "core/or/relay.h" + #include "lib/malloc/malloc.h" #include "lib/container/smartlist.h" #include "lib/metrics/metrics_store.h" @@ -17,16 +20,78 @@
#include "feature/relay/relay_metrics.h"
+/** Declarations of each fill function for metrics defined in base_metrics. */ +static void fill_oom_values(void); + /** The base metrics that is a static array of metrics added to the metrics * store. * * The key member MUST be also the index of the entry in the array. */ -static const relay_metrics_entry_t base_metrics[] = {}; +static const relay_metrics_entry_t base_metrics[] = +{ + { + .key = RELAY_METRICS_NUM_OOM_BYTES, + .type = METRICS_TYPE_COUNTER, + .name = METRICS_NAME(relay_load_oom_bytes_total), + .help = "Total number of bytes the OOM has freed by subsystem", + .fill_fn = fill_oom_values, + }, +}; static const size_t num_base_metrics = ARRAY_LENGTH(base_metrics);
/** The only and single store of all the relay metrics. */ static metrics_store_t *the_store;
+/** Fill function for the RELAY_METRICS_NUM_OOM_BYTES metrics. */ +static void +fill_oom_values(void) +{ + metrics_store_entry_t *sentry; + const relay_metrics_entry_t *rentry = + &base_metrics[RELAY_METRICS_NUM_OOM_BYTES]; + + sentry = metrics_store_add(the_store, rentry->type, rentry->name, + rentry->help); + metrics_store_entry_add_label(sentry, "subsys=cell"); + metrics_store_entry_update(sentry, oom_stats_n_bytes_removed_cell); + + sentry = metrics_store_add(the_store, rentry->type, rentry->name, + rentry->help); + metrics_store_entry_add_label(sentry, "subsys=dns"); + metrics_store_entry_update(sentry, oom_stats_n_bytes_removed_dns); + + sentry = metrics_store_add(the_store, rentry->type, rentry->name, + rentry->help); + metrics_store_entry_add_label(sentry, "subsys=geoip"); + metrics_store_entry_update(sentry, oom_stats_n_bytes_removed_geoip); + + sentry = metrics_store_add(the_store, rentry->type, rentry->name, + rentry->help); + metrics_store_entry_add_label(sentry, "subsys=hsdir"); + metrics_store_entry_update(sentry, oom_stats_n_bytes_removed_hsdir); +} + +/** Reset the global store and fill it with all the metrics from base_metrics + * and their associated values. + * + * To pull this off, every metrics has a "fill" function that is called and in + * charge of adding the metrics to the store, appropriate labels and finally + * updating the value to report. */ +static void +fill_store(void) +{ + /* Reset the current store, we are about to fill it with all the things. */ + metrics_store_reset(the_store); + + /* Call the fill function for each metrics. */ + for (size_t i = 0; i < num_base_metrics; i++) { + if (BUG(!base_metrics[i].fill_fn)) { + continue; + } + base_metrics[i].fill_fn(); + } +} + /** Return a list of all the relay metrics stores. This is the * function attached to the .get_metrics() member of the subsys_t. */ const smartlist_t * @@ -36,6 +101,13 @@ relay_metrics_get_stores(void) * simply update it. */ static smartlist_t *stores_list = NULL;
+ /* We dynamically fill the store with all the metrics upon a request. The + * reason for this is because the exposed metrics of a relay are often + * internal counters in the fast path and thus we fetch the value when a + * metrics port request arrives instead of keeping a local metrics store of + * those values. */ + fill_store(); + if (!stores_list) { stores_list = smartlist_new(); smartlist_add(stores_list, the_store); diff --git a/src/feature/relay/relay_metrics.h b/src/feature/relay/relay_metrics.h index 7bc4760916..7d644badd3 100644 --- a/src/feature/relay/relay_metrics.h +++ b/src/feature/relay/relay_metrics.h @@ -12,13 +12,11 @@ #include "lib/container/smartlist.h" #include "lib/metrics/metrics_common.h"
-#ifdef RELAY_METRICS_ENTRY_PRIVATE - /** Metrics key for each reported metrics. This key is also used as an index in * the base_metrics array. */ typedef enum { - /* XXX So code compiles. */ - PLACEHOLDER = 0, + /** Number of OOM invocation. */ + RELAY_METRICS_NUM_OOM_BYTES = 0, } relay_metrics_key_t;
/** The metadata of a relay metric. */ @@ -31,10 +29,10 @@ typedef struct relay_metrics_entry_t { const char *name; /* Metrics output help comment. */ const char *help; + /* Update value function. */ + void (*fill_fn)(void); } relay_metrics_entry_t;
-#endif /* RELAY_METRICS_ENTRY_PRIVATE */ - /* Init. */ void relay_metrics_init(void); void relay_metrics_free(void);