[tor-commits] [tor/master] relay: Add the OOM invocation metrics

dgoulet at torproject.org dgoulet at torproject.org
Wed May 12 16:03:06 UTC 2021


commit 9c2fa3498233bbb5f347a03188433c6c5f6de24f
Author: David Goulet <dgoulet at torproject.org>
Date:   Thu Apr 15 14:23:47 2021 -0400

    relay: Add the OOM invocation metrics
    
    With this commit, a relay now emits metrics event on the MetricsPort
    related to the OOM invocation for:
    
      - DNS cache
      - GeoIP database
      - Cell queues
      - HSDir caches
    
    Everytime the OOM is invoked, the number of bytes is added to the
    metrics counter for that specific type of invocation.
    
    Related to #40367
    
    Signed-off-by: David Goulet <dgoulet at torproject.org>
---
 src/core/or/circuitlist.c         | 12 ++++---
 src/core/or/circuitlist.h         |  2 +-
 src/core/or/relay.c               | 22 +++++++++---
 src/core/or/relay.h               |  5 +++
 src/feature/relay/relay_metrics.c | 74 ++++++++++++++++++++++++++++++++++++++-
 src/feature/relay/relay_metrics.h | 10 +++---
 6 files changed, 108 insertions(+), 17 deletions(-)

diff --git a/src/core/or/circuitlist.c b/src/core/or/circuitlist.c
index 4f62284e29..46be358dec 100644
--- a/src/core/or/circuitlist.c
+++ b/src/core/or/circuitlist.c
@@ -2586,8 +2586,10 @@ conns_compare_by_buffer_age_(const void **a_, const void **b_)
 
 /** We're out of memory for cells, having allocated <b>current_allocation</b>
  * bytes' worth.  Kill the 'worst' circuits until we're under
- * FRACTION_OF_DATA_TO_RETAIN_ON_OOM of our maximum usage. */
-void
+ * FRACTION_OF_DATA_TO_RETAIN_ON_OOM of our maximum usage.
+ *
+ * Return the number of bytes removed. */
+size_t
 circuits_handle_oom(size_t current_allocation)
 {
   smartlist_t *circlist;
@@ -2613,12 +2615,11 @@ circuits_handle_oom(size_t current_allocation)
              tor_zstd_get_total_allocation(),
              tor_lzma_get_total_allocation(),
              hs_cache_get_total_allocation());
-
   {
     size_t mem_target = (size_t)(get_options()->MaxMemInQueues *
                                  FRACTION_OF_DATA_TO_RETAIN_ON_OOM);
     if (current_allocation <= mem_target)
-      return;
+      return 0;
     mem_to_recover = current_allocation - mem_target;
   }
 
@@ -2697,7 +2698,6 @@ circuits_handle_oom(size_t current_allocation)
   } SMARTLIST_FOREACH_END(circ);
 
  done_recovering_mem:
-
   log_notice(LD_GENERAL, "Removed %"TOR_PRIuSZ" bytes by killing %d circuits; "
              "%d circuits remain alive. Also killed %d non-linked directory "
              "connections.",
@@ -2705,6 +2705,8 @@ circuits_handle_oom(size_t current_allocation)
              n_circuits_killed,
              smartlist_len(circlist) - n_circuits_killed,
              n_dirconns_killed);
+
+  return mem_recovered;
 }
 
 /** Verify that circuit <b>c</b> has all of its invariants
diff --git a/src/core/or/circuitlist.h b/src/core/or/circuitlist.h
index f5791d7c12..147e2cb2f8 100644
--- a/src/core/or/circuitlist.h
+++ b/src/core/or/circuitlist.h
@@ -232,7 +232,7 @@ int circuit_count_pending_on_channel(channel_t *chan);
 
 MOCK_DECL(void, assert_circuit_ok,(const circuit_t *c));
 void circuit_free_all(void);
-void circuits_handle_oom(size_t current_allocation);
+size_t circuits_handle_oom(size_t current_allocation);
 
 void circuit_clear_testing_cell_stats(circuit_t *circ);
 
diff --git a/src/core/or/relay.c b/src/core/or/relay.c
index 2248b2c180..7e1f1dc43d 100644
--- a/src/core/or/relay.c
+++ b/src/core/or/relay.c
@@ -2701,11 +2701,18 @@ cell_queues_get_total_allocation(void)
 /** The time at which we were last low on memory. */
 static time_t last_time_under_memory_pressure = 0;
 
+/** Statistics on how many bytes were removed by the OOM per type. */
+uint64_t oom_stats_n_bytes_removed_dns = 0;
+uint64_t oom_stats_n_bytes_removed_cell = 0;
+uint64_t oom_stats_n_bytes_removed_geoip = 0;
+uint64_t oom_stats_n_bytes_removed_hsdir = 0;
+
 /** Check whether we've got too much space used for cells.  If so,
  * call the OOM handler and return 1.  Otherwise, return 0. */
 STATIC int
 cell_queues_check_size(void)
 {
+  size_t removed = 0;
   time_t now = time(NULL);
   size_t alloc = cell_queues_get_total_allocation();
   alloc += half_streams_get_total_allocation();
@@ -2730,20 +2737,27 @@ cell_queues_check_size(void)
       if (hs_cache_total > get_options()->MaxMemInQueues / 5) {
         const size_t bytes_to_remove =
           hs_cache_total - (size_t)(get_options()->MaxMemInQueues / 10);
-        alloc -= hs_cache_handle_oom(now, bytes_to_remove);
+        removed = hs_cache_handle_oom(now, bytes_to_remove);
+        oom_stats_n_bytes_removed_hsdir += removed;
+        alloc -= removed;
       }
       if (geoip_client_cache_total > get_options()->MaxMemInQueues / 5) {
         const size_t bytes_to_remove =
           geoip_client_cache_total -
           (size_t)(get_options()->MaxMemInQueues / 10);
-        alloc -= geoip_client_cache_handle_oom(now, bytes_to_remove);
+        removed = geoip_client_cache_handle_oom(now, bytes_to_remove);
+        oom_stats_n_bytes_removed_geoip += removed;
+        alloc -= removed;
       }
       if (dns_cache_total > get_options()->MaxMemInQueues / 5) {
         const size_t bytes_to_remove =
           dns_cache_total - (size_t)(get_options()->MaxMemInQueues / 10);
-        alloc -= dns_cache_handle_oom(now, bytes_to_remove);
+        removed = dns_cache_handle_oom(now, bytes_to_remove);
+        oom_stats_n_bytes_removed_dns += removed;
+        alloc -= removed;
       }
-      circuits_handle_oom(alloc);
+      removed = circuits_handle_oom(alloc);
+      oom_stats_n_bytes_removed_cell += removed;
       return 1;
     }
   }
diff --git a/src/core/or/relay.h b/src/core/or/relay.h
index 2f337d5d16..eac920f491 100644
--- a/src/core/or/relay.h
+++ b/src/core/or/relay.h
@@ -49,6 +49,11 @@ extern uint64_t stats_n_data_bytes_packaged;
 extern uint64_t stats_n_data_cells_received;
 extern uint64_t stats_n_data_bytes_received;
 
+extern uint64_t oom_stats_n_bytes_removed_dns;
+extern uint64_t oom_stats_n_bytes_removed_cell;
+extern uint64_t oom_stats_n_bytes_removed_geoip;
+extern uint64_t oom_stats_n_bytes_removed_hsdir;
+
 void dump_cell_pool_usage(int severity);
 size_t packed_cell_mem_cost(void);
 
diff --git a/src/feature/relay/relay_metrics.c b/src/feature/relay/relay_metrics.c
index 3d392847e1..2686249548 100644
--- a/src/feature/relay/relay_metrics.c
+++ b/src/feature/relay/relay_metrics.c
@@ -10,6 +10,9 @@
 
 #include "orconfig.h"
 
+#include "core/or/or.h"
+#include "core/or/relay.h"
+
 #include "lib/malloc/malloc.h"
 #include "lib/container/smartlist.h"
 #include "lib/metrics/metrics_store.h"
@@ -17,16 +20,78 @@
 
 #include "feature/relay/relay_metrics.h"
 
+/** Declarations of each fill function for metrics defined in base_metrics. */
+static void fill_oom_values(void);
+
 /** The base metrics that is a static array of metrics added to the metrics
  * store.
  *
  * The key member MUST be also the index of the entry in the array. */
-static const relay_metrics_entry_t base_metrics[] = {};
+static const relay_metrics_entry_t base_metrics[] =
+{
+  {
+    .key = RELAY_METRICS_NUM_OOM_BYTES,
+    .type = METRICS_TYPE_COUNTER,
+    .name = METRICS_NAME(relay_load_oom_bytes_total),
+    .help = "Total number of bytes the OOM has freed by subsystem",
+    .fill_fn = fill_oom_values,
+  },
+};
 static const size_t num_base_metrics = ARRAY_LENGTH(base_metrics);
 
 /** The only and single store of all the relay metrics. */
 static metrics_store_t *the_store;
 
+/** Fill function for the RELAY_METRICS_NUM_OOM_BYTES metrics. */
+static void
+fill_oom_values(void)
+{
+  metrics_store_entry_t *sentry;
+  const relay_metrics_entry_t *rentry =
+    &base_metrics[RELAY_METRICS_NUM_OOM_BYTES];
+
+  sentry = metrics_store_add(the_store, rentry->type, rentry->name,
+                             rentry->help);
+  metrics_store_entry_add_label(sentry, "subsys=cell");
+  metrics_store_entry_update(sentry, oom_stats_n_bytes_removed_cell);
+
+  sentry = metrics_store_add(the_store, rentry->type, rentry->name,
+                             rentry->help);
+  metrics_store_entry_add_label(sentry, "subsys=dns");
+  metrics_store_entry_update(sentry, oom_stats_n_bytes_removed_dns);
+
+  sentry = metrics_store_add(the_store, rentry->type, rentry->name,
+                             rentry->help);
+  metrics_store_entry_add_label(sentry, "subsys=geoip");
+  metrics_store_entry_update(sentry, oom_stats_n_bytes_removed_geoip);
+
+  sentry = metrics_store_add(the_store, rentry->type, rentry->name,
+                             rentry->help);
+  metrics_store_entry_add_label(sentry, "subsys=hsdir");
+  metrics_store_entry_update(sentry, oom_stats_n_bytes_removed_hsdir);
+}
+
+/** Reset the global store and fill it with all the metrics from base_metrics
+ * and their associated values.
+ *
+ * To pull this off, every metrics has a "fill" function that is called and in
+ * charge of adding the metrics to the store, appropriate labels and finally
+ * updating the value to report. */
+static void
+fill_store(void)
+{
+  /* Reset the current store, we are about to fill it with all the things. */
+  metrics_store_reset(the_store);
+
+  /* Call the fill function for each metrics. */
+  for (size_t i = 0; i < num_base_metrics; i++) {
+    if (BUG(!base_metrics[i].fill_fn)) {
+      continue;
+    }
+    base_metrics[i].fill_fn();
+  }
+}
+
 /** Return a list of all the relay metrics stores. This is the
  * function attached to the .get_metrics() member of the subsys_t. */
 const smartlist_t *
@@ -36,6 +101,13 @@ relay_metrics_get_stores(void)
    * simply update it. */
   static smartlist_t *stores_list = NULL;
 
+  /* We dynamically fill the store with all the metrics upon a request. The
+   * reason for this is because the exposed metrics of a relay are often
+   * internal counters in the fast path and thus we fetch the value when a
+   * metrics port request arrives instead of keeping a local metrics store of
+   * those values. */
+  fill_store();
+
   if (!stores_list) {
     stores_list = smartlist_new();
     smartlist_add(stores_list, the_store);
diff --git a/src/feature/relay/relay_metrics.h b/src/feature/relay/relay_metrics.h
index 7bc4760916..7d644badd3 100644
--- a/src/feature/relay/relay_metrics.h
+++ b/src/feature/relay/relay_metrics.h
@@ -12,13 +12,11 @@
 #include "lib/container/smartlist.h"
 #include "lib/metrics/metrics_common.h"
 
-#ifdef RELAY_METRICS_ENTRY_PRIVATE
-
 /** Metrics key for each reported metrics. This key is also used as an index in
  * the base_metrics array. */
 typedef enum {
-  /* XXX So code compiles. */
-  PLACEHOLDER = 0,
+  /** Number of OOM invocation. */
+  RELAY_METRICS_NUM_OOM_BYTES = 0,
 } relay_metrics_key_t;
 
 /** The metadata of a relay metric. */
@@ -31,10 +29,10 @@ typedef struct relay_metrics_entry_t {
   const char *name;
   /* Metrics output help comment. */
   const char *help;
+  /* Update value function. */
+  void (*fill_fn)(void);
 } relay_metrics_entry_t;
 
-#endif /* RELAY_METRICS_ENTRY_PRIVATE */
-
 /* Init. */
 void relay_metrics_init(void);
 void relay_metrics_free(void);





More information about the tor-commits mailing list