commit f29d158330a8c6bcc91b71a888db741766135aaf
Author: David Goulet <dgoulet(a)torproject.org>
Date: Tue Feb 6 15:40:17 2018 -0500
relay: Avoid connecting to down relays
If we failed to connect at the TCP level to a relay, note it down and refuse
to connect again for another 60 seconds.
Fixes #24767
Signed-off-by: David Goulet <dgoulet(a)torproject.org>
---
changes/bug24767 | 5 ++
src/or/connection_or.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 229 insertions(+)
diff --git a/changes/bug24767 b/changes/bug24767
new file mode 100644
index 000000000..56fbe51a9
--- /dev/null
+++ b/changes/bug24767
@@ -0,0 +1,5 @@
+ o Major bugfixes (relay, connection):
+ - Refuse to connect again to a relay from which we failed previously with
+ a connection refused, timeout or error (at the TCP level). The relay
+ won't be retried for 60 seconds after the failure occured. Fixes bug
+ 24767; bugfix on 0.0.6.
diff --git a/src/or/connection_or.c b/src/or/connection_or.c
index 272a086a3..976889ad0 100644
--- a/src/or/connection_or.c
+++ b/src/or/connection_or.c
@@ -1122,6 +1122,216 @@ connection_or_group_set_badness_(smartlist_t *group, int force)
} SMARTLIST_FOREACH_END(or_conn);
}
+/* Lifetime of a connection failure. After that, we'll retry. This is in
+ * seconds. */
+#define OR_CONNECT_FAILURE_LIFETIME 60
+/* The interval to use with when to clean up the failure cache. */
+#define OR_CONNECT_FAILURE_CLEANUP_INTERVAL 60
+
+/* When is the next time we have to cleanup the failure map. We keep this
+ * because we clean it opportunistically. */
+static time_t or_connect_failure_map_next_cleanup_ts = 0;
+
+/* OR connection failure entry data structure. It is kept in the connection
+ * failure map defined below and indexed by OR identity digest, address and
+ * port.
+ *
+ * We need to identify a connection failure with these three values because we
+ * want to avoid to wrongfully blacklist a relay if someone is trying to
+ * extend to a known identity digest but with the wrong IP/port. For instance,
+ * it can happen if a relay changed its port but the client still has an old
+ * descriptor with the old port. We want to stop connecting to that
+ * IP/port/identity all together, not only the relay identity. */
+typedef struct or_connect_failure_entry_t {
+ HT_ENTRY(or_connect_failure_entry_t) node;
+ /* Identity digest of the connection where it is connecting to. */
+ uint8_t identity_digest[DIGEST_LEN];
+ /* This is the connection address from the base connection_t. After the
+ * connection is checked for canonicity, the base address should represent
+ * what we know instead of where we are connecting to. This is what we need
+ * so we can correlate known relays within the consensus. */
+ tor_addr_t addr;
+ uint16_t port;
+ /* Last time we were unable to connect. */
+ time_t last_failed_connect_ts;
+} or_connect_failure_entry_t;
+
+/* Map where we keep connection failure entries. They are indexed by addr,
+ * port and identity digest. */
+static HT_HEAD(or_connect_failure_ht, or_connect_failure_entry_t)
+ or_connect_failures_map = HT_INITIALIZER();
+
+/* Helper: Hashtable equal function. Return 1 if equal else 0. */
+static int
+or_connect_failure_ht_eq(const or_connect_failure_entry_t *a,
+ const or_connect_failure_entry_t *b)
+{
+ return fast_memeq(a->identity_digest, b->identity_digest, DIGEST_LEN) &&
+ tor_addr_eq(&a->addr, &b->addr) &&
+ a->port == b->port;
+}
+
+/* Helper: Return the hash for the hashtable of the given entry. For this
+ * table, it is a combination of address, port and identity digest. */
+static unsigned int
+or_connect_failure_ht_hash(const or_connect_failure_entry_t *entry)
+{
+ size_t offset = 0, addr_size;
+ const void *addr_ptr;
+ /* Largest size is IPv6 and IPv4 is smaller so it is fine. */
+ uint8_t data[16 + sizeof(uint16_t) + DIGEST_LEN];
+
+ /* Get the right address bytes depending on the family. */
+ switch (tor_addr_family(&entry->addr)) {
+ case AF_INET:
+ addr_size = 4;
+ addr_ptr = &entry->addr.addr.in_addr.s_addr;
+ break;
+ case AF_INET6:
+ addr_size = 16;
+ addr_ptr = &entry->addr.addr.in6_addr.s6_addr;
+ break;
+ default:
+ tor_assert_nonfatal_unreached();
+ return 0;
+ }
+
+ memcpy(data, addr_ptr, addr_size);
+ offset += addr_size;
+ memcpy(data + offset, entry->identity_digest, DIGEST_LEN);
+ offset += DIGEST_LEN;
+ set_uint16(data + offset, entry->port);
+ offset += sizeof(uint16_t);
+
+ return (unsigned int) siphash24g(data, offset);
+}
+
+HT_PROTOTYPE(or_connect_failure_ht, or_connect_failure_entry_t, node,
+ or_connect_failure_ht_hash, or_connect_failure_ht_eq)
+
+HT_GENERATE2(or_connect_failure_ht, or_connect_failure_entry_t, node,
+ or_connect_failure_ht_hash, or_connect_failure_ht_eq,
+ 0.6, tor_reallocarray_, tor_free_)
+
+/* Initialize a given connect failure entry with the given identity_digest,
+ * addr and port. All field are optional except ocf. */
+static void
+or_connect_failure_init(const char *identity_digest, const tor_addr_t *addr,
+ uint16_t port, or_connect_failure_entry_t *ocf)
+{
+ tor_assert(ocf);
+ if (identity_digest) {
+ memcpy(ocf->identity_digest, identity_digest,
+ sizeof(ocf->identity_digest));
+ }
+ if (addr) {
+ tor_addr_copy(&ocf->addr, addr);
+ }
+ ocf->port = port;
+}
+
+/* Return a newly allocated connection failure entry. It is initialized with
+ * the given or_conn data. This can't fail. */
+static or_connect_failure_entry_t *
+or_connect_failure_new(const or_connection_t *or_conn)
+{
+ or_connect_failure_entry_t *ocf = tor_malloc_zero(sizeof(*ocf));
+ or_connect_failure_init(or_conn->identity_digest, &or_conn->real_addr,
+ TO_CONN(or_conn)->port, ocf);
+ return ocf;
+}
+
+/* Return a connection failure entry matching the given or_conn. NULL is
+ * returned if not found. */
+static or_connect_failure_entry_t *
+or_connect_failure_find(const or_connection_t *or_conn)
+{
+ or_connect_failure_entry_t lookup;
+ tor_assert(or_conn);
+ or_connect_failure_init(or_conn->identity_digest, &TO_CONN(or_conn)->addr,
+ TO_CONN(or_conn)->port, &lookup);
+ return HT_FIND(or_connect_failure_ht, &or_connect_failures_map, &lookup);
+}
+
+/* Note down in the connection failure cache that a failure occurred on the
+ * given or_conn. */
+static void
+note_or_connect_failed(const or_connection_t *or_conn)
+{
+ or_connect_failure_entry_t *ocf = NULL;
+
+ tor_assert(or_conn);
+
+ ocf = or_connect_failure_find(or_conn);
+ if (ocf == NULL) {
+ ocf = or_connect_failure_new(or_conn);
+ HT_INSERT(or_connect_failure_ht, &or_connect_failures_map, ocf);
+ }
+ ocf->last_failed_connect_ts = approx_time();
+}
+
+/* Cleanup the connection failure cache and remove all entries below the
+ * given cutoff. */
+static void
+or_connect_failure_map_cleanup(time_t cutoff)
+{
+ or_connect_failure_entry_t **ptr, **next, *entry;
+
+ for (ptr = HT_START(or_connect_failure_ht, &or_connect_failures_map);
+ ptr != NULL; ptr = next) {
+ entry = *ptr;
+ if (entry->last_failed_connect_ts <= cutoff) {
+ next = HT_NEXT_RMV(or_connect_failure_ht, &or_connect_failures_map, ptr);
+ tor_free(entry);
+ } else {
+ next = HT_NEXT(or_connect_failure_ht, &or_connect_failures_map, ptr);
+ }
+ }
+}
+
+/* Return true iff the given OR connection can connect to its destination that
+ * is the triplet identity_digest, address and port.
+ *
+ * The or_conn MUST have gone through connection_or_check_canonicity() so the
+ * base address is properly set to what we know or doesn't know. */
+static int
+should_connect_to_relay(const or_connection_t *or_conn)
+{
+ time_t now, cutoff;
+ time_t connect_failed_since_ts = 0;
+ or_connect_failure_entry_t *ocf;
+
+ tor_assert(or_conn);
+
+ now = approx_time();
+ cutoff = now - OR_CONNECT_FAILURE_LIFETIME;
+
+ /* Opportunistically try to cleanup the failure cache. We do that at regular
+ * interval so it doesn't grow too big. */
+ if (or_connect_failure_map_next_cleanup_ts <= now) {
+ or_connect_failure_map_cleanup(cutoff);
+ or_connect_failure_map_next_cleanup_ts =
+ now + OR_CONNECT_FAILURE_CLEANUP_INTERVAL;
+ }
+
+ /* Look if we have failed previously to the same destination as this
+ * OR connection. */
+ ocf = or_connect_failure_find(or_conn);
+ if (ocf) {
+ connect_failed_since_ts = ocf->last_failed_connect_ts;
+ }
+ /* If we do have an unable to connect timestamp and it is below cutoff, we
+ * can connect. Or we have never failed before so let it connect. */
+ if (connect_failed_since_ts > cutoff) {
+ goto no_connect;
+ }
+
+ /* Ok we can connect! */
+ return 1;
+ no_connect:
+ return 0;
+}
+
/** <b>conn</b> is in the 'connecting' state, and it failed to complete
* a TCP connection. Send notifications appropriately.
*
@@ -1135,6 +1345,7 @@ connection_or_connect_failed(or_connection_t *conn,
control_event_or_conn_status(conn, OR_CONN_EVENT_FAILED, reason);
if (!authdir_mode_tests_reachability(get_options()))
control_event_bootstrap_prob_or(msg, reason, conn);
+ note_or_connect_failed(conn);
}
/** <b>conn</b> got an error in connection_handle_read_impl() or
@@ -1225,6 +1436,19 @@ connection_or_connect, (const tor_addr_t *_addr, uint16_t port,
conn->chan = chan;
chan->conn = conn;
connection_or_init_conn_from_address(conn, &addr, port, id_digest, ed_id, 1);
+
+ /* We have a proper OR connection setup, now check if we can connect to it
+ * that is we haven't had a failure earlier. This is to avoid to try to
+ * constantly connect to relays that we think are not reachable. */
+ if (!should_connect_to_relay(conn)) {
+ log_info(LD_GENERAL, "Can't connect to identity %s at %s:%u because we "
+ "failed earlier. Refusing.",
+ hex_str(id_digest, DIGEST_LEN), fmt_addr(&TO_CONN(conn)->addr),
+ TO_CONN(conn)->port);
+ connection_free_(TO_CONN(conn));
+ return NULL;
+ }
+
connection_or_change_state(conn, OR_CONN_STATE_CONNECTING);
control_event_or_conn_status(conn, OR_CONN_EVENT_LAUNCHED, 0);