commit f29d158330a8c6bcc91b71a888db741766135aaf Author: David Goulet dgoulet@torproject.org Date: Tue Feb 6 15:40:17 2018 -0500
relay: Avoid connecting to down relays
If we failed to connect at the TCP level to a relay, note it down and refuse to connect again for another 60 seconds.
Fixes #24767
Signed-off-by: David Goulet dgoulet@torproject.org --- changes/bug24767 | 5 ++ src/or/connection_or.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 229 insertions(+)
diff --git a/changes/bug24767 b/changes/bug24767 new file mode 100644 index 000000000..56fbe51a9 --- /dev/null +++ b/changes/bug24767 @@ -0,0 +1,5 @@ + o Major bugfixes (relay, connection): + - Refuse to connect again to a relay from which we failed previously with + a connection refused, timeout or error (at the TCP level). The relay + won't be retried for 60 seconds after the failure occured. Fixes bug + 24767; bugfix on 0.0.6. diff --git a/src/or/connection_or.c b/src/or/connection_or.c index 272a086a3..976889ad0 100644 --- a/src/or/connection_or.c +++ b/src/or/connection_or.c @@ -1122,6 +1122,216 @@ connection_or_group_set_badness_(smartlist_t *group, int force) } SMARTLIST_FOREACH_END(or_conn); }
+/* Lifetime of a connection failure. After that, we'll retry. This is in + * seconds. */ +#define OR_CONNECT_FAILURE_LIFETIME 60 +/* The interval to use with when to clean up the failure cache. */ +#define OR_CONNECT_FAILURE_CLEANUP_INTERVAL 60 + +/* When is the next time we have to cleanup the failure map. We keep this + * because we clean it opportunistically. */ +static time_t or_connect_failure_map_next_cleanup_ts = 0; + +/* OR connection failure entry data structure. It is kept in the connection + * failure map defined below and indexed by OR identity digest, address and + * port. + * + * We need to identify a connection failure with these three values because we + * want to avoid to wrongfully blacklist a relay if someone is trying to + * extend to a known identity digest but with the wrong IP/port. For instance, + * it can happen if a relay changed its port but the client still has an old + * descriptor with the old port. We want to stop connecting to that + * IP/port/identity all together, not only the relay identity. */ +typedef struct or_connect_failure_entry_t { + HT_ENTRY(or_connect_failure_entry_t) node; + /* Identity digest of the connection where it is connecting to. */ + uint8_t identity_digest[DIGEST_LEN]; + /* This is the connection address from the base connection_t. After the + * connection is checked for canonicity, the base address should represent + * what we know instead of where we are connecting to. This is what we need + * so we can correlate known relays within the consensus. */ + tor_addr_t addr; + uint16_t port; + /* Last time we were unable to connect. */ + time_t last_failed_connect_ts; +} or_connect_failure_entry_t; + +/* Map where we keep connection failure entries. They are indexed by addr, + * port and identity digest. */ +static HT_HEAD(or_connect_failure_ht, or_connect_failure_entry_t) + or_connect_failures_map = HT_INITIALIZER(); + +/* Helper: Hashtable equal function. Return 1 if equal else 0. */ +static int +or_connect_failure_ht_eq(const or_connect_failure_entry_t *a, + const or_connect_failure_entry_t *b) +{ + return fast_memeq(a->identity_digest, b->identity_digest, DIGEST_LEN) && + tor_addr_eq(&a->addr, &b->addr) && + a->port == b->port; +} + +/* Helper: Return the hash for the hashtable of the given entry. For this + * table, it is a combination of address, port and identity digest. */ +static unsigned int +or_connect_failure_ht_hash(const or_connect_failure_entry_t *entry) +{ + size_t offset = 0, addr_size; + const void *addr_ptr; + /* Largest size is IPv6 and IPv4 is smaller so it is fine. */ + uint8_t data[16 + sizeof(uint16_t) + DIGEST_LEN]; + + /* Get the right address bytes depending on the family. */ + switch (tor_addr_family(&entry->addr)) { + case AF_INET: + addr_size = 4; + addr_ptr = &entry->addr.addr.in_addr.s_addr; + break; + case AF_INET6: + addr_size = 16; + addr_ptr = &entry->addr.addr.in6_addr.s6_addr; + break; + default: + tor_assert_nonfatal_unreached(); + return 0; + } + + memcpy(data, addr_ptr, addr_size); + offset += addr_size; + memcpy(data + offset, entry->identity_digest, DIGEST_LEN); + offset += DIGEST_LEN; + set_uint16(data + offset, entry->port); + offset += sizeof(uint16_t); + + return (unsigned int) siphash24g(data, offset); +} + +HT_PROTOTYPE(or_connect_failure_ht, or_connect_failure_entry_t, node, + or_connect_failure_ht_hash, or_connect_failure_ht_eq) + +HT_GENERATE2(or_connect_failure_ht, or_connect_failure_entry_t, node, + or_connect_failure_ht_hash, or_connect_failure_ht_eq, + 0.6, tor_reallocarray_, tor_free_) + +/* Initialize a given connect failure entry with the given identity_digest, + * addr and port. All field are optional except ocf. */ +static void +or_connect_failure_init(const char *identity_digest, const tor_addr_t *addr, + uint16_t port, or_connect_failure_entry_t *ocf) +{ + tor_assert(ocf); + if (identity_digest) { + memcpy(ocf->identity_digest, identity_digest, + sizeof(ocf->identity_digest)); + } + if (addr) { + tor_addr_copy(&ocf->addr, addr); + } + ocf->port = port; +} + +/* Return a newly allocated connection failure entry. It is initialized with + * the given or_conn data. This can't fail. */ +static or_connect_failure_entry_t * +or_connect_failure_new(const or_connection_t *or_conn) +{ + or_connect_failure_entry_t *ocf = tor_malloc_zero(sizeof(*ocf)); + or_connect_failure_init(or_conn->identity_digest, &or_conn->real_addr, + TO_CONN(or_conn)->port, ocf); + return ocf; +} + +/* Return a connection failure entry matching the given or_conn. NULL is + * returned if not found. */ +static or_connect_failure_entry_t * +or_connect_failure_find(const or_connection_t *or_conn) +{ + or_connect_failure_entry_t lookup; + tor_assert(or_conn); + or_connect_failure_init(or_conn->identity_digest, &TO_CONN(or_conn)->addr, + TO_CONN(or_conn)->port, &lookup); + return HT_FIND(or_connect_failure_ht, &or_connect_failures_map, &lookup); +} + +/* Note down in the connection failure cache that a failure occurred on the + * given or_conn. */ +static void +note_or_connect_failed(const or_connection_t *or_conn) +{ + or_connect_failure_entry_t *ocf = NULL; + + tor_assert(or_conn); + + ocf = or_connect_failure_find(or_conn); + if (ocf == NULL) { + ocf = or_connect_failure_new(or_conn); + HT_INSERT(or_connect_failure_ht, &or_connect_failures_map, ocf); + } + ocf->last_failed_connect_ts = approx_time(); +} + +/* Cleanup the connection failure cache and remove all entries below the + * given cutoff. */ +static void +or_connect_failure_map_cleanup(time_t cutoff) +{ + or_connect_failure_entry_t **ptr, **next, *entry; + + for (ptr = HT_START(or_connect_failure_ht, &or_connect_failures_map); + ptr != NULL; ptr = next) { + entry = *ptr; + if (entry->last_failed_connect_ts <= cutoff) { + next = HT_NEXT_RMV(or_connect_failure_ht, &or_connect_failures_map, ptr); + tor_free(entry); + } else { + next = HT_NEXT(or_connect_failure_ht, &or_connect_failures_map, ptr); + } + } +} + +/* Return true iff the given OR connection can connect to its destination that + * is the triplet identity_digest, address and port. + * + * The or_conn MUST have gone through connection_or_check_canonicity() so the + * base address is properly set to what we know or doesn't know. */ +static int +should_connect_to_relay(const or_connection_t *or_conn) +{ + time_t now, cutoff; + time_t connect_failed_since_ts = 0; + or_connect_failure_entry_t *ocf; + + tor_assert(or_conn); + + now = approx_time(); + cutoff = now - OR_CONNECT_FAILURE_LIFETIME; + + /* Opportunistically try to cleanup the failure cache. We do that at regular + * interval so it doesn't grow too big. */ + if (or_connect_failure_map_next_cleanup_ts <= now) { + or_connect_failure_map_cleanup(cutoff); + or_connect_failure_map_next_cleanup_ts = + now + OR_CONNECT_FAILURE_CLEANUP_INTERVAL; + } + + /* Look if we have failed previously to the same destination as this + * OR connection. */ + ocf = or_connect_failure_find(or_conn); + if (ocf) { + connect_failed_since_ts = ocf->last_failed_connect_ts; + } + /* If we do have an unable to connect timestamp and it is below cutoff, we + * can connect. Or we have never failed before so let it connect. */ + if (connect_failed_since_ts > cutoff) { + goto no_connect; + } + + /* Ok we can connect! */ + return 1; + no_connect: + return 0; +} + /** <b>conn</b> is in the 'connecting' state, and it failed to complete * a TCP connection. Send notifications appropriately. * @@ -1135,6 +1345,7 @@ connection_or_connect_failed(or_connection_t *conn, control_event_or_conn_status(conn, OR_CONN_EVENT_FAILED, reason); if (!authdir_mode_tests_reachability(get_options())) control_event_bootstrap_prob_or(msg, reason, conn); + note_or_connect_failed(conn); }
/** <b>conn</b> got an error in connection_handle_read_impl() or @@ -1225,6 +1436,19 @@ connection_or_connect, (const tor_addr_t *_addr, uint16_t port, conn->chan = chan; chan->conn = conn; connection_or_init_conn_from_address(conn, &addr, port, id_digest, ed_id, 1); + + /* We have a proper OR connection setup, now check if we can connect to it + * that is we haven't had a failure earlier. This is to avoid to try to + * constantly connect to relays that we think are not reachable. */ + if (!should_connect_to_relay(conn)) { + log_info(LD_GENERAL, "Can't connect to identity %s at %s:%u because we " + "failed earlier. Refusing.", + hex_str(id_digest, DIGEST_LEN), fmt_addr(&TO_CONN(conn)->addr), + TO_CONN(conn)->port); + connection_free_(TO_CONN(conn)); + return NULL; + } + connection_or_change_state(conn, OR_CONN_STATE_CONNECTING); control_event_or_conn_status(conn, OR_CONN_EVENT_LAUNCHED, 0);