commit f993fccf0e1ad319920a3ae446d26f3ae8b5de43
Author: Damian Johnson <atagar(a)torproject.org>
Date: Sat Oct 10 16:23:54 2015 -0700
Notification for especially high latency
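We've had ongoing latency issues with Faravahar for a while now. Adding a
notice-level check to better surface this: it flags any authority whose
consensus download takes more than five times the median. For example...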
NOTICE: Downloading the consensus from Faravahar took 74.5s. Median download
time is 3.0s: maatuska => 9.6s, tor26 => 2.6s, longclaw => 2.8s, dizum =>
2.7s, gabelmoo => 2.9s, moria1 => 3.0s, dannenberg => 3.3s, Faravahar =>
74.5s
---
consensus_health_checker.py | 17 +++++++++++++++++
data/consensus_health.cfg | 1 +
2 files changed, 18 insertions(+)
diff --git a/consensus_health_checker.py b/consensus_health_checker.py
index d762f74..d34db57 100755
--- a/consensus_health_checker.py
+++ b/consensus_health_checker.py
@@ -140,6 +140,11 @@ class Issue(object):
attr.update({'authorities': ''})
return CONFIG['msg'][self._template].format(**attr).replace(' ', '_')
+ elif self._template == 'LATENCY':
+ attr = dict(self._attr)
+ attr.update({'authority': '', 'time_taken': '', 'median_time': '', 'authority_times': ''})
+
+ return CONFIG['msg'][self._template].format(**attr).replace(' ', '_')
else:
return self.get_message().replace(' ', '_')
@@ -775,9 +780,13 @@ def _get_documents(label, resource):
validate = True,
)
+ times_taken = {}
+
for authority, query in queries.items():
try:
+ start_time = time.time()
documents[authority] = query.run()[0]
+ times_taken[authority] = time.time() - start_time
except Exception as exc:
if label == 'vote':
# try to download the vote via the other authorities
@@ -798,6 +807,14 @@ def _get_documents(label, resource):
issues.append(Issue(Runlevel.ERROR, 'AUTHORITY_UNAVAILABLE', fetch_type = label, authority = authority, url = query.download_url, error = exc, to = [authority]))
+ if label == 'consensus':
+ median_time = sorted(times_taken.values())[len(times_taken) / 2]
+ authority_times = ', '.join(['%s => %0.1fs' % (authority, time_taken) for authority, time_taken in times_taken.items()])
+
+ for authority, time_taken in times_taken.items():
+ if time_taken > median_time * 5:
+ issues.append(Issue(Runlevel.NOTICE, 'LATENCY', authority = authority, time_taken = '%0.1fs' % time_taken, median_time = '%0.1fs' % median_time, authority_times = authority_times, to = [authority]))
+
return documents, issues
diff --git a/data/consensus_health.cfg b/data/consensus_health.cfg
index 7e30fd0..2c5652a 100644
--- a/data/consensus_health.cfg
+++ b/data/consensus_health.cfg
@@ -1,5 +1,6 @@
# message templates for notifications we send
+msg LATENCY => Downloading the consensus from {authority} took {time_taken}. Median download time is {median_time}: {authority_times}
msg MISSING_LATEST_CONSENSUS => The consensuses published by the following directory authorities are more than one hour old and therefore not fresh anymore: {authorities}
msg CONSENSUS_METHOD_UNSUPPORTED => The following directory authorities do not support the consensus method that the consensus uses: {authorities}
msg DIFFERENT_RECOMMENDED_VERSION => The following directory authorities recommend other {type} versions than the consensus: {differences}