commit 1bfdf45380ef0232bd563e18ccaaaeda9fd2b178
Author: Damian Johnson <atagar(a)torproject.org>
Date: Sun Sep 28 12:14:28 2014 -0700
Properly suppress staggered messages
We updated suppression times if and only if its duration had expired. Take the
following scenario with the suppression time of six hours per message...
2:00 - issue A comes up
4:00 - issue B comes up
At 4:00 an email is sent having both issue A and B, so the proper thing would
be to suppress both until 10:00. However, message A's suppression time wasn't
being updated so we'd email at both 8:00 and 10:00 instead.
This should greatly cut down on the amount of noise since all messages have a
default suppression time.
---
consensus_health_checker.py | 95 ++++++++++++++++++++++++++-----------------
1 file changed, 58 insertions(+), 37 deletions(-)
diff --git a/consensus_health_checker.py b/consensus_health_checker.py
index 30693d5..54c50da 100755
--- a/consensus_health_checker.py
+++ b/consensus_health_checker.py
@@ -85,6 +85,27 @@ class Issue(object):
return self._runlevel
@lru_cache()
+ def get_suppression_key(self):
+ """
+ Provides the key used for issue suppression.
+
+ :returns: **str** used for the configuration key of this issue in the
+ suppressions file
+ """
+
+ if self._template == 'TOO_MANY_UNMEASURED_RELAYS':
+ # Hack because this message has too much dynamic data to be effectively
+ # suppressed. Hate doing this here, so better would be to make this a
+ # config property.
+
+ attr = dict(self._attr)
+ attr.update({'unmeasured': 0, 'total': 0, 'percentage': 0})
+
+ return CONFIG['msg'][self._template].format(**attr).replace(' ', '_')
+ else:
+ return self.get_message().replace(' ', '_')
+
+ @lru_cache()
def get_suppression_duration(self):
"""
Provides the number of hours we should suppress this message after it has
@@ -115,42 +136,49 @@ class Issue(object):
return "%s: %s" % (self.get_runlevel(), self.get_message())
-def rate_limit_notice(key, hours = 0, days = 0):
+def is_rate_limited(issue):
"""
Check if we have sent a notice with this key within a given period of time.
- If we have then this returns **False**, otherwise this records the fact that
- we're sending the message now and returns **True**.
- :param str key: unique identifier for this notification
- :param int hours: number of hours to suppress this message for after being sent
- :param int days: number of days to suppress this message for after being sent
+ :param Issue issue: issue to check the suppression status for
"""
- if hours == 0 and days == 0:
- return True
-
- config = stem.util.conf.get_config('last_notified')
- config_path = util.get_path('data', 'last_notified.cfg')
+ key = issue.get_suppression_key()
+ hours = issue.get_suppression_duration()
- try:
- config.clear()
- config.load(config_path)
- except:
- pass
+ if hours == 0:
+ return True
current_time = int(time.time())
- last_seen = config.get(key, 0)
- suppression_time = (3600 * hours) + (86400 * days)
+ last_seen = stem.util.conf.get_config('last_notified').get(key, 0)
+ suppression_time = 3600 * hours
suppression_time += 1800 # adding a half hour so timing doesn't coinside with our hourly cron
suppression_time_remaining = suppression_time - (current_time - last_seen)
if suppression_time_remaining <= 0:
- config.set(key, str(current_time), overwrite = True)
- config.save(config_path)
- return True
+ return False
else:
log.info("Suppressing %s, time remaining is %i hours" % (key, (suppression_time_remaining / 3600) + 1))
- return False
+ return True
+
+
+def rate_limit_notice(issue):
+ """
+ Record that this notice is being sent, so further runs will take this into
+ account for rate limitation.
+
+ :param Issue issue: issue to update the suppression status for
+ """
+
+ key = issue.get_suppression_key()
+ hours = issue.get_suppression_duration()
+
+ if hours == 0:
+ return
+
+ config = stem.util.conf.get_config('last_notified')
+ config.set(key, str(int(time.time())), overwrite = True)
+ config.save()
@lru_cache()
@@ -166,6 +194,9 @@ def main():
config = stem.util.conf.get_config("consensus_health")
config.load(util.get_path('data', 'consensus_health.cfg'))
+ config = stem.util.conf.get_config('last_notified')
+ config.load(util.get_path('data', 'last_notified.cfg'))
+
consensuses, consensus_fetching_issues = get_consensuses()
votes, vote_fetching_issues = get_votes()
issues = consensus_fetching_issues + vote_fetching_issues
@@ -178,26 +209,16 @@ def main():
is_all_suppressed = True # either no issues or they're all already suppressed
for issue in issues:
- if issue._template == 'TOO_MANY_UNMEASURED_RELAYS':
- # Hack because this message has too much dynamic data to be effectively
- # suppressed. Hate doing this here, so better would be to make this a
- # config property.
-
- attr = dict(issue._attr)
- attr.update({'unmeasured': 0, 'total': 0, 'percentage': 0})
-
- key = CONFIG['msg'][issue._template].format(**attr).replace(' ', '_')
- else:
- key = issue.get_message().replace(' ', '_')
-
- duration = issue.get_suppression_duration()
-
- if rate_limit_notice(key, duration):
+ if not is_rate_limited(issue):
is_all_suppressed = False
+ break
if not is_all_suppressed:
log.debug("Sending notification for issues")
+ for issue in issues:
+ rate_limit_notice(issue)
+
if TEST_RUN:
print '\n'.join(map(str, issues))
else: