[doctor/master] Properly suppress staggered messages

28 Sep 2014

commit 1bfdf45380ef0232bd563e18ccaaaeda9fd2b178
Author: Damian Johnson <atagar@torproject.org>
Date:   Sun Sep 28 12:14:28 2014 -0700

    Properly suppress staggered messages
    
    We updated a message's suppression time if and only if its duration had
    expired. Take the following scenario with a suppression time of six hours
    per message...
    
      2:00 - issue A comes up
      4:00 - issue B comes up
    
    At 4:00 an email is sent having both issue A and B, so the proper thing would
    be to suppress both until 10:00. However, message A's suppression time wasn't
    being updated so we'd email at both 8:00 and 10:00 instead.
    
    This should greatly cut down on the amount of noise since all messages have a
    default suppression time.
---
 consensus_health_checker.py |   95 ++++++++++++++++++++++++++-----------------
 1 file changed, 58 insertions(+), 37 deletions(-)

diff --git a/consensus_health_checker.py b/consensus_health_checker.py
index 30693d5..54c50da 100755
--- a/consensus_health_checker.py
+++ b/consensus_health_checker.py
@@ -85,6 +85,27 @@ class Issue(object):
     return self._runlevel
 
   @lru_cache()
+  def get_suppression_key(self):
+    """
+    Provides the key used for issue suppression.
+
+    :returns: **str** used for the configuration key of this issue in the
+      suppressions file
+    """
+
+    if self._template == 'TOO_MANY_UNMEASURED_RELAYS':
+      # Hack because this message has too much dynamic data to be effectively
+      # suppressed. Hate doing this here, so better would be to make this a
+      # config property.
+
+      attr = dict(self._attr)
+      attr.update({'unmeasured': 0, 'total': 0, 'percentage': 0})
+
+      return CONFIG['msg'][self._template].format(**attr).replace(' ', '_')
+    else:
+      return self.get_message().replace(' ', '_')
+
+  @lru_cache()
   def get_suppression_duration(self):
     """
     Provides the number of hours we should suppress this message after it has
@@ -115,42 +136,49 @@ class Issue(object):
     return "%s: %s" % (self.get_runlevel(), self.get_message())
 
 
-def rate_limit_notice(key, hours = 0, days = 0):
+def is_rate_limited(issue):
   """
   Check if we have sent a notice with this key within a given period of time.
-  If we have then this returns **False**, otherwise this records the fact that
-  we're sending the message now and returns **True**.
 
-  :param str key: unique identifier for this notification
-  :param int hours: number of hours to suppress this message for after being sent
-  :param int days: number of days to suppress this message for after being sent
+  :param Issue issue: issue to check the suppression status for
   """
 
-  if hours == 0 and days == 0:
-    return True
-
-  config = stem.util.conf.get_config('last_notified')
-  config_path = util.get_path('data', 'last_notified.cfg')
+  key = issue.get_suppression_key()
+  hours = issue.get_suppression_duration()
 
-  try:
-    config.clear()
-    config.load(config_path)
-  except:
-    pass
+  if hours == 0:
+    return True
 
   current_time = int(time.time())
-  last_seen = config.get(key, 0)
-  suppression_time = (3600 * hours) + (86400 * days)
+  last_seen = stem.util.conf.get_config('last_notified').get(key, 0)
+  suppression_time = 3600 * hours
   suppression_time += 1800  # adding a half hour so timing doesn't coinside with our hourly cron
   suppression_time_remaining = suppression_time - (current_time - last_seen)
 
   if suppression_time_remaining <= 0:
-    config.set(key, str(current_time), overwrite = True)
-    config.save(config_path)
-    return True
+    return False
   else:
     log.info("Suppressing %s, time remaining is %i hours" % (key, (suppression_time_remaining / 3600) + 1))
-    return False
+    return True
+
+
+def rate_limit_notice(issue):
+  """
+  Record that this notice is being sent, so further runs will take this into
+  account for rate limitation.
+
+  :param Issue issue: issue to update the suppression status for
+  """
+
+  key = issue.get_suppression_key()
+  hours = issue.get_suppression_duration()
+
+  if hours == 0:
+    return
+
+  config = stem.util.conf.get_config('last_notified')
+  config.set(key, str(int(time.time())), overwrite = True)
+  config.save()
 
 
 @lru_cache()
@@ -166,6 +194,9 @@ def main():
   config = stem.util.conf.get_config("consensus_health")
   config.load(util.get_path('data', 'consensus_health.cfg'))
 
+  config = stem.util.conf.get_config('last_notified')
+  config.load(util.get_path('data', 'last_notified.cfg'))
+
   consensuses, consensus_fetching_issues = get_consensuses()
   votes, vote_fetching_issues = get_votes()
   issues = consensus_fetching_issues + vote_fetching_issues
@@ -178,26 +209,16 @@ def main():
   is_all_suppressed = True  # either no issues or they're all already suppressed
 
   for issue in issues:
-    if issue._template == 'TOO_MANY_UNMEASURED_RELAYS':
-      # Hack because this message has too much dynamic data to be effectively
-      # suppressed. Hate doing this here, so better would be to make this a
-      # config property.
-
-      attr = dict(issue._attr)
-      attr.update({'unmeasured': 0, 'total': 0, 'percentage': 0})
-
-      key = CONFIG['msg'][issue._template].format(**attr).replace(' ', '_')
-    else:
-      key = issue.get_message().replace(' ', '_')
-
-    duration = issue.get_suppression_duration()
-
-    if rate_limit_notice(key, duration):
+    if not is_rate_limited(issue):
       is_all_suppressed = False
+      break
 
   if not is_all_suppressed:
     log.debug("Sending notification for issues")
 
+    for issue in issues:
+      rate_limit_notice(issue)
+
     if TEST_RUN:
       print '\n'.join(map(str, issues))
     else:

    

atagar＠torproject.org

tags

participants (1)