[tor-commits] [ooni-probe/master] Implement retry of director startup with bounded exponential binary backoff

art at torproject.org art at torproject.org
Mon Sep 19 12:14:25 UTC 2016


commit 4f80a3bb07df8d742cb79f8ca918734412159422
Author: Arturo Filastò <arturo at filasto.net>
Date:   Thu Sep 8 16:17:28 2016 +0200

    Implement retry of director startup with bounded exponential binary backoff
---
 ooni/director.py      |  7 +++----
 ooni/geoip.py         |  8 +++++++-
 ooni/ui/web/server.py | 29 ++++++++++++++++++++++++++++-
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/ooni/director.py b/ooni/director.py
index cb2bdf0..1807e65 100644
--- a/ooni/director.py
+++ b/ooni/director.py
@@ -203,11 +203,9 @@ class Director(object):
             yield probe_ip.lookup()
             self.notify(DirectorEvent("success", "Looked up probe IP"))
 
-        self.notify(DirectorEvent("success",
-                                  "Running system tasks"))
+        self.notify(DirectorEvent("success", "Running system tasks"))
         yield run_system_tasks(no_input_store=not create_input_store)
-        self.notify(DirectorEvent("success",
-                                  "Ran system tasks"))
+        self.notify(DirectorEvent("success", "Ran system tasks"))
 
     @defer.inlineCallbacks
     def start(self, start_tor=False, check_incoherences=True,
@@ -218,6 +216,7 @@ class Director(object):
             self._director_starting.callback(self._director_state)
         except Exception as exc:
             self._director_starting.errback(Failure(exc))
+            raise
 
     @property
     def measurementSuccessRatio(self):
diff --git a/ooni/geoip.py b/ooni/geoip.py
index f271790..40fad25 100644
--- a/ooni/geoip.py
+++ b/ooni/geoip.py
@@ -181,12 +181,17 @@ class ProbeIP(object):
         self._state = INITIAL
         self._looking_up = defer.Deferred()
         self._looking_up.addCallback(self._looked_up)
+        self._looking_up.addErrback(self._lookup_failed)
 
     def _looked_up(self, result):
         self._last_lookup = time.time()
         self._reset_state()
         return result
 
+    def _lookup_failed(self, failure):
+        self._reset_state()
+        return failure
+
     def resolveGeodata(self):
         from ooni.settings import config
 
@@ -227,8 +232,9 @@ class ProbeIP(object):
                 self.resolveGeodata()
                 self._looking_up.callback(self.address)
                 defer.returnValue(self.address)
-            except Exception:
+            except Exception as exc:
                 log.msg("Unable to lookup the probe IP via GeoIPService")
+                self._looking_up.errback(defer.failure.Failure(exc))
                 raise
 
     @defer.inlineCallbacks
diff --git a/ooni/ui/web/server.py b/ooni/ui/web/server.py
index 26bfd47..aed9951 100644
--- a/ooni/ui/web/server.py
+++ b/ooni/ui/web/server.py
@@ -4,6 +4,7 @@ import os
 import json
 import errno
 import string
+import random
 from functools import wraps
 from random import SystemRandom
 
@@ -150,6 +151,7 @@ class WebUIAPI(object):
     _enable_xsrf_protection = True
 
     def __init__(self, config, director, scheduler, _reactor=reactor):
+        self._reactor = reactor
         self.director = director
         self.scheduler = scheduler
 
@@ -165,6 +167,12 @@ class WebUIAPI(object):
         self._director_started = False
         self._is_initialized = config.is_initialized()
 
+        # We use exponential backoff to trigger retries of the startup of
+        # the director.
+        self._director_startup_retries = 0
+        # Maximum delay should be 6 hours
+        self._director_max_retry_delay = 6*60*60
+
         self.status_poller = LongPoller(
             self._long_polling_timeout, _reactor)
         self.director_event_poller = LongPoller(
@@ -179,9 +187,11 @@ class WebUIAPI(object):
             self.start_director()
 
     def start_director(self):
+        log.debug("Starting director")
         d = self.director.start()
 
         d.addCallback(self.director_started)
+        d.addErrback(self.director_startup_failed)
         d.addBoth(lambda _: self.status_poller.notify())
 
     @property
@@ -208,7 +218,24 @@ class WebUIAPI(object):
         log.debug("Handling event {0}".format(event.type))
         self.director_event_poller.notify(event)
 
+    def director_startup_failed(self, failure):
+        self._director_startup_retries += 1
+
+        # We delay the startup using binary exponential backoff with an
+        # upper bound.
+        startup_delay = random.uniform(
+            0, min(2**self._director_startup_retries,
+                   self._director_max_retry_delay)
+        )
+        log.err("Failed to start the director, "
+                "retrying in {0}s".format(startup_delay))
+        self._reactor.callLater(
+            startup_delay,
+            self.start_director
+        )
+
     def director_started(self, _):
+        log.debug("Started director")
         self._director_started = True
 
     @app.handle_errors(NotFound)
@@ -435,7 +462,7 @@ class WebUIAPI(object):
             deck.load(deck_data)
             self.run_deck(deck)
 
-        except errors.MissingRequiredOption, option_name:
+        except errors.MissingRequiredOption as option_name:
             raise WebUIError(
                 400, 'Missing required option: "{}"'.format(option_name)
             )





More information about the tor-commits mailing list