commit 4f80a3bb07df8d742cb79f8ca918734412159422 Author: Arturo Filastò <arturo@filasto.net> Date: Thu Sep 8 16:17:28 2016 +0200
Implement retry of director startup with bounded exponential binary backoff --- ooni/director.py | 7 +++---- ooni/geoip.py | 8 +++++++- ooni/ui/web/server.py | 29 ++++++++++++++++++++++++++++- 3 files changed, 38 insertions(+), 6 deletions(-)
diff --git a/ooni/director.py b/ooni/director.py index cb2bdf0..1807e65 100644 --- a/ooni/director.py +++ b/ooni/director.py @@ -203,11 +203,9 @@ class Director(object): yield probe_ip.lookup() self.notify(DirectorEvent("success", "Looked up probe IP"))
- self.notify(DirectorEvent("success", - "Running system tasks")) + self.notify(DirectorEvent("success", "Running system tasks")) yield run_system_tasks(no_input_store=not create_input_store) - self.notify(DirectorEvent("success", - "Ran system tasks")) + self.notify(DirectorEvent("success", "Ran system tasks"))
@defer.inlineCallbacks def start(self, start_tor=False, check_incoherences=True, @@ -218,6 +216,7 @@ class Director(object): self._director_starting.callback(self._director_state) except Exception as exc: self._director_starting.errback(Failure(exc)) + raise
@property def measurementSuccessRatio(self): diff --git a/ooni/geoip.py b/ooni/geoip.py index f271790..40fad25 100644 --- a/ooni/geoip.py +++ b/ooni/geoip.py @@ -181,12 +181,17 @@ class ProbeIP(object): self._state = INITIAL self._looking_up = defer.Deferred() self._looking_up.addCallback(self._looked_up) + self._looking_up.addErrback(self._lookup_failed)
def _looked_up(self, result): self._last_lookup = time.time() self._reset_state() return result
+ def _lookup_failed(self, failure): + self._reset_state() + return failure + def resolveGeodata(self): from ooni.settings import config
@@ -227,8 +232,9 @@ class ProbeIP(object): self.resolveGeodata() self._looking_up.callback(self.address) defer.returnValue(self.address) - except Exception: + except Exception as exc: log.msg("Unable to lookup the probe IP via GeoIPService") + self._looking_up.errback(defer.failure.Failure(exc)) raise
@defer.inlineCallbacks diff --git a/ooni/ui/web/server.py b/ooni/ui/web/server.py index 26bfd47..aed9951 100644 --- a/ooni/ui/web/server.py +++ b/ooni/ui/web/server.py @@ -4,6 +4,7 @@ import os import json import errno import string +import random from functools import wraps from random import SystemRandom
@@ -150,6 +151,7 @@ class WebUIAPI(object): _enable_xsrf_protection = True
def __init__(self, config, director, scheduler, _reactor=reactor): + self._reactor = _reactor self.director = director self.scheduler = scheduler
@@ -165,6 +167,12 @@ class WebUIAPI(object): self._director_started = False self._is_initialized = config.is_initialized()
+ # We use exponential backoff to trigger retries of the startup of + # the director. + self._director_startup_retries = 0 + # Maximum delay should be 6 hours + self._director_max_retry_delay = 6*60*60 + self.status_poller = LongPoller( self._long_polling_timeout, _reactor) self.director_event_poller = LongPoller( @@ -179,9 +187,11 @@ class WebUIAPI(object): self.start_director()
def start_director(self): + log.debug("Starting director") d = self.director.start()
d.addCallback(self.director_started) + d.addErrback(self.director_startup_failed) d.addBoth(lambda _: self.status_poller.notify())
@property @@ -208,7 +218,24 @@ class WebUIAPI(object): log.debug("Handling event {0}".format(event.type)) self.director_event_poller.notify(event)
+ def director_startup_failed(self, failure): + self._director_startup_retries += 1 + + # We delay the startup using binary exponential backoff with an + # upper bound. + startup_delay = random.uniform( + 0, min(2**self._director_startup_retries, + self._director_max_retry_delay) + ) + log.err("Failed to start the director, " + "retrying in {0}s".format(startup_delay)) + self._reactor.callLater( + startup_delay, + self.start_director + ) + def director_started(self, _): + log.debug("Started director") self._director_started = True
@app.handle_errors(NotFound) @@ -435,7 +462,7 @@ class WebUIAPI(object): deck.load(deck_data) self.run_deck(deck)
- except errors.MissingRequiredOption, option_name: + except errors.MissingRequiredOption as option_name: raise WebUIError( 400, 'Missing required option: "{}"'.format(option_name) )