commit 4f80a3bb07df8d742cb79f8ca918734412159422
Author: Arturo Filastò <arturo(a)filasto.net>
Date: Thu Sep 8 16:17:28 2016 +0200
Implement retry of director startup with bounded binary exponential backoff
---
ooni/director.py | 7 +++----
ooni/geoip.py | 8 +++++++-
ooni/ui/web/server.py | 29 ++++++++++++++++++++++++++++-
3 files changed, 38 insertions(+), 6 deletions(-)
diff --git a/ooni/director.py b/ooni/director.py
index cb2bdf0..1807e65 100644
--- a/ooni/director.py
+++ b/ooni/director.py
@@ -203,11 +203,9 @@ class Director(object):
yield probe_ip.lookup()
self.notify(DirectorEvent("success", "Looked up probe IP"))
- self.notify(DirectorEvent("success",
- "Running system tasks"))
+ self.notify(DirectorEvent("success", "Running system tasks"))
yield run_system_tasks(no_input_store=not create_input_store)
- self.notify(DirectorEvent("success",
- "Ran system tasks"))
+ self.notify(DirectorEvent("success", "Ran system tasks"))
@defer.inlineCallbacks
def start(self, start_tor=False, check_incoherences=True,
@@ -218,6 +216,7 @@ class Director(object):
self._director_starting.callback(self._director_state)
except Exception as exc:
self._director_starting.errback(Failure(exc))
+ raise
@property
def measurementSuccessRatio(self):
diff --git a/ooni/geoip.py b/ooni/geoip.py
index f271790..40fad25 100644
--- a/ooni/geoip.py
+++ b/ooni/geoip.py
@@ -181,12 +181,17 @@ class ProbeIP(object):
self._state = INITIAL
self._looking_up = defer.Deferred()
self._looking_up.addCallback(self._looked_up)
+ self._looking_up.addErrback(self._lookup_failed)
def _looked_up(self, result):
self._last_lookup = time.time()
self._reset_state()
return result
+ def _lookup_failed(self, failure):
+ self._reset_state()
+ return failure
+
def resolveGeodata(self):
from ooni.settings import config
@@ -227,8 +232,9 @@ class ProbeIP(object):
self.resolveGeodata()
self._looking_up.callback(self.address)
defer.returnValue(self.address)
- except Exception:
+ except Exception as exc:
log.msg("Unable to lookup the probe IP via GeoIPService")
+ self._looking_up.errback(defer.failure.Failure(exc))
raise
@defer.inlineCallbacks
diff --git a/ooni/ui/web/server.py b/ooni/ui/web/server.py
index 26bfd47..aed9951 100644
--- a/ooni/ui/web/server.py
+++ b/ooni/ui/web/server.py
@@ -4,6 +4,7 @@ import os
import json
import errno
import string
+import random
from functools import wraps
from random import SystemRandom
@@ -150,6 +151,7 @@ class WebUIAPI(object):
_enable_xsrf_protection = True
def __init__(self, config, director, scheduler, _reactor=reactor):
+ self._reactor = reactor
self.director = director
self.scheduler = scheduler
@@ -165,6 +167,12 @@ class WebUIAPI(object):
self._director_started = False
self._is_initialized = config.is_initialized()
+ # We use exponential backoff to trigger retries of the startup of
+ # the director.
+ self._director_startup_retries = 0
+ # Maximum delay should be 6 hours
+ self._director_max_retry_delay = 6*60*60
+
self.status_poller = LongPoller(
self._long_polling_timeout, _reactor)
self.director_event_poller = LongPoller(
@@ -179,9 +187,11 @@ class WebUIAPI(object):
self.start_director()
def start_director(self):
+ log.debug("Starting director")
d = self.director.start()
d.addCallback(self.director_started)
+ d.addErrback(self.director_startup_failed)
d.addBoth(lambda _: self.status_poller.notify())
@property
@@ -208,7 +218,24 @@ class WebUIAPI(object):
log.debug("Handling event {0}".format(event.type))
self.director_event_poller.notify(event)
+ def director_startup_failed(self, failure):
+ self._director_startup_retries += 1
+
+ # We delay the startup using binary exponential backoff with an
+ # upper bound.
+ startup_delay = random.uniform(
+ 0, min(2**self._director_startup_retries,
+ self._director_max_retry_delay)
+ )
+ log.err("Failed to start the director, "
+ "retrying in {0}s".format(startup_delay))
+ self._reactor.callLater(
+ startup_delay,
+ self.start_director
+ )
+
def director_started(self, _):
+ log.debug("Started director")
self._director_started = True
@app.handle_errors(NotFound)
@@ -435,7 +462,7 @@ class WebUIAPI(object):
deck.load(deck_data)
self.run_deck(deck)
- except errors.MissingRequiredOption, option_name:
+ except errors.MissingRequiredOption as option_name:
raise WebUIError(
400, 'Missing required option: "{}"'.format(option_name)
)