[tor-commits] [stem/master] Download helper utility

atagar at torproject.org
Sat Aug 17 20:44:27 UTC 2019


commit 6d4cbd2180d11682d7d65b6926c562155907d049
Author: Damian Johnson <atagar at torproject.org>
Date:   Thu Aug 1 19:58:07 2019 -0700

    Download helper utility
    
    Stem only raises documented exceptions, but urllib makes this difficult because
    it raises a wide variety of exceptions. Wrap its failures in a DownloadFailed
    exception that retains the original exception and stacktrace.
---
 stem/descriptor/collector.py  |  76 +++-------------------------
 stem/util/connection.py       | 112 ++++++++++++++++++++++++++++++++++++++++++
 test/integ/util/connection.py |  33 +++++++++++--
 test/unit/util/connection.py  |  43 ++++++++++++++++
 4 files changed, 192 insertions(+), 72 deletions(-)
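
For context, a minimal caller sketch of the new helper (this assumes stem 1.8's
stem.util.connection module as added below; the URL is only an example):

  import stem.util.connection

  try:
    # download() wraps urllib, so callers only need to handle DownloadFailed
    content = stem.util.connection.download('https://collector.torproject.org/index/index.json', timeout = 30, retries = 3)
    print('downloaded %i bytes' % len(content))
  except stem.util.connection.DownloadTimeout as exc:
    print('Timed out: %s' % exc)
  except stem.util.connection.DownloadFailed as exc:
    print('Download of %s failed (%s): %s' % (exc.url, type(exc.error).__name__, exc))
    print(exc.stacktrace_str)  # traceback of the original urllib failure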

diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py
index a78d60c4..d9f159e1 100644
--- a/stem/descriptor/collector.py
+++ b/stem/descriptor/collector.py
@@ -54,20 +54,13 @@ import json
 import os
 import re
 import shutil
-import sys
 import tempfile
 import time
 
+import stem.util.connection
 import stem.util.str_tools
 
 from stem.descriptor import Compression, parse_file
-from stem.util import log
-
-try:
-  # account for urllib's change between python 2.x and 3.x
-  import urllib.request as urllib
-except ImportError:
-  import urllib2 as urllib
 
 COLLECTOR_URL = 'https://collector.torproject.org/'
 REFRESH_INDEX_RATE = 3600  # get new index if cached copy is an hour old
@@ -148,12 +141,7 @@ def get_server_descriptors(start = None, end = None, cache_to = None, timeout =
     :class:`~stem.descriptor.server_descriptor.ServerDescriptor` for the given
     time range
 
-  :raises:
-    * **socket.timeout** if our request timed out
-    * **urllib2.URLError** for most request failures
-
-    Note that the urllib2 module may fail with other exception types, in
-    which case we'll pass it along.
+  :raises: :class:`~stem.util.connection.DownloadFailed` if the download fails
   """
 
   for f in get_instance().files('server-descriptor', start, end):
@@ -161,43 +149,6 @@ def get_server_descriptors(start = None, end = None, cache_to = None, timeout =
       yield desc
 
 
-def _download(url, timeout, retries):
-  """
-  Download from the given url.
-
-  :param str url: uncompressed url to download from
-  :param int timeout: timeout when connection becomes idle, no timeout applied
-    if **None**
-  :param int retires: maximum attempts to impose
-
-  :returns: content of the given url
-
-  :raises:
-    * **socket.timeout** if our request timed out
-    * **urllib2.URLError** for most request failures
-
-    Note that the urllib2 module may fail with other exception types, in
-    which case we'll pass it along.
-  """
-
-  start_time = time.time()
-
-  try:
-    return urllib.urlopen(url, timeout = timeout).read()
-  except:
-    exc = sys.exc_info()[1]
-
-    if timeout is not None:
-      timeout -= time.time() - start_time
-
-    if retries > 0 and (timeout is None or timeout > 0):
-      log.debug("Failed to download from CollecTor at '%s' (%i retries remaining): %s" % (url, retries, exc))
-      return _download(url, timeout, retries - 1)
-    else:
-      log.debug("Failed to download from CollecTor at '%s': %s" % (url, exc))
-      raise
-
-
 class File(object):
   """
   File within CollecTor.
@@ -258,11 +209,7 @@ class File(object):
     :raises:
       * **ValueError** if unable to determine the descriptor type
       * **TypeError** if we cannot parse this descriptor type
-      * **socket.timeout** if our request timed out
-      * **urllib2.URLError** for most request failures
-
-      Note that the urllib2 module may fail with other exception types, in
-      which case we'll pass it along.
+      * :class:`~stem.util.connection.DownloadFailed` if the download fails
     """
 
     if descriptor_type is None:
@@ -309,12 +256,7 @@ class File(object):
 
     :returns: **str** with the path we downloaded to
 
-    :raises:
-      * **socket.timeout** if our request timed out
-      * **urllib2.URLError** for most request failures
-
-      Note that the urllib2 module may fail with other exception types, in
-      which case we'll pass it along.
+    :raises: :class:`~stem.util.connection.DownloadFailed` if the download fails
     """
 
     # TODO: If checksums get added to the index we should replace
@@ -334,7 +276,7 @@ class File(object):
     elif os.path.exists(path):
       return path  # file already exists
 
-    response = _download(COLLECTOR_URL + self.path, timeout, retries)
+    response = stem.util.connection.download(COLLECTOR_URL + self.path, timeout, retries)
 
     if decompress:
       response = self.compression.decompress(response)
@@ -441,8 +383,7 @@ class CollecTor(object):
 
         * **ValueError** if json is malformed
         * **IOError** if unable to decompress
-        * **socket.timeout** if our request timed out
-        * **urllib2.URLError** for most request failures
+        * :class:`~stem.util.connection.DownloadFailed` if the download fails
     """
 
     if not self._cached_index or time.time() - self._cached_index_at >= REFRESH_INDEX_RATE:
@@ -456,7 +397,7 @@ class CollecTor(object):
 
       extension = compression.extension if compression != Compression.PLAINTEXT else ''
       url = COLLECTOR_URL + 'index/index.json' + extension
-      response = compression.decompress(_download(url, self.timeout, self.retries))
+      response = compression.decompress(stem.util.connection.download(url, self.timeout, self.retries))
 
       self._cached_index = json.loads(stem.util.str_tools._to_unicode(response))
       self._cached_index_at = time.time()
@@ -478,8 +419,7 @@ class CollecTor(object):
 
         * **ValueError** if json is malformed
         * **IOError** if unable to decompress
-        * **socket.timeout** if our request timed out
-        * **urllib2.URLError** for most request failures
+        * :class:`~stem.util.connection.DownloadFailed` if the download fails
     """
 
     if not self._cached_files or time.time() - self._cached_index_at >= REFRESH_INDEX_RATE:
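
With the above, collector.py's documented download failure narrows to a single
exception type. A rough caller sketch (fingerprint and published are standard
ServerDescriptor attributes, not something this patch adds):

  import stem.descriptor.collector
  import stem.util.connection

  try:
    # stream recent server descriptors from CollecTor
    for desc in stem.descriptor.collector.get_server_descriptors(timeout = 60):
      print('%s published at %s' % (desc.fingerprint, desc.published))
  except stem.util.connection.DownloadFailed as exc:
    print('CollecTor request failed: %s' % exc)
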
diff --git a/stem/util/connection.py b/stem/util/connection.py
index c23d74e7..7be7fe09 100644
--- a/stem/util/connection.py
+++ b/stem/util/connection.py
@@ -8,6 +8,10 @@ Connection and networking based utility functions.
 
 ::
 
+  DownloadFailed - Inability to download a resource.
+    +- DownloadTimeout - Download timeout reached.
+
+  download - download from a given url
   get_connections - queries the connections belonging to a given process
   system_resolvers - provides connection resolution methods that are likely to be available
   port_usage - brief description of the common usage for a port
@@ -58,6 +62,10 @@ import collections
 import os
 import platform
 import re
+import socket
+import sys
+import time
+import traceback
 
 import stem.util
 import stem.util.proc
@@ -65,6 +73,12 @@ import stem.util.system
 
 from stem.util import conf, enum, log, str_tools
 
+try:
+  # account for urllib's change between python 2.x and 3.x
+  import urllib.request as urllib
+except ImportError:
+  import urllib2 as urllib
+
 # Connection resolution is risky to log about since it's highly likely to
 # contain sensitive information. That said, it's also difficult to get right in
 # a platform independent fashion. To opt into the logging required to
@@ -162,6 +176,104 @@ class Connection(collections.namedtuple('Connection', ['local_address', 'local_p
   """
 
 
+class DownloadFailed(IOError):
+  """
+  Inability to download a resource. Python's urllib module raises
+  a wide variety of undocumented exceptions (urllib2.URLError,
+  socket.timeout, and others).
+
+  This wraps lower level failures in a common exception type that
+  retains their exception and `stacktrace
+  <https://docs.python.org/3/library/traceback.html>`_.
+
+  .. versionadded:: 1.8.0
+
+  :var str url: url we failed to download from
+  :var Exception error: original urllib exception
+  :var traceback stacktrace: original stacktrace
+  :var str stacktrace_str: string representation of the stacktrace
+  """
+
+  def __init__(self, url, error, stacktrace, message = None):
+    if message is None:
+      # The string representation of exceptions can reside in several places.
+      # urllib.URLError use a 'reason' attribute that in turn may referrence
+      # low level structures such as socket.gaierror. Whereas most exceptions
+      # use a 'message' attribute.
+
+      reason = str(error)
+
+      all_str_repr = (
+        getattr(getattr(error, 'reason', None), 'strerror', None),
+        getattr(error, 'reason', None),
+        getattr(error, 'message', None),
+      )
+
+      for str_repr in all_str_repr:
+        if str_repr and isinstance(str_repr, str):
+          reason = str_repr
+          break
+
+      message = 'Failed to download from %s (%s): %s' % (url, type(error).__name__, reason)
+
+    super(DownloadFailed, self).__init__(message)
+
+    self.url = url
+    self.error = error
+    self.stacktrace = stacktrace
+    self.stacktrace_str = ''.join(traceback.format_tb(stacktrace))
+
+
+class DownloadTimeout(DownloadFailed):
+  """
+  Timeout reached while downloading this resource.
+
+  .. versionadded:: 1.8.0
+  """
+
+  def __init__(self, url, error, stacktrace, timeout):
+    super(DownloadTimeout, self).__init__(url, error, stacktrace, 'Failed to download from %s: %0.1f second timeout reached' % (url, timeout))
+
+
+def download(url, timeout = None, retries = None):
+  """
+  Download from the given url.
+
+  .. versionadded:: 1.8.0
+
+  :param str url: uncompressed url to download from
+  :param int timeout: timeout when connection becomes idle, no timeout applied
+    if **None**
+  :param int retries: maximum attempts to impose
+
+  :returns: **bytes** content of the given url
+
+  :raises: :class:`~stem.util.connection.DownloadFailed` if the download fails
+  """
+
+  if retries is None:
+    retries = 0
+
+  start_time = time.time()
+
+  try:
+    return urllib.urlopen(url, timeout = timeout).read()
+  except socket.timeout as exc:
+    raise DownloadTimeout(url, exc, sys.exc_info()[2], timeout)
+  except:
+    exc, stacktrace = sys.exc_info()[1:3]
+
+    if timeout is not None:
+      timeout -= time.time() - start_time
+
+    if retries > 0 and (timeout is None or timeout > 0):
+      log.debug('Failed to download from %s (%i retries remaining): %s' % (url, retries, exc))
+      return download(url, timeout, retries - 1)
+    else:
+      log.debug('Failed to download from %s: %s' % (url, exc))
+      raise DownloadFailed(url, exc, stacktrace)
+
+
 def get_connections(resolver = None, process_pid = None, process_name = None):
   """
   Retrieves a list of the current connections for a given process. This
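
The download() helper above treats its timeout as an overall budget: each failed
attempt subtracts the time it took, and another attempt is made only while both
retries and budget remain. The same pattern in isolation, as an illustrative
sketch (with_retries is a made-up name, not part of stem):

  import time

  def with_retries(func, retries = 0, timeout = None):
    # call func(timeout) until it succeeds, the attempts run out,
    # or the shared timeout budget is spent
    start = time.time()

    try:
      return func(timeout)
    except Exception:
      remaining = None if timeout is None else timeout - (time.time() - start)

      if retries > 0 and (remaining is None or remaining > 0):
        return with_retries(func, retries - 1, remaining)
      else:
        raise
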
diff --git a/test/integ/util/connection.py b/test/integ/util/connection.py
index 12ce8ac4..4617fe56 100644
--- a/test/integ/util/connection.py
+++ b/test/integ/util/connection.py
@@ -5,11 +5,18 @@ that we're running.
 
 import unittest
 
+import stem.util.connection
 import stem.util.system
 import test.require
 import test.runner
 
-from stem.util.connection import RESOLVER_COMMAND, Resolver, get_connections, system_resolvers
+from stem.util.connection import Resolver
+
+try:
+  # account for urllib's change between python 2.x and 3.x
+  import urllib.request as urllib
+except ImportError:
+  import urllib2 as urllib
 
 
 class TestConnection(unittest.TestCase):
@@ -20,22 +27,40 @@ class TestConnection(unittest.TestCase):
     if test.runner.Torrc.PORT not in runner.get_options():
       self.skipTest('(no control port)')
       return
-    elif resolver not in system_resolvers():
+    elif resolver not in stem.util.connection.system_resolvers():
       self.skipTest('(resolver unavailable on this platform)')
       return
 
     with runner.get_tor_socket():
-      connections = get_connections(resolver, process_pid = runner.get_pid())
+      connections = stem.util.connection.get_connections(resolver, process_pid = runner.get_pid())
 
       for conn in connections:
         if conn.local_address == '127.0.0.1' and conn.local_port == test.runner.CONTROL_PORT:
           return
 
-      resolver_command = RESOLVER_COMMAND[resolver].format(pid = runner.get_pid())
+      resolver_command = stem.util.connection.RESOLVER_COMMAND[resolver].format(pid = runner.get_pid())
       resolver_output = stem.util.system.call(resolver_command)
 
       self.fail('Unable to find our controller connection with %s (%s). Connections found were...\n\n%s\n\nCommand output was...\n\n%s' % (resolver, resolver_command, '\n'.join(map(str, connections)), resolver_output))
 
+  @test.require.only_run_once
+  @test.require.online
+  def test_download(self):
+    response = stem.util.connection.download('https://collector.torproject.org/index/index.json')
+    self.assertTrue(b'"path":"https://collector.torproject.org"' in response)
+
+  @test.require.only_run_once
+  @test.require.online
+  def test_download_failure(self):
+    try:
+      stem.util.connection.download('https://no.such.testing.url')
+      self.fail('expected a stem.util.connection.DownloadFailed to be raised')
+    except stem.util.connection.DownloadFailed as exc:
+      self.assertEqual('Failed to download from https://no.such.testing.url (URLError): Name or service not known', str(exc))
+      self.assertEqual('https://no.such.testing.url', exc.url)
+      self.assertEqual('Name or service not known', exc.error.reason.strerror)
+      self.assertEqual(urllib.URLError, type(exc.error))
+
   def test_connections_by_proc(self):
     self.check_resolver(Resolver.PROC)
 
diff --git a/test/unit/util/connection.py b/test/unit/util/connection.py
index a2162029..57718446 100644
--- a/test/unit/util/connection.py
+++ b/test/unit/util/connection.py
@@ -2,6 +2,7 @@
 Unit tests for the stem.util.connection functions.
 """
 
+import io
 import platform
 import unittest
 
@@ -10,11 +11,20 @@ import stem.util.connection
 from stem.util.connection import Resolver, Connection
 
 try:
+  # account for urllib's change between python 2.x and 3.x
+  import urllib.request as urllib
+except ImportError:
+  import urllib2 as urllib
+
+try:
   # added in python 3.3
   from unittest.mock import Mock, patch
 except ImportError:
   from mock import Mock, patch
 
+URL_OPEN = 'urllib.request.urlopen' if stem.prereq.is_python_3() else 'urllib2.urlopen'
+URL = 'https://example.unit.test.url'
+
 NETSTAT_OUTPUT = """\
 Active Internet connections (w/o servers)
 Proto Recv-Q Send-Q Local Address           Foreign Address         State       PID/Program name
@@ -166,6 +176,39 @@ _tor     tor        15843   20* internet stream tcp 0x0 192.168.1.100:36174 -->
 
 
 class TestConnection(unittest.TestCase):
+  @patch(URL_OPEN)
+  def test_download(self, urlopen_mock):
+    urlopen_mock.return_value = io.BytesIO(b'hello')
+
+    self.assertEqual(b'hello', stem.util.connection.download(URL))
+    urlopen_mock.assert_called_with(URL, timeout = None)
+
+  @patch(URL_OPEN)
+  def test_download_failure(self, urlopen_mock):
+    urlopen_mock.side_effect = urllib.URLError('boom')
+
+    try:
+      stem.util.connection.download(URL)
+      self.fail('expected a stem.util.connection.DownloadFailed to be raised')
+    except stem.util.connection.DownloadFailed as exc:
+      self.assertEqual('Failed to download from https://example.unit.test.url (URLError): boom', str(exc))
+      self.assertEqual(URL, exc.url)
+      self.assertEqual('boom', exc.error.reason)
+      self.assertEqual(urllib.URLError, type(exc.error))
+      self.assertTrue('return urllib.urlopen(url, timeout = timeout).read()' in exc.stacktrace_str)
+
+  @patch(URL_OPEN)
+  def test_download_retries(self, urlopen_mock):
+    urlopen_mock.side_effect = urllib.URLError('boom')
+
+    self.assertRaisesRegexp(IOError, 'boom', stem.util.connection.download, URL)
+    self.assertEqual(1, urlopen_mock.call_count)
+
+    urlopen_mock.reset_mock()
+
+    self.assertRaisesRegexp(IOError, 'boom', stem.util.connection.download, URL, retries = 4)
+    self.assertEqual(5, urlopen_mock.call_count)
+
   @patch('os.access')
   @patch('stem.util.system.is_available')
   @patch('stem.util.proc.is_available')
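
A natural companion to these tests, sketched here rather than part of this patch
(it assumes 'import socket' alongside the other test imports), would check that
socket.timeout surfaces as a DownloadTimeout:

  @patch(URL_OPEN)
  def test_download_timeout(self, urlopen_mock):
    # socket.timeout from urlopen should surface as a DownloadTimeout
    urlopen_mock.side_effect = socket.timeout('timed out')

    try:
      stem.util.connection.download(URL, timeout = 2)
      self.fail('expected a stem.util.connection.DownloadTimeout to be raised')
    except stem.util.connection.DownloadTimeout as exc:
      self.assertEqual('Failed to download from %s: 2.0 second timeout reached' % URL, str(exc))
      self.assertEqual(URL, exc.url)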




