[tor-commits] [stem/master] Initial download function

atagar at torproject.org
Sat Aug 17 20:44:27 UTC 2019


commit 4196e37f0f43d033b23f740c1e3e3d9612ba4655
Author: Damian Johnson <atagar at torproject.org>
Date:   Mon Jul 29 19:49:37 2019 -0700

    Initial download function
    
    A few rough edges remain, but a copy-paste of the initial pydoc demo now works.
---
 stem/descriptor/collector.py       | 69 +++++++++++++++++++++++++++++++++++---
 stem/descriptor/remote.py          |  6 +---
 test/integ/descriptor/collector.py |  4 +--
 3 files changed, 67 insertions(+), 12 deletions(-)

diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py
index f76fa225..a78d60c4 100644
--- a/stem/descriptor/collector.py
+++ b/stem/descriptor/collector.py
@@ -16,7 +16,7 @@ With this you can either download and read directly from CollecTor...
   import datetime
   import stem.descriptor.collector
 
-  yesterday = datetime.date.today() - datetime.timedelta(1)
+  yesterday = datetime.datetime.today() - datetime.timedelta(1)
 
   # provide yesterday's exits
 
@@ -33,7 +33,7 @@ With this you can either download and read directly from CollecTor...
   import stem.descriptor
   import stem.descriptor.collector
 
-  yesterday = datetime.date.today() - datetime.timedelta(1)
+  yesterday = datetime.datetime.today() - datetime.timedelta(1)
   path = os.path.expanduser('~/descriptor_cache/server_desc_today')
 
   with open(path, 'wb') as cache_file:
@@ -53,6 +53,7 @@ import datetime
 import json
 import os
 import re
+import shutil
 import sys
 import tempfile
 import time
@@ -70,6 +71,7 @@ except ImportError:
 
 COLLECTOR_URL = 'https://collector.torproject.org/'
 REFRESH_INDEX_RATE = 3600  # get new index if cached copy is an hour old
+SINGLETON_COLLECTOR = None
 
 YEAR_DATE = re.compile('-(\\d{4})-(\\d{2})\\.')
 SEC_DATE = re.compile('(\\d{4}-\\d{2}-\\d{2}-\\d{2}-\\d{2}-\\d{2})')
@@ -113,6 +115,52 @@ COLLECTOR_DESC_TYPES = {
 }
 
 
+def get_instance():
+  """
+  Provides the singleton :class:`~stem.descriptor.collector.CollecTor`
+  used for this module's shorthand functions.
+
+  :returns: singleton :class:`~stem.descriptor.collector.CollecTor` instance
+  """
+
+  global SINGLETON_COLLECTOR
+
+  if SINGLETON_COLLECTOR is None:
+    SINGLETON_COLLECTOR = CollecTor()
+
+  return SINGLETON_COLLECTOR
+
+
+def get_server_descriptors(start = None, end = None, cache_to = None, timeout = None, retries = 3):
+  """
+  Provides server descriptors for the given time range, sorted oldest to
+  newest.
+
+  :param datetime.datetime start: time range to begin with
+  :param datetime.datetime end: time range to end with
+  :param str cache_to: directory to cache archives into; if an archive is
+    already available there it is not downloaded
+  :param int timeout: timeout for downloading each individual archive when the
+    connection becomes idle; no timeout applied if **None**
+  :param int retries: maximum attempts to impose on a per-archive basis
+
+  :returns: **iterator** of
+    :class:`~stem.descriptor.server_descriptor.ServerDescriptor` for the given
+    time range
+
+  :raises:
+    * **socket.timeout** if our request timed out
+    * **urllib2.URLError** for most request failures
+
+    Note that the urllib2 module may fail with other exception types, in
+    which case we'll pass it along.
+  """
+
+  for f in get_instance().files('server-descriptor', start, end):
+    for desc in f.read(cache_to, timeout = timeout, retries = retries):
+      yield desc
+
+
 def _download(url, timeout, retries):
   """
   Download from the given url.
@@ -229,13 +277,24 @@ class File(object):
       if self._downloaded_to and os.path.exists(self._downloaded_to):
         directory = os.path.dirname(self._downloaded_to)
       else:
-        with tempfile.TemporaryDirectory() as tmp_directory:
-          return self.read(tmp_directory, descriptor_type, timeout, retries)
+        # TODO: The following can be replaced with simpler usage of
+        # tempfile.TemporaryDirectory when we drop python 2.x support.
+
+        tmp_directory = tempfile.mkdtemp()
+
+        for desc in self.read(tmp_directory, descriptor_type, timeout, retries):
+          yield desc
+
+        shutil.rmtree(tmp_directory)
+
+        return
 
     # TODO: the following will not work if the tar contains multiple types or a type we do not support
 
     path = self.download(directory, True, timeout, retries)
-    return parse_file(path, descriptor_type)
+
+    for desc in parse_file(path, descriptor_type):
+      yield desc
 
   def download(self, directory, decompress = True, timeout = None, retries = 3):
     """
diff --git a/stem/descriptor/remote.py b/stem/descriptor/remote.py
index 251fc26f..af24f624 100644
--- a/stem/descriptor/remote.py
+++ b/stem/descriptor/remote.py
@@ -164,11 +164,7 @@ DIR_PORT_BLACKLIST = ('tor26', 'Serge')
 def get_instance():
   """
   Provides the singleton :class:`~stem.descriptor.remote.DescriptorDownloader`
-  used for the following functions...
-
-    * :func:`stem.descriptor.remote.get_server_descriptors`
-    * :func:`stem.descriptor.remote.get_extrainfo_descriptors`
-    * :func:`stem.descriptor.remote.get_consensus`
+  used for this module's shorthand functions.
 
   .. versionadded:: 1.5.0
 
diff --git a/test/integ/descriptor/collector.py b/test/integ/descriptor/collector.py
index dbb09d5a..6a0ec5ac 100644
--- a/test/integ/descriptor/collector.py
+++ b/test/integ/descriptor/collector.py
@@ -35,8 +35,8 @@ class TestCollector(unittest.TestCase):
     if compression and not compression.available:
       self.skipTest('(%s unavailable)' % compression)
 
-    collector = CollecTor(compression = compression)
-    index = collector.index()
+    collector = CollecTor()
+    index = collector.index(compression = compression)
 
     self.assertEqual('https://collector.torproject.org', index['path'])
     self.assertEqual(['archive', 'contrib', 'recent'], [entry['path'] for entry in index['directories']])
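
A quick sketch of the index call this test now exercises. The Compression import path is an assumption inferred from the test's skip logic (compression objects exposing an available attribute), so treat the constants as illustrative:

  from stem.descriptor import Compression  # assumed location of the compression constants
  from stem.descriptor.collector import CollecTor

  collector = CollecTor()

  # ask for the index with a specific codec, falling back to plaintext if it
  # is unavailable on this system

  compression = Compression.LZMA if Compression.LZMA.available else Compression.PLAINTEXT
  index = collector.index(compression = compression)

  print(index['path'])                                      # https://collector.torproject.org
  print([entry['path'] for entry in index['directories']])  # ['archive', 'contrib', 'recent']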

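And the new module-level shorthand in use, pieced together from the get_server_descriptors() docstring above. The cache directory is illustrative, and it is created up front since the docstring does not say whether cache_to creates it:

  import datetime
  import os

  import stem.descriptor.collector

  yesterday = datetime.datetime.today() - datetime.timedelta(1)
  cache_dir = os.path.expanduser('~/descriptor_cache')

  if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

  # download yesterday's server descriptors, keeping the downloaded archives
  # so a re-run does not fetch them again

  for desc in stem.descriptor.collector.get_server_descriptors(start = yesterday, cache_to = cache_dir):
    print(desc.fingerprint)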