
commit 07bf976cbf57824e0425ee33bfd9763934915bbb Author: Damian Johnson <atagar@torproject.org> Date: Wed Jul 24 18:12:29 2019 -0700 Initial descriptor reading function Still quite a few rough edges, but works. Exercised with... import stem.descriptor.collector collector = stem.descriptor.collector.CollecTor() f = list(filter(lambda x: 'server-descriptor 1.0' in x._guess_descriptor_types(), collector.files()))[0] for desc in f.read('/home/atagar/Desktop/foo'): print(desc) We're splitting the download and read methods so descriptor archives can optionally cache locally. --- stem/descriptor/collector.py | 123 ++++++++++++++++++++++++++++++++++++-- test/unit/descriptor/collector.py | 8 +-- 2 files changed, 122 insertions(+), 9 deletions(-) diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py index 754a96b9..1f7ddb96 100644 --- a/stem/descriptor/collector.py +++ b/stem/descriptor/collector.py @@ -51,11 +51,15 @@ With this you can either download and read directly from CollecTor... import datetime import json +import os import re import sys +import tempfile import time -from stem.descriptor import Compression +import stem.util.str_tools + +from stem.descriptor import Compression, parse_file from stem.util import log try: @@ -121,7 +125,6 @@ def _download(url, timeout, retries): :returns: content of the given url :raises: - * **IOError** if unable to decompress * **socket.timeout** if our request timed out * **urllib2.URLError** for most request failures @@ -174,8 +177,118 @@ class File(object): self.last_modified = datetime.datetime.strptime(last_modified, '%Y-%m-%d %H:%M') self._guessed_type = None + self._downloaded_to = None # location we last downloaded to + + def read(self, directory = None, descriptor_type = None, timeout = None, retries = 3): + """ + Provides descriptors from this archive. Descriptors are downloaded or read + from disk as follows... 
+ + * If this file has already been downloaded through + :func:`~stem.descriptor.collector.CollecTor.download` these descriptors + are read from disk. + + * If a **directory** argument is provided and the file is already present + these descriptors are read from disk. + + * If a **directory** argument is provided and the file is not present the + file is downloaded to this location then read. + + * If the file has not been downloaded and no **directory** argument + is provided then the file is downloaded to a temporary directory that's + deleted after it is read. + + :param str directory: destination to download into + :param str descriptor_type: `descriptor type + <https://metrics.torproject.org/collector.html#data-formats>`_, this is + guessed if not provided + :param int timeout: timeout when connection becomes idle, no timeout + applied if **None** + :param int retries: maximum attempts to impose + + :returns: iterator for :class:`~stem.descriptor.__init__.Descriptor` + instances in the file + + :raises: + * **ValueError** if unable to determine the descriptor type + * **TypeError** if we cannot parse this descriptor type + * **socket.timeout** if our request timed out + * **urllib2.URLError** for most request failures + + Note that the urllib2 module may fail with other exception types, in + which case we'll pass it along. 
+ """ + + if descriptor_type is None: + descriptor_types = self._guess_descriptor_types() + + if not descriptor_types: + raise ValueError("Unable to determine this file's descriptor type") + elif len(descriptor_types) > 1: + raise ValueError("Unable to determine disambiguate file's descriptor type from %s" % ', '.join(descriptor_types)) + + descriptor_type = descriptor_types[0] + + if directory is None: + if self._downloaded_to and os.path.exists(self._downloaded_to): + directory = os.path.dirname(self._downloaded_to) + else: + with tempfile.TemporaryDirectory() as tmp_directory: + return self.read(tmp_directory, timeout, retries) + + path = self.download(directory, True, timeout, retries) + return parse_file(path, descriptor_type) + + def download(self, directory, decompress = True, timeout = None, retries = 3): + """ + Downloads this file to the given location. If a file already exists this is + a no-op. + + :param str directory: destination to download into + :param bool decompress: decompress written file + :param int timeout: timeout when connection becomes idle, no timeout + applied if **None** + :param int retires: maximum attempts to impose + + :returns: **str** with the path we downloaded to + + :raises: + * **socket.timeout** if our request timed out + * **urllib2.URLError** for most request failures + + Note that the urllib2 module may fail with other exception types, in + which case we'll pass it along. + """ + + # TODO: If checksums get added to the index we should replace + # the path check below to verify that... 
+ # + # https://trac.torproject.org/projects/tor/ticket/31204 + + filename = self.path.split('/')[-1] + + if decompress: + filename = filename.rsplit('.', 1)[0] + + path = os.path.join(directory, filename) + + if not os.path.exists(directory): + os.makedirs(directory) + elif os.path.exists(path): + return path # file already exists + + with open(path, 'wb') as output_file: + response = _download(COLLECTOR_URL + self.path, timeout, retries) + + if decompress: + response = self.compression.decompress(response) + + output_file.write(response) + + self._downloaded_to = path + return path - def guess_descriptor_types(self): + def _guess_descriptor_types(self): """ Descriptor @type this file is expected to have based on its path. If unable to determine any this tuple is empty. @@ -290,7 +403,7 @@ class CollecTor(object): url = COLLECTOR_URL + 'index/index.json' + extension response = compression.decompress(_download(url, self.timeout, self.retries)) - self._cached_index = json.loads(response) + self._cached_index = json.loads(stem.util.str_tools._to_unicode(response)) self._cached_index_at = time.time() return self._cached_index @@ -325,7 +438,7 @@ class CollecTor(object): elif end and (entry.end is None or entry.end > end): continue - if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in entry.guess_descriptor_types()]): + if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in entry._guess_descriptor_types()]): matches.append(entry) return matches diff --git a/test/unit/descriptor/collector.py b/test/unit/descriptor/collector.py index 86641f32..914b52b9 100644 --- a/test/unit/descriptor/collector.py +++ b/test/unit/descriptor/collector.py @@ -209,13 +209,13 @@ class TestCollector(unittest.TestCase): def test_guess_descriptor_types(self): f = File('archive/bridge-descriptors/extra-infos/bridge-extra-infos-2008-05.tar.xz', 377644, '2016-09-04 09:21') - self.assertEqual(('bridge-extra-info 1.3',), 
f.guess_descriptor_types()) + self.assertEqual(('bridge-extra-info 1.3',), f._guess_descriptor_types()) f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59') - self.assertEqual(('network-status-microdesc-consensus-3 1.0', 'microdescriptor 1.0'), f.guess_descriptor_types()) + self.assertEqual(('network-status-microdesc-consensus-3 1.0', 'microdescriptor 1.0'), f._guess_descriptor_types()) f = File('archive/webstats/webstats-2015-03.tar', 20480, '2018-03-19 16:07') - self.assertEqual((), f.guess_descriptor_types()) + self.assertEqual((), f._guess_descriptor_types()) f = File('archive/no_such_file.tar', 20480, '2018-03-19 16:07') - self.assertEqual((), f.guess_descriptor_types()) + self.assertEqual((), f._guess_descriptor_types())