commit ce8474dcf61cdb3800108e3820dadcef545220ee
Author: Damian Johnson <atagar@torproject.org>
Date:   Mon Dec 25 13:10:14 2017 -0800
Stub initial CollecTor module
Presently doesn't do much. Just starting with url resolution and fetching
the index.
---
 stem/descriptor/__init__.py        |   1 +
 stem/descriptor/collector.py       | 145 +++++++++++++++++++++++++++++++++++++
 test/integ/descriptor/__init__.py  |   1 +
 test/integ/descriptor/collector.py |  20 +++
 test/settings.cfg                  |   2 +
 test/unit/descriptor/__init__.py   |   1 +
 test/unit/descriptor/collector.py  |  16 ++++
 7 files changed, 186 insertions(+)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index ef6530ed..4d13ec60 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -113,6 +113,7 @@ except ImportError:
 __all__ = [
   'bandwidth_file',
   'certificate',
+  'collector',
   'export',
   'extrainfo_descriptor',
   'hidden_service_descriptor',
diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py
new file mode 100644
index 00000000..ca0e6921
--- /dev/null
+++ b/stem/descriptor/collector.py
@@ -0,0 +1,145 @@
+# Copyright 2017, Damian Johnson and The Tor Project
+# See LICENSE for licensing information
+
+"""
+Module for downloading from Tor's descriptor archive, CollecTor...
+
+  https://collector.torproject.org/
+
+This stores descriptors going back in time. If you need to know what the
+network topology looked like at a past point in time, this is the place to go.
+
+With this you can either download and read directly from CollecTor...
+
+::
+
+  import datetime
+  import stem.descriptor.collector
+
+  collector = stem.descriptor.collector.CollecTor()
+  yesterday = datetime.date.today() - datetime.timedelta(1)
+
+  # provide yesterday's exits
+
+  for desc in collector.get_server_descriptors(start = yesterday):
+    if desc.exit_policy.is_exiting_allowed():
+      print ' %s (%s)' % (desc.nickname, desc.fingerprint)
+
+... or download the descriptors to disk and read them later.
+
+::
+
+  import datetime
+  import stem.descriptor.collector
+  import stem.descriptor.reader
+
+  collector = stem.descriptor.collector.CollecTor()
+  yesterday = datetime.date.today() - datetime.timedelta(1)
+
+  collector.download_server_descriptors(
+    destination = '~/descriptor_cache',
+    start = yesterday,
+  ).join()
+
+  reader = stem.descriptor.reader.DescriptorReader('~/descriptor_cache')
+
+  for desc in reader:
+    if desc.exit_policy.is_exiting_allowed():
+      print ' %s (%s)' % (desc.nickname, desc.fingerprint)
+
+.. versionadded:: 1.7.0
+"""
+
+import json
+import time
+
+try:
+  # account for urllib's change between python 2.x and 3.x
+  import urllib.request as urllib
+except ImportError:
+  import urllib2 as urllib
+
+import stem.util.enum
+
+Compression = stem.util.enum.Enum('NONE', 'BZ2', 'GZ', 'XZ')
+
+COLLECTOR_URL = 'https://collector.torproject.org/'
+REFRESH_INDEX_RATE = 3600  # get new index if cached copy is an hour old
+
+COMPRESSION_SUFFIX = {
+  Compression.NONE: '',
+  Compression.BZ2: '.bz2',
+  Compression.GZ: '.gz',
+  Compression.XZ: '.xz',
+}
+
+
+def url(resource, compression = Compression.NONE):
+  """
+  Provides CollecTor url for the given resource.
+
+  :param str resource: resource type of the url
+  :param descriptor.collector.Compression compression: compression type to
+    download from
+
+  :returns: **str** with the CollecTor url
+  """
+
+  if compression not in COMPRESSION_SUFFIX:
+    raise ValueError("'%s' isn't a compression enumeration" % compression)
+
+  # TODO: Not yet sure how to most elegantly map resources to urls. No doubt
+  # this'll change as we add more types.
+
+  if resource == 'index':
+    path = ('index', 'index.json')
+  else:
+    raise ValueError("'%s' isn't a recognized resource type" % resource)
+
+  return ''.join((COLLECTOR_URL, '/'.join(path), COMPRESSION_SUFFIX[compression]))
+
+
+class CollecTor(object):
+  """
+  Downloader for descriptors from CollecTor. The contents of CollecTor are
+  provided in `an index <https://collector.torproject.org/index/index.json>`_
+  that's fetched as required.
+
+  :var descriptor.collector.Compression compression: compression type to
+    download from
+  :var int retries: number of times to attempt the request if downloading it
+    fails
+  :var float timeout: duration before we'll time out our request
+  """
+
+  def __init__(self, compression = Compression.XZ, retries = 2, timeout = None):
+    self.compression = compression
+    self.retries = retries
+    self.timeout = timeout
+
+    self._cached_index = None
+    self._cached_index_at = 0
+
+  def index(self):
+    """
+    Provides the archives available in CollecTor.
+
+    :returns: **dict** with the archive contents
+
+    :raises:
+      If unable to retrieve the index this provide...
+
+      * **ValueError** if the index is malformed
+      * **socket.timeout** if our request timed out
+      * **urllib2.URLError** for most request failures
+    """
+
+    if not self._cached_index or time.time() - self._cached_index_at >= REFRESH_INDEX_RATE:
+      response = urllib.urlopen(url('index', self.compression), timeout = self.timeout).read()
+
+      # TODO: add compression and retry support
+
+      self._cached_index = json.loads(response)
+      self._cached_index_at = time.time()
+
+    return self._cached_index
diff --git a/test/integ/descriptor/__init__.py b/test/integ/descriptor/__init__.py
index 331316a2..2ed1feef 100644
--- a/test/integ/descriptor/__init__.py
+++ b/test/integ/descriptor/__init__.py
@@ -3,6 +3,7 @@ Integration tests for stem.descriptor.* contents.
 """
 
 __all__ = [
+  'collector',
   'extrainfo_descriptor',
   'microdescriptor',
   'networkstatus',
diff --git a/test/integ/descriptor/collector.py b/test/integ/descriptor/collector.py
new file mode 100644
index 00000000..25f5d503
--- /dev/null
+++ b/test/integ/descriptor/collector.py
@@ -0,0 +1,20 @@
+"""
+Integration tests for stem.descriptor.collector.
+"""
+
+import unittest
+
+import test.require
+
+from stem.descriptor.collector import CollecTor, Compression
+
+
+class TestCollector(unittest.TestCase):
+  @test.require.only_run_once
+  @test.require.online
+  def test_index(self):
+    collector = CollecTor(compression = Compression.NONE)
+    index = collector.index()
+
+    self.assertEqual('https://collector.torproject.org', index['path'])
+    self.assertEqual(['archive', 'recent'], [entry['path'] for entry in index['directories']])
diff --git a/test/settings.cfg b/test/settings.cfg
index d422ffa8..6f71a329 100644
--- a/test/settings.cfg
+++ b/test/settings.cfg
@@ -239,6 +239,7 @@ test.unit_tests
 |test.unit.util.tor_tools.TestTorTools
 |test.unit.util.__init__.TestBaseUtil
 |test.unit.installation.TestInstallation
+|test.unit.descriptor.collector.TestCollector
 |test.unit.descriptor.descriptor.TestDescriptor
 |test.unit.descriptor.export.TestExport
 |test.unit.descriptor.reader.TestDescriptorReader
@@ -309,6 +310,7 @@ test.integ_tests
 |test.integ.connection.connect.TestConnect
 |test.integ.control.base_controller.TestBaseController
 |test.integ.control.controller.TestController
+|test.integ.descriptor.collector.TestCollector
 |test.integ.descriptor.remote.TestDescriptorDownloader
 |test.integ.descriptor.server_descriptor.TestServerDescriptor
 |test.integ.descriptor.extrainfo_descriptor.TestExtraInfoDescriptor
diff --git a/test/unit/descriptor/__init__.py b/test/unit/descriptor/__init__.py
index a2c03f1d..c5cf01e1 100644
--- a/test/unit/descriptor/__init__.py
+++ b/test/unit/descriptor/__init__.py
@@ -6,6 +6,7 @@ import os
 
 __all__ = [
   'bandwidth_file',
+  'collector',
   'export',
   'extrainfo_descriptor',
   'microdescriptor',
diff --git a/test/unit/descriptor/collector.py b/test/unit/descriptor/collector.py
new file mode 100644
index 00000000..5dc12164
--- /dev/null
+++ b/test/unit/descriptor/collector.py
@@ -0,0 +1,16 @@
+"""
+Unit tests for stem.descriptor.collector.
+"""
+
+import unittest
+
+from stem.descriptor.collector import Compression, url
+
+
+class TestCollector(unittest.TestCase):
+  def test_url(self):
+    self.assertEqual('https://collector.torproject.org/index/index.json', url('index'))
+    self.assertEqual('https://collector.torproject.org/index/index.json', url('index', compression = Compression.NONE))
+    self.assertEqual('https://collector.torproject.org/index/index.json.gz', url('index', compression = Compression.GZ))
+    self.assertEqual('https://collector.torproject.org/index/index.json.bz2', url('index', compression = Compression.BZ2))
+    self.assertEqual('https://collector.torproject.org/index/index.json.xz', url('index', compression = Compression.XZ))
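
Editor's note: for anyone wanting to poke at this stub, here is a minimal usage
sketch (not part of the commit). It assumes the module is importable as
stem.descriptor.collector; the expected values in the comments mirror what the
tests above check for::

  from stem.descriptor.collector import CollecTor, Compression, url

  # url() only maps the 'index' resource so far
  print(url('index'))                                # https://collector.torproject.org/index/index.json
  print(url('index', compression = Compression.XZ)) # https://collector.torproject.org/index/index.json.xz

  # index() doesn't decompress responses yet (see its TODO), so request the
  # plaintext index for now
  collector = CollecTor(compression = Compression.NONE, timeout = 30)
  index = collector.index()

  print(index['path'])                                       # https://collector.torproject.org
  print([entry['path'] for entry in index['directories']])   # ['archive', 'recent']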
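The TODO in CollecTor.index() leaves compression and retry handling for later.
One possible shape for such a helper, sketched purely as an illustration using
Python 3 stdlib decompressors (none of this is in the commit, and the python 2
urllib2 fallback would need its own handling)::

  import bz2
  import gzip
  import lzma  # python 3.3+ only
  import time
  import urllib.request

  DECOMPRESSORS = {
    '': lambda content: content,
    '.bz2': bz2.decompress,
    '.gz': gzip.decompress,
    '.xz': lzma.decompress,
  }

  def download(request_url, suffix, retries = 2, timeout = None):
    # fetch the url, decompress based on its suffix, and retry on failure
    try:
      response = urllib.request.urlopen(request_url, timeout = timeout).read()
      return DECOMPRESSORS[suffix](response)
    except Exception:
      if retries <= 0:
        raise

      time.sleep(1)  # brief pause before the next attempt
      return download(request_url, suffix, retries - 1, timeout)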