commit 4c744badc658d3d93d113972bfbf7cb463298ee4 Author: Damian Johnson <atagar@torproject.org> Date: Mon Jul 29 17:35:32 2019 -0700
Adjust CollecTor File class
Handful of very tiny adjustments. Dropping the unused tar attribute, making descriptor guessing a static function (like the others), fixing some minor edge cases, etc. --- stem/descriptor/collector.py | 51 ++++++------ test/unit/descriptor/collector.py | 161 ++++++++++++++++---------------------- 2 files changed, 92 insertions(+), 120 deletions(-)
diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py index d1c90e0e..f76fa225 100644 --- a/stem/descriptor/collector.py +++ b/stem/descriptor/collector.py @@ -157,7 +157,6 @@ class File(object): :var str path: file path within collector :var stem.descriptor.Compression compression: file compression, **None** if this cannot be determined - :var bool tar: **True** if a tarball, **False** otherwise :var int size: size of the file
:var datetime start: beginning of the time range descriptors are for, @@ -170,13 +169,12 @@ class File(object): def __init__(self, path, size, last_modified): self.path = path self.compression = File._guess_compression(path) - self.tar = path.endswith('.tar') or '.tar.' in path self.size = size
self.start, self.end = File._guess_time_range(path) self.last_modified = datetime.datetime.strptime(last_modified, '%Y-%m-%d %H:%M')
- self._guessed_type = None + self._guessed_type = File._guess_descriptor_types(path) self._downloaded_to = None # location we last downloaded to
def read(self, directory = None, descriptor_type = None, timeout = None, retries = 3): @@ -220,21 +218,21 @@ class File(object): """
if descriptor_type is None: - descriptor_types = self._guess_descriptor_types() - - if not descriptor_types: + if not self._guessed_type: raise ValueError("Unable to determine this file's descriptor type") - elif len(descriptor_types) > 1: - raise ValueError("Unable to determine disambiguate file's descriptor type from %s" % ', '.join(descriptor_types)) + elif len(self._guessed_type) > 1: + raise ValueError("Unable to determine disambiguate file's descriptor type from %s" % ', '.join(self._guessed_type))
- descriptor_type = descriptor_types[0] + descriptor_type = self._guessed_type[0]
if directory is None: if self._downloaded_to and os.path.exists(self._downloaded_to): directory = os.path.dirname(self._downloaded_to) else: with tempfile.TemporaryDirectory() as tmp_directory: - return self.read(tmp_directory, timeout, retries) + return self.read(tmp_directory, descriptor_type, timeout, retries) + + # TODO: the following will not work if the tar contains multiple types or a type we do not support
path = self.download(directory, True, timeout, retries) return parse_file(path, descriptor_type) @@ -267,7 +265,7 @@ class File(object):
filename = self.path.split('/')[-1]
- if decompress: + if self.compression != Compression.PLAINTEXT and decompress: filename = filename.rsplit('.', 1)[0]
path = os.path.join(directory, filename) @@ -277,36 +275,35 @@ class File(object): elif os.path.exists(path): return path # file already exists
- with open(path, 'wb') as output_file: - response = _download(COLLECTOR_URL + self.path, timeout, retries) + response = _download(COLLECTOR_URL + self.path, timeout, retries)
- if decompress: - response = self.compression.decompress(response) + if decompress: + response = self.compression.decompress(response)
+ with open(path, 'wb') as output_file: output_file.write(response)
self._downloaded_to = path return path
- def _guess_descriptor_types(self): + @staticmethod + def _guess_descriptor_types(path): """ Descriptor @type this file is expected to have based on its path. If unable to determine any this tuple is empty.
- :returns: **tuple** with the descriptor types this file is expected to have - """ + Hopefully this will be replaced with an explicit value in the future:
- if self._guessed_type is None: - guessed_type = () + https://trac.torproject.org/projects/tor/ticket/31204
- for path_prefix, types in COLLECTOR_DESC_TYPES.items(): - if self.path.startswith(path_prefix): - guessed_type = (types,) if isinstance(types, str) else types - break + :returns: **tuple** with the descriptor types this file is expected to have + """
- self._guessed_type = guessed_type + for path_prefix, types in COLLECTOR_DESC_TYPES.items(): + if path.startswith(path_prefix): + return (types,) if isinstance(types, str) else types
- return self._guessed_type + return ()
@staticmethod def _guess_compression(path): @@ -437,7 +434,7 @@ class CollecTor(object): elif end and (f.end is None or f.end > end): continue
- if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in f._guess_descriptor_types()]): + if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in f._guessed_type]): matches.append(f)
return matches diff --git a/test/unit/descriptor/collector.py b/test/unit/descriptor/collector.py index ad0087dd..77c1c460 100644 --- a/test/unit/descriptor/collector.py +++ b/test/unit/descriptor/collector.py @@ -21,68 +21,105 @@ except ImportError:
URL_OPEN = 'urllib.request.urlopen' if stem.prereq.is_python_3() else 'urllib2.urlopen'
-MINIMAL_INDEX = { - 'index_created': '2017-12-25 21:06', - 'build_revision': '56a303e', - 'path': 'https://collector.torproject.org' -} - -MINIMAL_INDEX_JSON = b'{"index_created":"2017-12-25 21:06","build_revision":"56a303e","path":"https://collector.torproject.org"}'
with open(get_resource('collector_index.json'), 'rb') as index_file: - EXAMPLE_INDEX_CONTENT = index_file.read() + EXAMPLE_INDEX_JSON = index_file.read()
class TestCollector(unittest.TestCase): + # tests for the File class + + def test_file_guess_descriptor_types(self): + test_values = { + 'archive/bridge-descriptors/extra-infos/bridge-extra-infos-2008-05.tar.xz': ('bridge-extra-info 1.3',), + 'archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz': ('network-status-microdesc-consensus-3 1.0', 'microdescriptor 1.0'), + 'archive/webstats/webstats-2015-03.tar': (), + 'archive/no_such_file.tar': (), + } + + for path, expected in test_values.items(): + self.assertEqual(expected, File._guess_descriptor_types(path)) + + def test_file_guess_compression(self): + test_values = { + 'archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz': Compression.LZMA, + 'archive/webstats/webstats-2015-03.tar': Compression.PLAINTEXT, + 'recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos': Compression.PLAINTEXT, + } + + for path, expected in test_values.items(): + self.assertEqual(expected, File._guess_compression(path)) + + def test_file_guess_time_range(self): + test_values = { + 'archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz': + (datetime.datetime(2014, 1, 1), datetime.datetime(2014, 2, 1)), + 'recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos': + (datetime.datetime(2019, 7, 3, 2, 5, 0), datetime.datetime(2019, 7, 3, 3, 5, 0)), + 'archive/relay-descriptors/certs.tar.xz': + (None, None), + 'archive/relay-descriptors/microdescs/microdescs-2014-12.tar.xz': + (datetime.datetime(2014, 12, 1), datetime.datetime(2015, 1, 1)), + 'recent/relay-descriptors/extra-infos/2019-07-03-23-05-00-extra-infos': + (datetime.datetime(2019, 7, 3, 23, 5, 0), datetime.datetime(2019, 7, 4, 0, 5, 0)) + } + + for path, (expected_start, expected_end) in test_values.items(): + f = File(path, 7515396, '2014-02-07 03:59') + self.assertEqual(expected_start, f.start) + self.assertEqual(expected_end, f.end) + + # tests for the CollecTor class + @patch(URL_OPEN) - def test_download_plaintext(self, 
urlopen_mock): - urlopen_mock.return_value = io.BytesIO(MINIMAL_INDEX_JSON) + def test_index_plaintext(self, urlopen_mock): + urlopen_mock.return_value = io.BytesIO(EXAMPLE_INDEX_JSON)
collector = CollecTor() - self.assertEqual(MINIMAL_INDEX, collector.index(Compression.PLAINTEXT)) + self.assertEqual(EXAMPLE_INDEX, collector.index(Compression.PLAINTEXT)) urlopen_mock.assert_called_with('https://collector.torproject.org/index/index.json', timeout = None)
@patch(URL_OPEN) - def test_download_gzip(self, urlopen_mock): + def test_index_gzip(self, urlopen_mock): if not Compression.GZIP.available: self.skipTest('(gzip compression unavailable)') return
import zlib - urlopen_mock.return_value = io.BytesIO(zlib.compress(MINIMAL_INDEX_JSON)) + urlopen_mock.return_value = io.BytesIO(zlib.compress(EXAMPLE_INDEX_JSON))
collector = CollecTor() - self.assertEqual(MINIMAL_INDEX, collector.index(Compression.GZIP)) + self.assertEqual(EXAMPLE_INDEX, collector.index(Compression.GZIP)) urlopen_mock.assert_called_with('https://collector.torproject.org/index/index.json.gz', timeout = None)
@patch(URL_OPEN) - def test_download_bz2(self, urlopen_mock): + def test_index_bz2(self, urlopen_mock): if not Compression.BZ2.available: self.skipTest('(bz2 compression unavailable)') return
import bz2 - urlopen_mock.return_value = io.BytesIO(bz2.compress(MINIMAL_INDEX_JSON)) + urlopen_mock.return_value = io.BytesIO(bz2.compress(EXAMPLE_INDEX_JSON))
collector = CollecTor() - self.assertEqual(MINIMAL_INDEX, collector.index(Compression.BZ2)) + self.assertEqual(EXAMPLE_INDEX, collector.index(Compression.BZ2)) urlopen_mock.assert_called_with('https://collector.torproject.org/index/index.json.bz2', timeout = None)
@patch(URL_OPEN) - def test_download_lzma(self, urlopen_mock): + def test_index_lzma(self, urlopen_mock): if not Compression.LZMA.available: self.skipTest('(lzma compression unavailable)') return
import lzma - urlopen_mock.return_value = io.BytesIO(lzma.compress(MINIMAL_INDEX_JSON)) + urlopen_mock.return_value = io.BytesIO(lzma.compress(EXAMPLE_INDEX_JSON))
collector = CollecTor() - self.assertEqual(MINIMAL_INDEX, collector.index(Compression.LZMA)) + self.assertEqual(EXAMPLE_INDEX, collector.index(Compression.LZMA)) urlopen_mock.assert_called_with('https://collector.torproject.org/index/index.json.xz', timeout = None)
@patch(URL_OPEN) - def test_download_retries(self, urlopen_mock): + def test_index_retries(self, urlopen_mock): urlopen_mock.side_effect = IOError('boom')
collector = CollecTor(retries = 0) @@ -95,11 +132,6 @@ class TestCollector(unittest.TestCase): self.assertRaisesRegexp(IOError, 'boom', collector.index) self.assertEqual(5, urlopen_mock.call_count)
- @patch(URL_OPEN, Mock(return_value = io.BytesIO(MINIMAL_INDEX_JSON))) - def test_index(self): - collector = CollecTor() - self.assertEqual(MINIMAL_INDEX, collector.index(Compression.PLAINTEXT)) - @patch(URL_OPEN, Mock(return_value = io.BytesIO(b'not json'))) def test_index_malformed_json(self): collector = CollecTor() @@ -118,104 +150,47 @@ class TestCollector(unittest.TestCase): collector = CollecTor() self.assertRaisesRegexp(IOError, 'Failed to decompress as %s' % compression, collector.index, compression)
- @patch(URL_OPEN, Mock(return_value = io.BytesIO(EXAMPLE_INDEX_CONTENT))) - def test_real_index(self): - collector = CollecTor() - self.assertEqual(EXAMPLE_INDEX, collector.index(compression = Compression.PLAINTEXT)) - @patch('stem.descriptor.collector.CollecTor.index', Mock(return_value = EXAMPLE_INDEX)) - def test_contents(self): + def test_files(self): collector = CollecTor() files = collector.files() - self.assertEqual(85, len(files)) - test_path = 'archive/relay-descriptors/extra-infos/extra-infos-2007-09.tar.xz'
- extrainfo_file = list(filter(lambda x: x.path == test_path, files))[0] - self.assertEqual(test_path, extrainfo_file.path) + extrainfo_file = list(filter(lambda x: x.path.endswith('extra-infos-2007-09.tar.xz'), files))[0] + self.assertEqual('archive/relay-descriptors/extra-infos/extra-infos-2007-09.tar.xz', extrainfo_file.path) self.assertEqual(Compression.LZMA, extrainfo_file.compression) - self.assertEqual(True, extrainfo_file.tar) self.assertEqual(6459884, extrainfo_file.size) self.assertEqual(datetime.datetime(2016, 6, 23, 9, 54), extrainfo_file.last_modified)
- def test_file_compression_attributes(self): - f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59') - self.assertEqual(Compression.LZMA, f.compression) - self.assertEqual(True, f.tar) - - f = File('archive/webstats/webstats-2015-03.tar', 20480, '2018-03-19 16:07') - self.assertEqual(Compression.PLAINTEXT, f.compression) - self.assertEqual(True, f.tar) - - f = File('recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos', 1162899, '2019-07-03 02:05') - self.assertEqual(Compression.PLAINTEXT, f.compression) - self.assertEqual(False, f.tar) - - def test_file_date_attributes(self): - f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59') - self.assertEqual(datetime.datetime(2014, 1, 1), f.start) - self.assertEqual(datetime.datetime(2014, 2, 1), f.end) - - f = File('recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos', 1162899, '2019-07-03 02:05') - self.assertEqual(datetime.datetime(2019, 7, 3, 2, 5, 0), f.start) - self.assertEqual(datetime.datetime(2019, 7, 3, 3, 5, 0), f.end) - - f = File('archive/relay-descriptors/certs.tar.xz', 144696, '2019-07-03 03:29') - self.assertEqual(None, f.start) - self.assertEqual(None, f.end) - - # check date boundaries - - f = File('archive/relay-descriptors/microdescs/microdescs-2014-12.tar.xz', 7515396, '2014-02-07 03:59') - self.assertEqual(datetime.datetime(2015, 1, 1), f.end) - - f = File('recent/relay-descriptors/extra-infos/2019-07-03-23-05-00-extra-infos', 1162899, '2019-07-03 02:05') - self.assertEqual(datetime.datetime(2019, 7, 4, 0, 5, 0), f.end) - @patch('stem.descriptor.collector.CollecTor.index', Mock(return_value = EXAMPLE_INDEX)) - def test_file_query_by_type(self): + def test_files_by_descriptor_type(self): collector = CollecTor()
- expected = [ + self.assertEqual([ 'archive/relay-descriptors/server-descriptors/server-descriptors-2005-12.tar.xz', 'archive/relay-descriptors/server-descriptors/server-descriptors-2006-02.tar.xz', 'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz', 'recent/relay-descriptors/server-descriptors/2019-07-03-02-05-00-server-descriptors', 'recent/relay-descriptors/server-descriptors/2019-07-03-03-05-00-server-descriptors', 'recent/relay-descriptors/server-descriptors/2019-07-03-04-05-00-server-descriptors', - ] - - self.assertEqual(expected, list(map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor')))) + ], [f.path for f in collector.files(descriptor_type = 'server-descriptor')])
@patch('stem.descriptor.collector.CollecTor.index', Mock(return_value = EXAMPLE_INDEX)) - def test_file_query_by_date(self): + def test_file_by_date(self): collector = CollecTor()
self.assertEqual([ 'recent/relay-descriptors/server-descriptors/2019-07-03-02-05-00-server-descriptors', 'recent/relay-descriptors/server-descriptors/2019-07-03-03-05-00-server-descriptors', 'recent/relay-descriptors/server-descriptors/2019-07-03-04-05-00-server-descriptors', - ], list(map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2007, 1, 1))))) + ], [f.path for f in collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2007, 1, 1))])
self.assertEqual([ 'archive/relay-descriptors/server-descriptors/server-descriptors-2005-12.tar.xz', 'archive/relay-descriptors/server-descriptors/server-descriptors-2006-02.tar.xz', 'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz', - ], list(map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', end = datetime.datetime(2007, 1, 1))))) + ], [f.path for f in collector.files(descriptor_type = 'server-descriptor', end = datetime.datetime(2007, 1, 1))])
self.assertEqual([ 'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz', - ], list(map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2006, 2, 10), end = datetime.datetime(2007, 1, 1))))) - - def test_guess_descriptor_types(self): - f = File('archive/bridge-descriptors/extra-infos/bridge-extra-infos-2008-05.tar.xz', 377644, '2016-09-04 09:21') - self.assertEqual(('bridge-extra-info 1.3',), f._guess_descriptor_types()) - - f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59') - self.assertEqual(('network-status-microdesc-consensus-3 1.0', 'microdescriptor 1.0'), f._guess_descriptor_types()) - - f = File('archive/webstats/webstats-2015-03.tar', 20480, '2018-03-19 16:07') - self.assertEqual((), f._guess_descriptor_types()) - - f = File('archive/no_such_file.tar', 20480, '2018-03-19 16:07') - self.assertEqual((), f._guess_descriptor_types()) + ], [f.path for f in collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2006, 2, 10), end = datetime.datetime(2007, 1, 1))])
tor-commits@lists.torproject.org