commit 756aba1eb6c73c79bc7c09b151c93c3c3a3c3c80 Author: Damian Johnson atagar@torproject.org Date: Mon Jul 22 15:42:32 2019 -0700
Filter files by descriptor type or time
The methods I'll be offering to users will include filtering by time range or descriptor type, so this adds both as optional arguments to our files() method. --- stem/descriptor/collector.py | 38 +++++++++++++++++++++++++++++--------- test/unit/descriptor/collector.py | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 10 deletions(-)
diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py index c1458784..426e48ae 100644 --- a/stem/descriptor/collector.py +++ b/stem/descriptor/collector.py @@ -74,6 +74,9 @@ REFRESH_INDEX_RATE = 3600 # get new index if cached copy is an hour old YEAR_DATE = re.compile('-(\d{4})-(\d{2})\.') SEC_DATE = re.compile('(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})')
+# distant future date so we can sort files without a timestamp at the end + +FUTURE = datetime.datetime(9999, 1, 1)
# mapping of path prefixes to their descriptor type (sampled 7/11/19)
@@ -272,7 +275,7 @@ class CollecTor(object): self.timeout = timeout
self._cached_index = None - self._cached_files = None # {path => file} mappings in the index + self._cached_files = None self._cached_index_at = 0
if compression == 'best': @@ -306,11 +309,15 @@ class CollecTor(object):
return self._cached_index
- def files(self): + def files(self, descriptor_type = None, start = None, end = None): """ - Provides files CollecTor presently has. + Provides files CollecTor presently has, sorted oldest to newest. + + :param str descriptor_type: descriptor type or prefix to retrieve + :param datetime.datetime start: time range to begin with + :param datetime.datetime end: time range to end with
- :returns: **hash** mapping paths to :class:`~stem.descriptor.collector.File` + :returns: **list** of :class:`~stem.descriptor.collector.File`
:raises: If unable to retrieve the index this provide... @@ -324,7 +331,20 @@ class CollecTor(object): if not self._cached_files or time.time() - self._cached_index_at >= REFRESH_INDEX_RATE: self._cached_files = CollecTor._files(self.index(), [])
- return self._cached_files + matches = [] + + for entry in self._cached_files: + if start and (entry.start is None or entry.start < start): + continue + elif end and (entry.end is None or entry.end > end): + continue + + if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in entry.guess_descriptor_types()]): + matches.append(entry) + + matches.sort(key = lambda x: x.start if x.start else FUTURE) + + return matches
@staticmethod def _files(val, path): @@ -334,19 +354,19 @@ class CollecTor(object): :param dict val: index hash :param list path: path we've transversed into
- :returns: **dict** mapping paths to files + :returns: **list** of :class:`~stem.descriptor.collector.File` """
- files = {} + files = []
if isinstance(val, dict): for k, v in val.items(): if k == 'files': for attr in v: file_path = '/'.join(path + [attr.get('path')]) - files[file_path] = File(file_path, attr.get('size'), attr.get('last_modified')) + files.append(File(file_path, attr.get('size'), attr.get('last_modified'))) elif k == 'directories': for attr in v: - files.update(CollecTor._files(attr, path + [attr.get('path')])) + files.extend(CollecTor._files(attr, path + [attr.get('path')]))
return files diff --git a/test/unit/descriptor/collector.py b/test/unit/descriptor/collector.py index 297050ea..9d442deb 100644 --- a/test/unit/descriptor/collector.py +++ b/test/unit/descriptor/collector.py @@ -131,7 +131,7 @@ class TestCollector(unittest.TestCase): self.assertEqual(85, len(files)) test_path = 'archive/relay-descriptors/extra-infos/extra-infos-2007-09.tar.xz'
- extrainfo_file = files[test_path] + extrainfo_file = filter(lambda x: x.path == test_path, files)[0] self.assertEqual(test_path, extrainfo_file.path) self.assertEqual(Compression.LZMA, extrainfo_file.compression) self.assertEqual(True, extrainfo_file.tar) @@ -172,6 +172,41 @@ class TestCollector(unittest.TestCase): f = File('recent/relay-descriptors/extra-infos/2019-07-03-23-05-00-extra-infos', 1162899, '2019-07-03 02:05') self.assertEqual(datetime.datetime(2019, 7, 4, 0, 5, 0), f.end)
+ @patch(URL_OPEN, Mock(return_value = io.BytesIO(EXAMPLE_INDEX_CONTENT))) + def test_file_query_by_type(self): + collector = CollecTor(compression = Compression.PLAINTEXT) + + expected = [ + 'archive/relay-descriptors/server-descriptors/server-descriptors-2005-12.tar.xz', + 'archive/relay-descriptors/server-descriptors/server-descriptors-2006-02.tar.xz', + 'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz', + 'recent/relay-descriptors/server-descriptors/2019-07-03-02-05-00-server-descriptors', + 'recent/relay-descriptors/server-descriptors/2019-07-03-03-05-00-server-descriptors', + 'recent/relay-descriptors/server-descriptors/2019-07-03-04-05-00-server-descriptors', + ] + + self.assertEqual(expected, map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor'))) + + @patch(URL_OPEN, Mock(return_value = io.BytesIO(EXAMPLE_INDEX_CONTENT))) + def test_file_query_by_date(self): + collector = CollecTor(compression = Compression.PLAINTEXT) + + self.assertEqual([ + 'recent/relay-descriptors/server-descriptors/2019-07-03-02-05-00-server-descriptors', + 'recent/relay-descriptors/server-descriptors/2019-07-03-03-05-00-server-descriptors', + 'recent/relay-descriptors/server-descriptors/2019-07-03-04-05-00-server-descriptors', + ], map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2007, 1, 1)))) + + self.assertEqual([ + 'archive/relay-descriptors/server-descriptors/server-descriptors-2005-12.tar.xz', + 'archive/relay-descriptors/server-descriptors/server-descriptors-2006-02.tar.xz', + 'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz', + ], map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', end = datetime.datetime(2007, 1, 1)))) + + self.assertEqual([ + 'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz', + ], map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', start = 
datetime.datetime(2006, 2, 10), end = datetime.datetime(2007, 1, 1)))) + def test_guess_descriptor_types(self): f = File('archive/bridge-descriptors/extra-infos/bridge-extra-infos-2008-05.tar.xz', 377644, '2016-09-04 09:21') self.assertEqual(('bridge-extra-info 1.3',), f.guess_descriptor_types())
tor-commits@lists.torproject.org