[tor-commits] [stem/master] Filter files by descriptor type or time

atagar at torproject.org atagar at torproject.org
Sat Aug 17 20:44:27 UTC 2019


commit 756aba1eb6c73c79bc7c09b151c93c3c3a3c3c80
Author: Damian Johnson <atagar at torproject.org>
Date:   Mon Jul 22 15:42:32 2019 -0700

    Filter files by descriptor type or time
    
    Methods I'll be offering to users include filtering by time range or descriptor
    type, so I'm including these as arguments in our files() method.
---
 stem/descriptor/collector.py      | 38 +++++++++++++++++++++++++++++---------
 test/unit/descriptor/collector.py | 37 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py
index c1458784..426e48ae 100644
--- a/stem/descriptor/collector.py
+++ b/stem/descriptor/collector.py
@@ -74,6 +74,9 @@ REFRESH_INDEX_RATE = 3600  # get new index if cached copy is an hour old
 YEAR_DATE = re.compile('-(\\d{4})-(\\d{2})\\.')
 SEC_DATE = re.compile('(\\d{4}-\\d{2}-\\d{2}-\\d{2}-\\d{2}-\\d{2})')
 
+# distant future date so we can sort files without a timestamp at the end
+
+FUTURE = datetime.datetime(9999, 1, 1)
 
 # mapping of path prefixes to their descriptor type (sampled 7/11/19)
 
@@ -272,7 +275,7 @@ class CollecTor(object):
     self.timeout = timeout
 
     self._cached_index = None
-    self._cached_files = None  # {path => file} mappings in the index
+    self._cached_files = None
     self._cached_index_at = 0
 
     if compression == 'best':
@@ -306,11 +309,15 @@ class CollecTor(object):
 
     return self._cached_index
 
-  def files(self):
+  def files(self, descriptor_type = None, start = None, end = None):
     """
-    Provides files CollecTor presently has.
+    Provides files CollecTor presently has, sorted oldest to newest.
+
+    :param str descriptor_type: descriptor type or prefix to retrieve
+    :param datetime.datetime start: time range to begin with
+    :param datetime.datetime end: time range to end with
 
-    :returns: **hash** mapping paths to :class:`~stem.descriptor.collector.File`
+    :returns: **list** of :class:`~stem.descriptor.collector.File`
 
     :raises:
       If unable to retrieve the index this provide...
@@ -324,7 +331,20 @@ class CollecTor(object):
     if not self._cached_files or time.time() - self._cached_index_at >= REFRESH_INDEX_RATE:
       self._cached_files = CollecTor._files(self.index(), [])
 
-    return self._cached_files
+    matches = []
+
+    for entry in self._cached_files:
+      if start and (entry.start is None or entry.start < start):
+        continue
+      elif end and (entry.end is None or entry.end > end):
+        continue
+
+      if descriptor_type is None or any([desc_type.startswith(descriptor_type) for desc_type in entry.guess_descriptor_types()]):
+        matches.append(entry)
+
+    matches.sort(key = lambda x: x.start if x.start else FUTURE)
+
+    return matches
 
   @staticmethod
   def _files(val, path):
@@ -334,19 +354,19 @@ class CollecTor(object):
     :param dict val: index hash
     :param list path: path we've transversed into
 
-    :returns: **dict** mapping paths to files
+    :returns: **list** of :class:`~stem.descriptor.collector.File`
     """
 
-    files = {}
+    files = []
 
     if isinstance(val, dict):
       for k, v in val.items():
         if k == 'files':
           for attr in v:
             file_path = '/'.join(path + [attr.get('path')])
-            files[file_path] = File(file_path, attr.get('size'), attr.get('last_modified'))
+            files.append(File(file_path, attr.get('size'), attr.get('last_modified')))
         elif k == 'directories':
           for attr in v:
-            files.update(CollecTor._files(attr, path + [attr.get('path')]))
+            files.extend(CollecTor._files(attr, path + [attr.get('path')]))
 
     return files
diff --git a/test/unit/descriptor/collector.py b/test/unit/descriptor/collector.py
index 297050ea..9d442deb 100644
--- a/test/unit/descriptor/collector.py
+++ b/test/unit/descriptor/collector.py
@@ -131,7 +131,7 @@ class TestCollector(unittest.TestCase):
     self.assertEqual(85, len(files))
     test_path = 'archive/relay-descriptors/extra-infos/extra-infos-2007-09.tar.xz'
 
-    extrainfo_file = files[test_path]
+    extrainfo_file = filter(lambda x: x.path == test_path, files)[0]
     self.assertEqual(test_path, extrainfo_file.path)
     self.assertEqual(Compression.LZMA, extrainfo_file.compression)
     self.assertEqual(True, extrainfo_file.tar)
@@ -172,6 +172,41 @@ class TestCollector(unittest.TestCase):
     f = File('recent/relay-descriptors/extra-infos/2019-07-03-23-05-00-extra-infos', 1162899, '2019-07-03 02:05')
     self.assertEqual(datetime.datetime(2019, 7, 4, 0, 5, 0), f.end)
 
+  @patch(URL_OPEN, Mock(return_value = io.BytesIO(EXAMPLE_INDEX_CONTENT)))
+  def test_file_query_by_type(self):
+    collector = CollecTor(compression = Compression.PLAINTEXT)
+
+    expected = [
+      'archive/relay-descriptors/server-descriptors/server-descriptors-2005-12.tar.xz',
+      'archive/relay-descriptors/server-descriptors/server-descriptors-2006-02.tar.xz',
+      'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz',
+      'recent/relay-descriptors/server-descriptors/2019-07-03-02-05-00-server-descriptors',
+      'recent/relay-descriptors/server-descriptors/2019-07-03-03-05-00-server-descriptors',
+      'recent/relay-descriptors/server-descriptors/2019-07-03-04-05-00-server-descriptors',
+    ]
+
+    self.assertEqual(expected, map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor')))
+
+  @patch(URL_OPEN, Mock(return_value = io.BytesIO(EXAMPLE_INDEX_CONTENT)))
+  def test_file_query_by_date(self):
+    collector = CollecTor(compression = Compression.PLAINTEXT)
+
+    self.assertEqual([
+      'recent/relay-descriptors/server-descriptors/2019-07-03-02-05-00-server-descriptors',
+      'recent/relay-descriptors/server-descriptors/2019-07-03-03-05-00-server-descriptors',
+      'recent/relay-descriptors/server-descriptors/2019-07-03-04-05-00-server-descriptors',
+    ], map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2007, 1, 1))))
+
+    self.assertEqual([
+      'archive/relay-descriptors/server-descriptors/server-descriptors-2005-12.tar.xz',
+      'archive/relay-descriptors/server-descriptors/server-descriptors-2006-02.tar.xz',
+      'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz',
+    ], map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', end = datetime.datetime(2007, 1, 1))))
+
+    self.assertEqual([
+      'archive/relay-descriptors/server-descriptors/server-descriptors-2006-03.tar.xz',
+    ], map(lambda x: x.path, collector.files(descriptor_type = 'server-descriptor', start = datetime.datetime(2006, 2, 10), end = datetime.datetime(2007, 1, 1))))
+
   def test_guess_descriptor_types(self):
     f = File('archive/bridge-descriptors/extra-infos/bridge-extra-infos-2008-05.tar.xz', 377644, '2016-09-04 09:21')
     self.assertEqual(('bridge-extra-info 1.3',), f.guess_descriptor_types())





More information about the tor-commits mailing list