[tor-commits] [stem/master] Estimate descriptor dates from filenames

atagar at torproject.org atagar at torproject.org
Sat Aug 17 20:44:27 UTC 2019


commit 8be08440456199873a1d3c6eb2f8e3ed968eedc0
Author: Damian Johnson <atagar at torproject.org>
Date:   Sun Jul 14 14:37:52 2019 -0700

    Estimate descriptor dates from filenames
    
    Now that we have descriptor types and compression, the last ingredient we
    need is the time range each file covers. We guess this from the filename.
---
 stem/descriptor/collector.py      | 65 ++++++++++++++++++++++++++++++++++-----
 test/unit/descriptor/collector.py | 21 +++++++++++++
 2 files changed, 78 insertions(+), 8 deletions(-)

diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py
index a47612cb..c1458784 100644
--- a/stem/descriptor/collector.py
+++ b/stem/descriptor/collector.py
@@ -51,6 +51,7 @@ With this you can either download and read directly from CollecTor...
 
 import datetime
 import json
+import re
 import sys
 import time
 
@@ -70,6 +71,10 @@ import stem.util.str_tools
 COLLECTOR_URL = 'https://collector.torproject.org/'
 REFRESH_INDEX_RATE = 3600  # get new index if cached copy is an hour old
 
+YEAR_DATE = re.compile('-(\\d{4})-(\\d{2})\\.')
+SEC_DATE = re.compile('(\\d{4}-\\d{2}-\\d{2}-\\d{2}-\\d{2}-\\d{2})')
+
+
 # mapping of path prefixes to their descriptor type (sampled 7/11/19)
 
 COLLECTOR_DESC_TYPES = {
@@ -165,23 +170,24 @@ class File(object):
     this cannot be determined
   :var bool tar: **True** if a tarball, **False** otherwise
   :var int size: size of the file
+
+  :var datetime start: beginning of the time range descriptors are for,
+    **None** if this cannot be determined
+  :var datetime end: ending of the time range descriptors are for,
+    **None** if this cannot be determined
   :var datetime last_modified: when the file was last modified
   """
 
   def __init__(self, path, size, last_modified):
     self.path = path
-    self.compression = None
+    self.compression = File._guess_compression(path)
     self.tar = path.endswith('.tar') or '.tar.' in path
     self.size = size
+
+    self.start, self.end = File._guess_time_range(path)
     self.last_modified = datetime.datetime.strptime(last_modified, '%Y-%m-%d %H:%M')
-    self._guessed_type = None
 
-    if '.' not in self.path or self.path.endswith('.tar'):
-      self.compression = Compression.PLAINTEXT
-    else:
-      for compression in (Compression.LZMA, Compression.BZ2, Compression.GZIP):
-        if self.path.endswith(compression.extension):
-          self.compression = compression
+    self._guessed_type = None
 
   def guess_descriptor_types(self):
     """
@@ -203,6 +209,49 @@ class File(object):
 
     return self._guessed_type
 
+  @staticmethod
+  def _guess_compression(path):
+    """
+    Determine file compression from CollecTor's filename.
+    """
+
+    if '.' not in path or path.endswith('.tar'):
+      return Compression.PLAINTEXT
+    else:
+      for compression in (Compression.LZMA, Compression.BZ2, Compression.GZIP):
+        if path.endswith(compression.extension):
+          return compression
+
+  @staticmethod
+  def _guess_time_range(path):
+    """
+    Attempt to determine the (start, end) time range from CollecTor's filename.
+    This provides (None, None) if this cannot be determined.
+    """
+
+    year_match = YEAR_DATE.search(path)
+
+    if year_match:
+      year, month = map(int, year_match.groups())
+      start = datetime.datetime(year, month, 1)
+
+      if month < 12:
+        return (start, datetime.datetime(year, month + 1, 1))
+      else:
+        return (start, datetime.datetime(year + 1, 1, 1))
+
+    sec_match = SEC_DATE.search(path)
+
+    if sec_match:
+      # Descriptors in the 'recent/*' section have filenames with second level
+      # granularity. Not quite sure why, but since consensus documents are
+      # published hourly we'll use that as the delta here.
+
+      start = datetime.datetime.strptime(sec_match.group(1), '%Y-%m-%d-%H-%M-%S')
+      return (start, start + datetime.timedelta(seconds = 3600))
+
+    return (None, None)
+
 
 class CollecTor(object):
   """
diff --git a/test/unit/descriptor/collector.py b/test/unit/descriptor/collector.py
index 3c4d39a0..297050ea 100644
--- a/test/unit/descriptor/collector.py
+++ b/test/unit/descriptor/collector.py
@@ -151,6 +151,27 @@ class TestCollector(unittest.TestCase):
     self.assertEqual(Compression.PLAINTEXT, f.compression)
     self.assertEqual(False, f.tar)
 
+  def test_file_date_attributes(self):
+    f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59')
+    self.assertEqual(datetime.datetime(2014, 1, 1), f.start)
+    self.assertEqual(datetime.datetime(2014, 2, 1), f.end)
+
+    f = File('recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos', 1162899, '2019-07-03 02:05')
+    self.assertEqual(datetime.datetime(2019, 7, 3, 2, 5, 0), f.start)
+    self.assertEqual(datetime.datetime(2019, 7, 3, 3, 5, 0), f.end)
+
+    f = File('archive/relay-descriptors/certs.tar.xz', 144696, '2019-07-03 03:29')
+    self.assertEqual(None, f.start)
+    self.assertEqual(None, f.end)
+
+    # check date boundaries
+
+    f = File('archive/relay-descriptors/microdescs/microdescs-2014-12.tar.xz', 7515396, '2014-02-07 03:59')
+    self.assertEqual(datetime.datetime(2015, 1, 1), f.end)
+
+    f = File('recent/relay-descriptors/extra-infos/2019-07-03-23-05-00-extra-infos', 1162899, '2019-07-03 02:05')
+    self.assertEqual(datetime.datetime(2019, 7, 4, 0, 5, 0), f.end)
+
   def test_guess_descriptor_types(self):
     f = File('archive/bridge-descriptors/extra-infos/bridge-extra-infos-2008-05.tar.xz', 377644, '2016-09-04 09:21')
     self.assertEqual(('bridge-extra-info 1.3',), f.guess_descriptor_types())





More information about the tor-commits mailing list