[stem/master] Estimate descriptor dates from filenames

commit 8be08440456199873a1d3c6eb2f8e3ed968eedc0 Author: Damian Johnson <atagar@torproject.org> Date: Sun Jul 14 14:37:52 2019 -0700 Estimate descriptor dates from filenames Now that we have descriptor types and compression the last ingredient we need are the time ranges a file is for. Guessing this from filenames. --- stem/descriptor/collector.py | 65 ++++++++++++++++++++++++++++++++++----- test/unit/descriptor/collector.py | 21 +++++++++++++ 2 files changed, 78 insertions(+), 8 deletions(-) diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py index a47612cb..c1458784 100644 --- a/stem/descriptor/collector.py +++ b/stem/descriptor/collector.py @@ -51,6 +51,7 @@ With this you can either download and read directly from CollecTor... import datetime import json +import re import sys import time @@ -70,6 +71,10 @@ import stem.util.str_tools COLLECTOR_URL = 'https://collector.torproject.org/' REFRESH_INDEX_RATE = 3600 # get new index if cached copy is an hour old +YEAR_DATE = re.compile('-(\\d{4})-(\\d{2})\\.') +SEC_DATE = re.compile('(\\d{4}-\\d{2}-\\d{2}-\\d{2}-\\d{2}-\\d{2})') + + # mapping of path prefixes to their descriptor type (sampled 7/11/19) COLLECTOR_DESC_TYPES = { @@ -165,23 +170,24 @@ class File(object): this cannot be determined :var bool tar: **True** if a tarball, **False** otherwise :var int size: size of the file + + :var datetime start: beginning of the time range descriptors are for, + **None** if this cannot be determined + :var datetime end: ending of the time range descriptors are for, + **None** if this cannot be determined :var datetime last_modified: when the file was last modified """ def __init__(self, path, size, last_modified): self.path = path - self.compression = None + self.compression = File._guess_compression(path) self.tar = path.endswith('.tar') or '.tar.' 
in path self.size = size + + self.start, self.end = File._guess_time_range(path) self.last_modified = datetime.datetime.strptime(last_modified, '%Y-%m-%d %H:%M') - self._guessed_type = None - if '.' not in self.path or self.path.endswith('.tar'): - self.compression = Compression.PLAINTEXT - else: - for compression in (Compression.LZMA, Compression.BZ2, Compression.GZIP): - if self.path.endswith(compression.extension): - self.compression = compression + self._guessed_type = None def guess_descriptor_types(self): """ @@ -203,6 +209,49 @@ class File(object): return self._guessed_type + @staticmethod + def _guess_compression(path): + """ + Determine file compression from CollecTor's filename. + """ + + if '.' not in path or path.endswith('.tar'): + return Compression.PLAINTEXT + else: + for compression in (Compression.LZMA, Compression.BZ2, Compression.GZIP): + if path.endswith(compression.extension): + return compression + + @staticmethod + def _guess_time_range(path): + """ + Attempt to determine the (start, end) time range from CollecTor's filename. + This provides (None, None) if this cannot be determined. + """ + + year_match = YEAR_DATE.search(path) + + if year_match: + year, month = map(int, year_match.groups()) + start = datetime.datetime(year, month, 1) + + if month < 12: + return (start, datetime.datetime(year, month + 1, 1)) + else: + return (start, datetime.datetime(year + 1, 1, 1)) + + sec_match = SEC_DATE.search(path) + + if sec_match: + # Descriptors in the 'recent/*' section have filenames with second level + # granularity. Not quite sure why, but since consensus documents are + # published hourly we'll use that as the delta here. 
+ + start = datetime.datetime.strptime(sec_match.group(1), '%Y-%m-%d-%H-%M-%S') + return (start, start + datetime.timedelta(seconds = 3600)) + + return (None, None) + class CollecTor(object): """ diff --git a/test/unit/descriptor/collector.py b/test/unit/descriptor/collector.py index 3c4d39a0..297050ea 100644 --- a/test/unit/descriptor/collector.py +++ b/test/unit/descriptor/collector.py @@ -151,6 +151,27 @@ class TestCollector(unittest.TestCase): self.assertEqual(Compression.PLAINTEXT, f.compression) self.assertEqual(False, f.tar) + def test_file_date_attributes(self): + f = File('archive/relay-descriptors/microdescs/microdescs-2014-01.tar.xz', 7515396, '2014-02-07 03:59') + self.assertEqual(datetime.datetime(2014, 1, 1), f.start) + self.assertEqual(datetime.datetime(2014, 2, 1), f.end) + + f = File('recent/relay-descriptors/extra-infos/2019-07-03-02-05-00-extra-infos', 1162899, '2019-07-03 02:05') + self.assertEqual(datetime.datetime(2019, 7, 3, 2, 5, 0), f.start) + self.assertEqual(datetime.datetime(2019, 7, 3, 3, 5, 0), f.end) + + f = File('archive/relay-descriptors/certs.tar.xz', 144696, '2019-07-03 03:29') + self.assertEqual(None, f.start) + self.assertEqual(None, f.end) + + # check date boundaries + + f = File('archive/relay-descriptors/microdescs/microdescs-2014-12.tar.xz', 7515396, '2014-02-07 03:59') + self.assertEqual(datetime.datetime(2015, 1, 1), f.end) + + f = File('recent/relay-descriptors/extra-infos/2019-07-03-23-05-00-extra-infos', 1162899, '2019-07-03 02:05') + self.assertEqual(datetime.datetime(2019, 7, 4, 0, 5, 0), f.end) + def test_guess_descriptor_types(self): f = File('archive/bridge-descriptors/extra-infos/bridge-extra-infos-2008-05.tar.xz', 377644, '2016-09-04 09:21') self.assertEqual(('bridge-extra-info 1.3',), f.guess_descriptor_types())
participants (1)
-
atagar@torproject.org