commit 78ad7080d6172365ddfbd50742ea42fa814c0485
Author: Damian Johnson <atagar@torproject.org>
Date:   Sat Oct 24 14:27:23 2020 -0700
Error when reading CollecTor file twice
When a CollecTor file is already within our cache we validate its hash against CollecTor's index. However, files we write to disk are decompressed by default, so their hash naturally mismatches the index's checksum of the compressed archive.
https://github.com/torproject/stem/issues/76
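In essence the index advertises the sha256 of the compressed archive while our cache held decompressed bytes, so the digests can never agree. A minimal standalone illustration of that mismatch (gzip and the literal bytes below are only stand-ins, not CollecTor data)...

  import gzip
  import hashlib

  # stand-in for an archive as CollecTor serves it (compressed)...

  compressed = gzip.compress(b'router demo 127.0.0.1 9001 0 0\n')

  # ... and for what we previously wrote to our cache (decompressed)

  decompressed = gzip.decompress(compressed)

  index_digest = hashlib.sha256(compressed).hexdigest()     # value CollecTor's index lists
  cached_digest = hashlib.sha256(decompressed).hexdigest()  # value re-hashing our cache yields

  print(index_digest == cached_digest)  # False, hence the OSError on the second read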
Reproduced this issue with the following script...
  import stem.descriptor.collector

  collector = stem.descriptor.collector.get_instance()
  desc_file = collector.files(descriptor_type = 'server-descriptor')[0]  # pick any arbitrary file

  print('Number of descriptors (first read): %s' % len(list(desc_file.read(directory = '/tmp/collector_cache'))))
  print('Number of descriptors (second read): %s' % len(list(desc_file.read(directory = '/tmp/collector_cache'))))
Before...
  % python demo.py
  Number of descriptors (first read): 3112
  Traceback (most recent call last):
    File "scrap.py", line 8, in <module>
      print('Number of descriptors (second read): %s' % len(list(desc_file.read(directory = '/tmp/collector_cache'))))
    File "/home/atagar/Desktop/stem/stem/descriptor/collector.py", line 273, in read
      path = self.download(directory, True, timeout, retries)
    File "/home/atagar/Desktop/stem/stem/descriptor/collector.py", line 335, in download
      raise OSError("%s already exists but mismatches CollecTor's checksum (expected: %s, actual: %s)" % (path, expected_hash, actual_hash))
  OSError: /tmp/collector_cache/server-descriptors-2005-12.tar already exists but mismatches CollecTor's checksum (expected: bf700d8b6143e310219b2ce2810abd82f94bc295c7f08e9f1a88989562e33b2f, actual: 32a5ea8fd761e5967fbb8d399742f0da7cbb1c79c1539f2e58cad2e668462652)
After...
  % python demo.py
  Number of descriptors (first read): 3112
  Number of descriptors (second read): 3112
We can either solve this by dropping the hash check or caching compressed archives. Initially I leaned toward the former to expedite cache reads, but on reflection the latter is conceptually simpler. Essentially, is this a network cache or a read cache? A network cache is safer in that if CollecTor replaces a file (but keeps the same filename) this will catch the change.
---
 stem/descriptor/collector.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)
diff --git a/stem/descriptor/collector.py b/stem/descriptor/collector.py
index 892ccc30..a84d9724 100644
--- a/stem/descriptor/collector.py
+++ b/stem/descriptor/collector.py
@@ -270,7 +270,7 @@ class File(object):
 
         return
 
-    path = self.download(directory, True, timeout, retries)
+    path = self.download(directory, timeout, retries)
 
     # Archives can contain multiple descriptor types, so parsing everything and
     # filtering to what we're after.
@@ -290,13 +290,12 @@ class File(object):
 
         yield desc
 
-  def download(self, directory: str, decompress: bool = True, timeout: Optional[int] = None, retries: Optional[int] = 3, overwrite: bool = False) -> str:
+  def download(self, directory: str, timeout: Optional[int] = None, retries: Optional[int] = 3, overwrite: bool = False) -> str:
     """
     Downloads this file to the given location. If a file already exists this is
     a no-op.
 
     :param directory: destination to download into
-    :param decompress: decompress written file
     :param timeout: timeout when connection becomes idle, no timeout applied
       if **None**
     :param retries: maximum attempts to impose
@@ -311,12 +310,7 @@ class File(object):
     """
 
     filename = self.path.split('/')[-1]
-
-    if self.compression != Compression.PLAINTEXT and decompress:
-      filename = filename.rsplit('.', 1)[0]
-
     directory = os.path.expanduser(directory)
-
     path = os.path.join(directory, filename)
 
     if not os.path.exists(directory):
@@ -336,9 +330,6 @@ class File(object):
 
     response = stem.util.connection.download(COLLECTOR_URL + self.path, timeout, retries)
 
-    if decompress:
-      response = self.compression.decompress(response)
-
     with open(path, 'wb') as output_file:
       output_file.write(response)
 
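For context, the network-cache approach settled on above amounts to keeping the archive compressed on disk so its sha256 can be compared directly against the index. A rough sketch of that comparison (hypothetical helper, not stem's actual internals)...

  import hashlib
  import os

  def cached_file_matches_index(path, expected_hash):
    # True if a previously downloaded (still compressed) file matches the
    # sha256 that CollecTor's index advertises for it.

    if not os.path.exists(path):
      return False

    with open(path, 'rb') as cache_file:
      actual_hash = hashlib.sha256(cache_file.read()).hexdigest()

    return actual_hash == expected_hash  # a mismatch means CollecTor replaced the file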