commit 9f76969739eccf740da0f77378bcabc5672a85bb Author: Damian Johnson atagar@torproject.org Date: Sat Apr 14 18:30:40 2012 -0700
Handling descriptor files in a depth first fashion
Directory targets enqueued all of the files they contained prior to processing them, which has a couple of obvious disadvantages...
- huge targets like the root directory or years' worth of descriptors can consume lots of memory for the paths alone
- this could easily cause us to have a huge startup time before we provided the caller any descriptors
This was stupid, depth first parsing makes much more sense. --- stem/descriptor/reader.py | 55 +++++++++++++++++++++++--------------------- 1 files changed, 29 insertions(+), 26 deletions(-)
diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py index 66ed65a..01f8809 100644 --- a/stem/descriptor/reader.py +++ b/stem/descriptor/reader.py @@ -327,36 +327,12 @@ class DescriptorReader: # adds all of the files that it contains for root, _, files in os.walk(target, followlinks = self._follow_links): for filename in files: - remaining_files.append(os.path.join(root, filename)) + self._handle_file(os.path.join(root, filename), new_processed_files)
# this can take a while if, say, we're including the root directory if self._is_stopped.is_set(): break else: - # This is a file. Register its last modified timestamp and check if - # it's a file that we should skip. - - last_modified = int(os.stat(target).st_mtime) - last_used = self._processed_files.get(target) - new_processed_files[target] = last_modified - - if last_used and last_used >= last_modified: - self._notify_skip_listeners(target, AlreadyRead(last_modified, last_used)) - continue - - # The mimetypes module only checks the file extension. To actually - # check the content (like the 'file' command) we'd need something like - # pymagic (https://github.com/cloudburst/pymagic). - - target_type = mimetypes.guess_type(target) - - if target_type[0] in (None, 'text/plain'): - # either '.txt' or an unknown type - self._handle_descriptor_file(target) - elif tarfile.is_tarfile(target): - # handles gzip, bz2, and decompressed tarballs among others - self._handle_archive(target) - else: - self._notify_skip_listeners(target, UnrecognizedType(target_type)) + self._handle_file(target, new_processed_files)
self._processed_files = new_processed_files
@@ -377,6 +353,33 @@ class DescriptorReader: self._iter_notice.wait() self._iter_notice.clear()
+ def _handle_file(self, target, new_processed_files): + # This is a file. Register its last modified timestamp and check if + # it's a file that we should skip. + + last_modified = int(os.stat(target).st_mtime) + last_used = self._processed_files.get(target) + new_processed_files[target] = last_modified + + if last_used and last_used >= last_modified: + self._notify_skip_listeners(target, AlreadyRead(last_modified, last_used)) + return + + # The mimetypes module only checks the file extension. To actually + # check the content (like the 'file' command) we'd need something like + # pymagic (https://github.com/cloudburst/pymagic). + + target_type = mimetypes.guess_type(target) + + if target_type[0] in (None, 'text/plain'): + # either '.txt' or an unknown type + self._handle_descriptor_file(target) + elif tarfile.is_tarfile(target): + # handles gzip, bz2, and decompressed tarballs among others + self._handle_archive(target) + else: + self._notify_skip_listeners(target, UnrecognizedType(target_type)) + def _handle_descriptor_file(self, target): try: with open(target) as target_file: