[tor-commits] [stem/master] Handling descriptor files in a depth first fashion

atagar at torproject.org atagar at torproject.org
Sun Apr 15 02:50:21 UTC 2012


commit 9f76969739eccf740da0f77378bcabc5672a85bb
Author: Damian Johnson <atagar at torproject.org>
Date:   Sat Apr 14 18:30:40 2012 -0700

    Handling descriptor files in a depth first fashion
    
    Directories enqueued all of the files that they contained prior to
    processing them, which has a couple of obvious disadvantages...
    
    - huge targets like the root directory or years' worth of descriptors can
      consume lots of memory with the paths alone
    
    - this could easily cause us to have a huge startup time before we provide
      the caller with any descriptors
    
    This was stupid; depth-first parsing makes much more sense.
---
 stem/descriptor/reader.py |   55 +++++++++++++++++++++++---------------------
 1 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py
index 66ed65a..01f8809 100644
--- a/stem/descriptor/reader.py
+++ b/stem/descriptor/reader.py
@@ -327,36 +327,12 @@ class DescriptorReader:
         # adds all of the files that it contains
         for root, _, files in os.walk(target, followlinks = self._follow_links):
           for filename in files:
-            remaining_files.append(os.path.join(root, filename))
+            self._handle_file(os.path.join(root, filename), new_processed_files)
           
           # this can take a while if, say, we're including the root directory
           if self._is_stopped.is_set(): break
       else:
-        # This is a file. Register its last modified timestamp and check if
-        # it's a file that we should skip.
-        
-        last_modified = int(os.stat(target).st_mtime)
-        last_used = self._processed_files.get(target)
-        new_processed_files[target] = last_modified
-        
-        if last_used and last_used >= last_modified:
-          self._notify_skip_listeners(target, AlreadyRead(last_modified, last_used))
-          continue
-        
-        # The mimetypes module only checks the file extension. To actually
-        # check the content (like the 'file' command) we'd need something like
-        # pymagic (https://github.com/cloudburst/pymagic).
-        
-        target_type = mimetypes.guess_type(target)
-        
-        if target_type[0] in (None, 'text/plain'):
-          # either '.txt' or an unknown type
-          self._handle_descriptor_file(target)
-        elif tarfile.is_tarfile(target):
-          # handles gzip, bz2, and decompressed tarballs among others
-          self._handle_archive(target)
-        else:
-          self._notify_skip_listeners(target, UnrecognizedType(target_type))
+        self._handle_file(target, new_processed_files)
     
     self._processed_files = new_processed_files
     
@@ -377,6 +353,33 @@ class DescriptorReader:
           self._iter_notice.wait()
           self._iter_notice.clear()
   
+  def _handle_file(self, target, new_processed_files):
+    # This is a file. Register its last modified timestamp and check if
+    # it's a file that we should skip.
+    
+    last_modified = int(os.stat(target).st_mtime)
+    last_used = self._processed_files.get(target)
+    new_processed_files[target] = last_modified
+    
+    if last_used and last_used >= last_modified:
+      self._notify_skip_listeners(target, AlreadyRead(last_modified, last_used))
+      return
+    
+    # The mimetypes module only checks the file extension. To actually
+    # check the content (like the 'file' command) we'd need something like
+    # pymagic (https://github.com/cloudburst/pymagic).
+    
+    target_type = mimetypes.guess_type(target)
+    
+    if target_type[0] in (None, 'text/plain'):
+      # either '.txt' or an unknown type
+      self._handle_descriptor_file(target)
+    elif tarfile.is_tarfile(target):
+      # handles gzip, bz2, and decompressed tarballs among others
+      self._handle_archive(target)
+    else:
+      self._notify_skip_listeners(target, UnrecognizedType(target_type))
+  
   def _handle_descriptor_file(self, target):
     try:
       with open(target) as target_file:





More information about the tor-commits mailing list