[tor-commits] [stem/master] Implement lazy router descriptor reading

Sat Oct 13 18:35:44 UTC 2012

commit eb0e424ed9459188b70f33ff401e23e9fd89138b
Author: Ravi Chandra Padmala <neenaoffline at gmail.com>
Date:   Wed Aug 8 12:39:55 2012 +0530

    Implement lazy router descriptor reading
---
 stem/descriptor/__init__.py      |   60 +++++++++++++++++++++++++++++++++++++-
 stem/descriptor/networkstatus.py |   36 ++++++++++++++++-------
 2 files changed, 84 insertions(+), 12 deletions(-)

diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index b1f3ab6..168b357 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -148,7 +148,31 @@ class Descriptor(object):
   def __str__(self):
     return self._raw_contents
 
-def _read_until_keywords(keywords, descriptor_file, inclusive = False):
+def _peek_keyword(descriptor_file):
+  """
+  Returns the keyword at the current offset of descriptor_file. Respects the
+  "opt" keyword and returns the next keyword instead.
+  
+  :param file descriptor_file: file with the descriptor content
+  
+  :returns: keyword at the current offset of descriptor_file
+  """
+  
+  last_position = descriptor_file.tell()
+  line = descriptor_file.readline()
+  if not line: return None
+  
+  if " " in line:
+    keyword = line.split(" ", 1)[0]
+    if keyword == "opt":
+        keyword = line.split(" ", 2)[1]
+  else: keyword = line.strip()
+  
+  descriptor_file.seek(last_position)
+  
+  return keyword
+
+def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_first = False):
   """
   Reads from the descriptor file until we get to one of the given keywords or reach the
   end of the file.
@@ -156,6 +180,7 @@ def _read_until_keywords(keywords, descriptor_file, inclusive = False):
   :param str,list keywords: keyword(s) we want to read until
   :param file descriptor_file: file with the descriptor content
   :param bool inclusive: includes the line with the keyword if True
+  :param bool ignore_first: doesn't check if the first line read has one of the given keywords
   
   :returns: list with the lines until we find one of the keywords
   """
@@ -163,6 +188,10 @@ def _read_until_keywords(keywords, descriptor_file, inclusive = False):
   content = []
   if type(keywords) == str: keywords = (keywords,)
   
+  if ignore_first:
+    content.append(descriptor_file.readline())
+    if content == [None]: return []
+  
   while True:
     last_position = descriptor_file.tell()
     line = descriptor_file.readline()
@@ -181,6 +210,35 @@ def _read_until_keywords(keywords, descriptor_file, inclusive = False):
   
   return content
 
+def _skip_until_keywords(keywords, descriptor_file, inclusive = False):
+  """
+  Reads and discards lines of data from the descriptor file until we get to one
+  of the given keywords or reach the end of the file.
+  
+  :param str,list keywords: keyword(s) we want to skip until
+  :param file descriptor_file: file with the descriptor content
+  :param bool inclusive: also skips past the line with the keyword if True
+  
+  :returns: descriptor_file with the new offset
+  """
+  
+  if type(keywords) == str: keywords = (keywords,)
+  
+  while True:
+    last_position = descriptor_file.tell()
+    line = descriptor_file.readline()
+    if not line: break # EOF
+    
+    if " " in line: line_keyword = line.split(" ", 1)[0]
+    else: line_keyword = line.strip()
+    
+    if line_keyword in keywords:
+      if not inclusive: descriptor_file.seek(last_position)
+      
+      break
+  
+  return descriptor_file
+
 def _get_pseudo_pgp_block(remaining_contents):
   """
   Checks if given contents begins with a pseudo-Open-PGP-style block and, if
diff --git a/stem/descriptor/networkstatus.py b/stem/descriptor/networkstatus.py
index e4cfda1..a51fcbd 100644
--- a/stem/descriptor/networkstatus.py
+++ b/stem/descriptor/networkstatus.py
@@ -40,14 +40,19 @@ The documents can be obtained from any of the following sources...
 
 import re
 import datetime
+from StringIO import StringIO
 
 import stem.descriptor
 import stem.version
 import stem.exit_policy
 
+from stem.descriptor import _read_until_keywords, _skip_until_keywords, _peek_keyword
+
 _bandwidth_weights_regex = re.compile(" ".join(["W%s=\d+" % weight for weight in ["bd",
   "be", "bg", "bm", "db", "eb", "ed", "ee", "eg", "em", "gb", "gd", "gg", "gm", "mb", "md", "me", "mg", "mm"]]))
 
+_router_desc_end_kws = ["r", "bandwidth-weights", "directory-footer", "directory-signature"]
+
 def parse_file(document_file, validate = True):
   """
   Iterates over the router descriptors in a network status document.
@@ -62,13 +67,30 @@ def parse_file(document_file, validate = True):
     * IOError if the file can't be read
   """
   
-  return NetworkStatusDocument(document_file.read(), validate).router_descriptors
+  # parse until "r"
+  document_data = "".join(_read_until_keywords("r", document_file))
+  # store offset
+  r_offset = document_file.tell()
+  # skip until end of router descriptors
+  _skip_until_keywords(["bandwidth-weights", "directory-footer", "directory-signature"], document_file)
+  # parse until end
+  document_data = document_data + document_file.read()
+  document = NetworkStatusDocument(document_data, validate)
+  document_file.seek(r_offset)
+  document.router_descriptors = _router_desc_generator(document_file, document.vote_status == "vote", validate)
+  return document.router_descriptors
 
 def _strptime(string, validate = True, optional = False):
   try:
     return datetime.datetime.strptime(string, "%Y-%m-%d %H:%M:%S")
   except ValueError, exc:
     if validate or not optional: raise exc
+    else: return None
+
+def _router_desc_generator(document_file, vote, validate):
+  while _peek_keyword(document_file) == "r":
+    desc_content = "".join(_read_until_keywords(_router_desc_end_kws, document_file, False, True))
+    yield RouterDescriptor(desc_content, vote, validate)
 
 class NetworkStatusDocument(stem.descriptor.Descriptor):
   """
@@ -193,21 +215,13 @@ class NetworkStatusDocument(stem.descriptor.Descriptor):
     
     # authority section
     while doc_parser.line.startswith("dir-source "):
-      dirauth_data = doc_parser.read_until(["dir-source", "r"])
+      dirauth_data = doc_parser.read_until(["dir-source", "r", "directory-footer", "directory-signature", "bandwidth-weights"])
       self.directory_authorities.append(DirectoryAuthority(dirauth_data, vote, validate))
     
-    def _router_desc_generator(raw_content, vote, validate):
-      parser = stem.descriptor.DescriptorParser(raw_content, validate)
-      while parser.line != None:
-        descriptor = parser.read_until("r")
-        yield self._generate_router(descriptor, vote, validate)
-    
     # router descriptors
     if doc_parser.peek_keyword() == "r":
       router_descriptors_data = doc_parser.read_until(["bandwidth-weights", "directory-footer", "directory-signature"])
-      self.router_descriptors = _router_desc_generator(router_descriptors_data, vote, validate)
-    elif validate:
-      raise ValueError("No router descriptors found")
+      self.router_descriptors = _router_desc_generator(StringIO(router_descriptors_data), vote, validate)
     
     # footer section
     if self.consensus_method > 9 or vote and filter(lambda x: x >= 9, self.consensus_methods):