[tor-commits] [stem/master] Inverting network status' document and router relationship

atagar at torproject.org atagar at torproject.org
Sat Oct 13 18:35:45 UTC 2012


commit 35c5eaf2933dad64bab61cc65657d60db276a54a
Author: Damian Johnson <atagar at torproject.org>
Date:   Sun Aug 19 00:21:08 2012 -0700

    Inverting network status' document and router relationship
    
    This code was trying very hard to produce a network status document that
    contained router entries. This is right and proper from an object oriented
    standpoint, but not from a practical one. The document cannot contain the
    router entries unless we keep them all in memory, and for the vast majority of
    users that is not desirable.
    
    In later revisions Ravi addressed the memory concerns by turning the
    router_descriptors attribute into an iterator. This works, but it's confusing
    and the router_descriptors' iterator only works as long as we keep the file open
    (or buffer the content).
    
    Instead, inverting the relationship so this module provides an iterator of
    routers and those routers have a reference to the document they came from. As a
    side benefit this lets us tidy up the code a bit.
    
    There are likely some use cases where we would prefer a document object that
    contains routers, so I'll probably add that option too later.
    
    This also gets rid of the "Flavour" enum. I'm not entirely sure what it was for
    - it wasn't documented and its NS value was unused. Maybe it was vestigial from
    some prior changes.
---
 stem/descriptor/__init__.py            |   19 +++--
 stem/descriptor/networkstatus.py       |  127 ++++++++++++++++----------------
 test/integ/descriptor/networkstatus.py |   15 ++--
 3 files changed, 82 insertions(+), 79 deletions(-)

diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index 151f13e..74b538c 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -65,9 +65,9 @@ def parse_file(path, descriptor_file):
   elif filename == "cached-extrainfo":
     file_parser = stem.descriptor.extrainfo_descriptor.parse_file
   elif filename == "cached-consensus":
-    file_parser = lambda f: stem.descriptor.networkstatus.parse_file(f).router_descriptors
+    file_parser = stem.descriptor.networkstatus.parse_file
   elif filename == "cached-microdesc-consensus":
-    file_parser = lambda f: stem.descriptor.networkstatus.parse_file(f, True, "microdesc").router_descriptors
+    file_parser = lambda f: stem.descriptor.networkstatus.parse_file(f, True, True)
   else:
     # Metrics descriptor handling
     first_line, desc = descriptor_file.readline().strip(), None
@@ -107,14 +107,10 @@ def _parse_metrics_file(descriptor_type, major_version, minor_version, descripto
     
     yield stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor(descriptor_file.read())
   elif descriptor_type in ("network-status-consensus-3", "network-status-vote-3") and major_version == 1:
-    consensus = stem.descriptor.networkstatus.parse_file(descriptor_file)
-    
-    for desc in consensus.router_descriptors:
+    for desc in stem.descriptor.networkstatus.parse_file(descriptor_file):
       yield desc
   elif descriptor_type == "network-status-microdesc-consensus-3" and major_version == 1:
-    consensus = stem.descriptor.networkstatus.parse_file(descriptor_file, flavour = "microdesc")
-    
-    for desc in consensus.router_descriptors:
+    for desc in stem.descriptor.networkstatus.parse_file(descriptor_file, is_microdescriptor = True):
       yield desc
   else:
     raise TypeError("Unrecognized metrics descriptor format. type: '%s', version: '%i.%i'" % (descriptor_type, major_version, minor_version))
@@ -253,7 +249,7 @@ def _read_keyword_line_str(keyword, lines, validate = True, optional = False):
     raise ValueError("Error parsing network status document: Expected %s, received: %s" % (keyword, lines[0]))
   else: return None
 
-def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_first = False, skip = False):
+def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_first = False, skip = False, end_position = None):
   """
   Reads from the descriptor file until we get to one of the given keywords or reach the
   end of the file.
@@ -263,6 +259,7 @@ def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_fi
   :param bool inclusive: includes the line with the keyword if True
   :param bool ignore_first: doesn't check if the first line read has one of the given keywords
   :param bool skip: skips buffering content, returning None
+  :param int end_position: end if we reach this point in the file
   
   :returns: list with the lines until we find one of the keywords
   """
@@ -278,6 +275,10 @@ def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_fi
   
   while True:
     last_position = descriptor_file.tell()
+    
+    if end_position and last_position >= end_position:
+      break
+    
     line = descriptor_file.readline()
     if not line: break # EOF
     
diff --git a/stem/descriptor/networkstatus.py b/stem/descriptor/networkstatus.py
index 3b592ec..5054628 100644
--- a/stem/descriptor/networkstatus.py
+++ b/stem/descriptor/networkstatus.py
@@ -59,14 +59,6 @@ from stem.descriptor import _read_keyword_line, _read_keyword_line_str, _get_pse
 _bandwidth_weights_regex = re.compile(" ".join(["W%s=\d+" % weight for weight in ["bd",
   "be", "bg", "bm", "db", "eb", "ed", "ee", "eg", "em", "gb", "gd", "gg", "gm", "mb", "md", "me", "mg", "mm"]]))
 
-_router_desc_end_kws = ["r", "bandwidth-weights", "directory-footer", "directory-signature"]
-
-Flavour = stem.util.enum.Enum(
-  ("NONE", ""),
-  ("NS", "ns"),
-  ("MICRODESCRIPTOR", "microdesc"),
-)
-
 Flag = stem.util.enum.Enum(
   ("AUTHORITY", "Authority"),
   ("BADEXIT", "BadExit"),
@@ -82,12 +74,13 @@ Flag = stem.util.enum.Enum(
   ("VALID", "Valid"),
 )
 
-def parse_file(document_file, validate = True, flavour = Flavour.NONE):
+def parse_file(document_file, validate = True, is_microdescriptor = False):
   """
   Parses a network status document and provides a NetworkStatusDocument object.
   
   :param file document_file: file with network status document content
   :param bool validate: checks the validity of the document's contents if True, skips these checks otherwise
+  :param bool is_microdescriptor: True if this is for a microdescriptor consensus, False otherwise
   
   :returns: :class:`stem.descriptor.networkstatus.NetworkStatusDocument` object
   
@@ -96,35 +89,52 @@ def parse_file(document_file, validate = True, flavour = Flavour.NONE):
     * IOError if the file can't be read
   """
   
-  # parse until "r"
-  document_data = "".join(_read_until_keywords("r", document_file))
-  # store offset
-  r_offset = document_file.tell()
-  # skip until end of router descriptors
-  _read_until_keywords(["bandwidth-weights", "directory-footer", "directory-signature"], document_file, skip = True)
-  # parse until end
-  document_data = document_data + document_file.read()
-  
-  if flavour == Flavour.NONE:
-    document = NetworkStatusDocument(document_data, validate)
-    document_file.seek(r_offset)
-    document.router_descriptors = _ns_router_desc_generator(document_file, document.vote_status == "vote", validate)
-    return document
-  elif flavour == Flavour.MICRODESCRIPTOR:
-    document = MicrodescriptorConsensus(document_data, validate)
-    document_file.seek(r_offset)
-    document.router_descriptors = _router_microdesc_generator(document_file, validate, document.known_flags)
-    return document
-
-def _ns_router_desc_generator(document_file, vote, validate):
-  while _peek_keyword(document_file) == "r":
-    desc_content = "".join(_read_until_keywords(_router_desc_end_kws, document_file, False, True))
-    yield RouterDescriptor(desc_content, vote, validate)
+  if not is_microdescriptor:
+    document_type, router_type = NetworkStatusDocument, RouterDescriptor
+  else:
+    document_type, router_type = MicrodescriptorConsensus, RouterMicrodescriptor
+  
+  document, routers_start, routers_end = _get_document(document_file, validate, document_type)
+  document_file.seek(routers_start)
+  
+  while document_file.tell() < routers_end:
+    desc_content = "".join(_read_until_keywords("r", document_file, ignore_first = True, end_position = routers_end))
+    yield router_type(desc_content, document, validate)
 
-def _router_microdesc_generator(document_file, validate, known_flags):
-  while _peek_keyword(document_file) == "r":
-    desc_content = "".join(_read_until_keywords(_router_desc_end_kws, document_file, False, True))
-    yield RouterMicrodescriptor(desc_content, validate, known_flags)
+def _get_document(document_file, validate, document_type):
+  """
+  Network status documents consist of three sections: header, router entries,
+  and the footer. This provides back a tuple with the following...
+  (NetworkStatusDocument, routers_start, routers_end)
+  
+  :param file document_file: file with network status document content
+  :param bool validate: checks the validity of the document's contents if True, skips these checks otherwise
+  :param object document_type: consensus document class to construct
+  
+  :returns: tuple with the network status document and range that has the routers
+  
+  :raises:
+    * ValueError if the contents is malformed and validate is True
+    * IOError if the file can't be read
+  """
+  
+  # parse until the first router record
+  
+  header = _read_until_keywords("r", document_file)
+  routers_start = document_file.tell()
+  
+  # figure out the network status version
+  
+  # TODO: we should pick either 'directory-footer' or 'directory-signature'
+  # based on the header's network-status-version
+  
+  _read_until_keywords(["directory-footer", "directory-signature"], document_file, skip = True)
+  routers_end = document_file.tell()
+  footer = document_file.readlines()
+  
+  document_data = "".join(header + footer)
+  
+  return (document_type(document_data, validate), routers_start, routers_end)
 
 class NetworkStatusDocument(stem.descriptor.Descriptor):
   """
@@ -147,7 +157,6 @@ class NetworkStatusDocument(stem.descriptor.Descriptor):
   :var list server_versions: list of recommended Tor server versions
   :var list known_flags: **\*** list of known router flags
   :var list params: dict of parameter(str) => value(int) mappings
-  :var list router_descriptors: **\*** iterator for RouterDescriptor objects defined in the document
   :var list directory_authorities: **\*** list of DirectoryAuthority objects that have generated this document
   :var dict bandwidth_weights: **~** dict of weight(str) => value(int) mappings
   :var list directory_signatures: **\*** list of signatures this document has
@@ -169,7 +178,6 @@ class NetworkStatusDocument(stem.descriptor.Descriptor):
     
     super(NetworkStatusDocument, self).__init__(raw_content)
     
-    self.router_descriptors = []
     self.directory_authorities = []
     self.directory_signatures = []
     self.validated = validate
@@ -192,11 +200,6 @@ class NetworkStatusDocument(stem.descriptor.Descriptor):
     
     self._parse(raw_content)
   
-  def _router_desc_generator(self, document_file):
-    while _peek_keyword(document_file) == "r":
-      desc_content = "".join(_read_until_keywords(_router_desc_end_kws, document_file, False, True))
-      yield RouterDescriptor(desc_content, self.vote_status == "vote", self.validated, self.known_flags)
-  
   def _validate_network_status_version(self):
     return self.network_status_version == "3"
   
@@ -255,11 +258,6 @@ class NetworkStatusDocument(stem.descriptor.Descriptor):
       dirauth_data = "".join(dirauth_data).rstrip()
       self.directory_authorities.append(DirectoryAuthority(dirauth_data, vote, validate))
     
-    # router descriptors
-    if _peek_keyword(content) == "r":
-      router_descriptors_data = "".join(_read_until_keywords(["bandwidth-weights", "directory-footer", "directory-signature"], content, False, True))
-      self.router_descriptors = self._router_desc_generator(StringIO(router_descriptors_data))
-    
     # footer section
     if self.consensus_method > 9 or vote and filter(lambda x: x >= 9, self.consensus_methods):
       if _peek_keyword(content) == "directory-footer":
@@ -396,6 +394,8 @@ class RouterDescriptor(stem.descriptor.Descriptor):
   Router descriptor object. Parses and stores router information in a router
   entry read from a v3 network status document.
   
+  :var NetworkStatusDocument document: **\*** document this descriptor came from
+  
   :var str nickname: **\*** router's nickname
   :var str identity: **\*** router's identity
   :var str digest: **\*** router's digest
@@ -420,13 +420,13 @@ class RouterDescriptor(stem.descriptor.Descriptor):
   | exit_policy appears only in votes
   """
   
-  def __init__(self, raw_contents, vote = True, validate = True, known_flags = Flag):
+  def __init__(self, raw_contents, document, validate = True, known_flags = Flag):
     """
     Parse a router descriptor in a v3 network status document and provide a new
     RouterDescriptor object.
     
     :param str raw_content: router descriptor content to be parsed
-    :param bool vote: True if the descriptor is from a vote document
+    :param NetworkStatusDocument document: document this descriptor came from
     :param bool validate: whether the router descriptor should be validated
     :param bool known_flags: list of known router status flags
     
@@ -435,6 +435,8 @@ class RouterDescriptor(stem.descriptor.Descriptor):
     
     super(RouterDescriptor, self).__init__(raw_contents)
     
+    self.document = document
+    
     self.nickname = None
     self.identity = None
     self.digest = None
@@ -455,18 +457,18 @@ class RouterDescriptor(stem.descriptor.Descriptor):
     
     self.microdescriptor_hashes = []
     
-    self._parse(raw_contents, vote, validate, known_flags)
+    self._parse(raw_contents, validate, known_flags)
   
-  def _parse(self, raw_content, vote, validate, known_flags):
+  def _parse(self, raw_content, validate, known_flags):
     """
     :param dict raw_content: iptor contents to be applied
-    :param bool vote: True if the descriptor is from a vote document
     :param bool validate: checks the validity of descriptor content if True
     :param bool known_flags: list of known router status flags
     
     :raises: ValueError if an error occures in validation
     """
     
+    vote = self.document.vote_status == "vote"
     content = StringIO(raw_content)
     seen_keywords = set()
     peek_check_kw = lambda keyword: keyword == _peek_keyword(content)
@@ -576,7 +578,6 @@ class MicrodescriptorConsensus(NetworkStatusDocument):
   :var list server_versions: list of recommended Tor server versions
   :var list known_flags: **\*** list of known router flags
   :var list params: dict of parameter(str) => value(int) mappings
-  :var list router_descriptors: **\*** iterator for RouterDescriptor objects defined in the document
   :var list directory_authorities: **\*** list of DirectoryAuthority objects that have generated this document
   :var dict bandwidth_weights: **~** dict of weight(str) => value(int) mappings
   :var list directory_signatures: **\*** list of signatures this document has
@@ -585,11 +586,6 @@ class MicrodescriptorConsensus(NetworkStatusDocument):
   | **~** attribute appears only in consensuses
   """
   
-  def _router_desc_generator(self, document_file):
-    while _peek_keyword(document_file) == "r":
-      desc_content = "".join(_read_until_keywords(_router_desc_end_kws, document_file, False, True))
-      yield RouterMicrodescriptor(desc_content, self.validated, self.known_flags)
-  
   def _validate_network_status_version(self):
     return self.network_status_version == "3 microdesc"
 
@@ -598,6 +594,8 @@ class RouterMicrodescriptor(RouterDescriptor):
   Router microdescriptor object. Parses and stores router information in a router
   microdescriptor from a v3 microdescriptor consensus.
   
+  :var MicrodescriptorConsensus document: **\*** document this descriptor came from
+  
   :var str nickname: **\*** router's nickname
   :var str identity: **\*** router's identity
   :var datetime publication: **\*** router's publication
@@ -618,21 +616,24 @@ class RouterMicrodescriptor(RouterDescriptor):
   | **\*** attribute is either required when we're parsed with validation or has a default value, others are left as None if undefined
   """
   
-  def __init__(self, raw_contents, validate = True, known_flags = Flag):
+  def __init__(self, raw_contents, document, validate = True, known_flags = Flag):
     """
     Parse a router descriptor in a v3 microdescriptor consensus and provide a new
     RouterMicrodescriptor object.
     
     :param str raw_content: router descriptor content to be parsed
+    :param MicrodescriptorConsensus document: document this descriptor came from
     :param bool validate: whether the router descriptor should be validated
     :param bool known_flags: list of known router status flags
     
     :raises: ValueError if the descriptor data is invalid
     """
     
-    super(RouterMicrodescriptor, self).__init__(raw_contents, False, validate, known_flags)
+    super(RouterMicrodescriptor, self).__init__(raw_contents, document, validate, known_flags)
+    
+    self.document = document
   
-  def _parse(self, raw_content, _, validate, known_flags):
+  def _parse(self, raw_content, validate, known_flags):
     """
     :param dict raw_content: router descriptor contents to be parsed
     :param bool validate: checks the validity of descriptor content if True
diff --git a/test/integ/descriptor/networkstatus.py b/test/integ/descriptor/networkstatus.py
index 11bd3e3..f95270b 100644
--- a/test/integ/descriptor/networkstatus.py
+++ b/test/integ/descriptor/networkstatus.py
@@ -13,7 +13,6 @@ import stem.exit_policy
 import stem.version
 import stem.descriptor.networkstatus
 import test.integ.descriptor
-from stem.descriptor.networkstatus import Flavour
 
 def _strptime(string):
   return datetime.datetime.strptime(string, "%Y-%m-%d %H:%M:%S")
@@ -39,7 +38,7 @@ class TestNetworkStatusDocument(unittest.TestCase):
     
     count = 0
     with open(descriptor_path) as descriptor_file:
-      for desc in stem.descriptor.networkstatus.parse_file(descriptor_file).router_descriptors:
+      for desc in stem.descriptor.networkstatus.parse_file(descriptor_file):
         if resource.getrusage(resource.RUSAGE_SELF).ru_maxrss > 200000:
           # if we're using > 200 MB we should fail
           self.fail()
@@ -75,7 +74,8 @@ class TestNetworkStatusDocument(unittest.TestCase):
     descriptor_path = test.integ.descriptor.get_resource("cached-consensus")
     
     descriptor_file = file(descriptor_path)
-    desc = stem.descriptor.networkstatus.NetworkStatusDocument(descriptor_file.read())
+    router1 = next(stem.descriptor.networkstatus.parse_file(descriptor_file))
+    desc = router1.document
     descriptor_file.close()
     
     self.assertEquals(True, desc.validated)
@@ -102,7 +102,7 @@ class TestNetworkStatusDocument(unittest.TestCase):
     self.assertEquals(set(desc.known_flags), set(["Authority", "BadExit", "Exit", "Fast", "Guard", "HSDir", "Named", "Running", "Stable", "Unnamed", "V2Dir", "Valid"]))
     expected_params = {"CircuitPriorityHalflifeMsec": 30000, "bwauthpid": 1}
     self.assertEquals(expected_params, desc.params)
-    router1 = next(desc.router_descriptors)
+    
     self.assertEquals("sumkledi", router1.nickname)
     self.assertEquals("ABPSI4nNUNC3hKPkBhyzHozozrU", router1.identity)
     self.assertEquals("8mCr8Sl7RF4ENU4jb0FZFA/3do8", router1.digest)
@@ -167,7 +167,8 @@ I/TJmV928na7RLZe2mGHCAW3VQOvV+QkCfj05VZ8CsY=
     descriptor_path = test.integ.descriptor.get_resource("vote")
     
     descriptor_file = file(descriptor_path)
-    desc = stem.descriptor.networkstatus.NetworkStatusDocument(descriptor_file.read())
+    router1 = next(stem.descriptor.networkstatus.parse_file(descriptor_file))
+    desc = router1.document
     descriptor_file.close()
     
     self.assertEquals(True, desc.validated)
@@ -186,7 +187,7 @@ I/TJmV928na7RLZe2mGHCAW3VQOvV+QkCfj05VZ8CsY=
     self.assertEquals(set(desc.known_flags), set(["Authority", "BadExit", "Exit", "Fast", "Guard", "HSDir", "Running", "Stable", "V2Dir", "Valid"]))
     expected_params = {"CircuitPriorityHalflifeMsec": 30000, "bwauthpid": 1}
     self.assertEquals(expected_params, desc.params)
-    router1 = next(desc.router_descriptors)
+    
     self.assertEquals("sumkledi", router1.nickname)
     self.assertEquals("ABPSI4nNUNC3hKPkBhyzHozozrU", router1.identity)
     self.assertEquals("B5n4BiALAF8B5AqafxohyYiuj7E", router1.digest)
@@ -272,7 +273,7 @@ DnN5aFtYKiTc19qIC7Nmo+afPdDEf0MlJvEOP5EWl3w=
     
     count = 0
     with open(descriptor_path) as descriptor_file:
-      for desc in stem.descriptor.networkstatus.parse_file(descriptor_file, True, flavour = Flavour.MICRODESCRIPTOR).router_descriptors:
+      for desc in stem.descriptor.networkstatus.parse_file(descriptor_file, True, is_microdescriptor = True):
         assert desc.nickname # check that the router has a nickname
         count += 1
     





More information about the tor-commits mailing list