[tor-commits] [stem/master] Providing alternative methods for parsing a NetworkStatusDocument

atagar at torproject.org atagar at torproject.org
Sun Feb 3 21:17:52 UTC 2013


commit ea0b73a5aa221fadafc2ba718a0ef42e151e5ad6
Author: Damian Johnson <atagar at torproject.org>
Date:   Sun Feb 3 13:00:56 2013 -0800

    Providing alternative methods for parsing a NetworkStatusDocument
    
    Adding support in both the DescriptorReader and parse_file() function for three
    ways of parsing network status documents...
    
    a. Provide the router status entries (ie. the current behavior).
    
    b. Provide the document itself with the router status entries that it contains.
       This has the biggest cost in terms of upfront parsing time and memory usage,
       but provides the caller with everything they might want.
    
    c. Provide the document but skip reading the router status entries. A handy
       option if you just care about the document's header/footer.
    
    Now that we have these capabilities I'm further simplifying the descriptor API a
    bit. The network status docs encouraged users to use the NetworkStatusDocument
    constructors to achieve option 'b' above, but now that it's in the reader and
    parse_file() there's no reason for them to do that.
    
    Users should now *always* use either the DescriptorReader or parse_file(). If
    they don't then they're off the reservation.
---
 stem/descriptor/__init__.py                       |   49 ++++++++---
 stem/descriptor/networkstatus.py                  |   94 +++++++++------------
 stem/descriptor/reader.py                         |    9 ++-
 test/settings.cfg                                 |    2 +-
 test/unit/descriptor/networkstatus/document_v3.py |   23 +++++
 5 files changed, 108 insertions(+), 69 deletions(-)

diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index f99e3ca..fafdd8a 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -11,6 +11,24 @@ Package for parsing and processing descriptor data.
     |- get_path - location of the descriptor on disk if it came from a file
     |- get_unrecognized_lines - unparsed descriptor content
     +- __str__ - string that the descriptor was made from
+
+.. data:: DocumentHandler (enum)
+
+  Ways in which we can parse a
+  :class:`~stem.descriptor.networkstatus.NetworkStatusDocument`.
+
+  Both **ENTRIES** and **BARE_DOCUMENT** have a 'thin' document, which doesn't
+  have a populated **routers** attribute. This allows for lower memory usage
+  and upfront runtime. However, if read time and memory aren't a concern then
+  **DOCUMENT** can provide you with a fully populated document.
+
+  =================== ===========
+  DocumentHandler     Description
+  =================== ===========
+  **ENTRIES**         Iterates over the contained :class:`~stem.descriptor.router_status_entry.RouterStatusEntry`. Each has a reference to the bare document it came from (through its **document** attribute).
+  **DOCUMENT**        :class:`~stem.descriptor.networkstatus.NetworkStatusDocument` with the :class:`~stem.descriptor.router_status_entry.RouterStatusEntry` it contains (through its **routers** attribute).
+  **BARE_DOCUMENT**   :class:`~stem.descriptor.networkstatus.NetworkStatusDocument` **without** a reference to its contents (the :class:`~stem.descriptor.router_status_entry.RouterStatusEntry` are unread).
+  =================== ===========
 """
 
 __all__ = [
@@ -28,6 +46,7 @@ import os
 import re
 
 import stem.prereq
+import stem.util.enum
 import stem.util.str_tools
 
 try:
@@ -42,8 +61,14 @@ KEYWORD_LINE = re.compile("^([%s]+)(?:[%s]+(.*))?$" % (KEYWORD_CHAR, WHITESPACE)
 PGP_BLOCK_START = re.compile("^-----BEGIN ([%s%s]+)-----$" % (KEYWORD_CHAR, WHITESPACE))
 PGP_BLOCK_END = "-----END %s-----"
 
+DocumentHandler = stem.util.enum.UppercaseEnum(
+  "ENTRIES",
+  "DOCUMENT",
+  "BARE_DOCUMENT",
+)
+
 
-def parse_file(descriptor_file, descriptor_type = None, path = None, validate = True):
+def parse_file(descriptor_file, descriptor_type = None, path = None, validate = True, document_handler = DocumentHandler.ENTRIES):
   """
   Simple function to read the descriptor contents from a file, providing an
   iterator for its :class:`~stem.descriptor.__init__.Descriptor` contents.
@@ -84,7 +109,7 @@ def parse_file(descriptor_file, descriptor_type = None, path = None, validate =
   ========================================= =====
 
   If you're using **python 3** then beware that the open() function defaults to
-  using **text mode**. **Binary mode** is strongly suggested because it's both
+  using text mode. **Binary mode** is strongly suggested because it's both
   faster (by my testing by about 33x) and doesn't do universal newline
   translation which can make us misparse the document.
 
@@ -97,6 +122,8 @@ def parse_file(descriptor_file, descriptor_type = None, path = None, validate =
   :param str path: absolute path to the file's location on disk
   :param bool validate: checks the validity of the descriptor's content if
     **True**, skips these checks otherwise
+  :param stem.descriptor.__init__.DocumentHandler document_handler: method in
+    which to parse :class:`~stem.descriptor.networkstatus.NetworkStatusDocument`
 
   :returns: iterator for :class:`~stem.descriptor.__init__.Descriptor` instances in the file
 
@@ -134,14 +161,14 @@ def parse_file(descriptor_file, descriptor_type = None, path = None, validate =
 
     if descriptor_type_match:
       desc_type, major_version, minor_version = descriptor_type_match.groups()
-      file_parser = lambda f: _parse_metrics_file(desc_type, int(major_version), int(minor_version), f, validate)
+      file_parser = lambda f: _parse_metrics_file(desc_type, int(major_version), int(minor_version), f, validate, document_handler)
     else:
       raise ValueError("The descriptor_type must be of the form '<type> <major_version>.<minor_version>'")
   elif metrics_header_match:
     # Metrics descriptor handling
 
     desc_type, major_version, minor_version = metrics_header_match.groups()
-    file_parser = lambda f: _parse_metrics_file(desc_type, int(major_version), int(minor_version), f, validate)
+    file_parser = lambda f: _parse_metrics_file(desc_type, int(major_version), int(minor_version), f, validate, document_handler)
   else:
     # Cached descriptor handling. These contain multiple descriptors per file.
 
@@ -150,9 +177,9 @@ def parse_file(descriptor_file, descriptor_type = None, path = None, validate =
     elif filename == "cached-extrainfo":
       file_parser = lambda f: stem.descriptor.extrainfo_descriptor._parse_file(f, validate = validate)
     elif filename == "cached-consensus":
-      file_parser = lambda f: stem.descriptor.networkstatus._parse_file(f, validate = validate)
+      file_parser = lambda f: stem.descriptor.networkstatus._parse_file(f, validate = validate, document_handler = document_handler)
     elif filename == "cached-microdesc-consensus":
-      file_parser = lambda f: stem.descriptor.networkstatus._parse_file(f, is_microdescriptor = True, validate = validate)
+      file_parser = lambda f: stem.descriptor.networkstatus._parse_file(f, is_microdescriptor = True, validate = validate, document_handler = document_handler)
 
   if file_parser:
     for desc in file_parser(descriptor_file):
@@ -168,7 +195,7 @@ def parse_file(descriptor_file, descriptor_type = None, path = None, validate =
   raise TypeError("Unable to determine the descriptor's type. filename: '%s', first line: '%s'" % (filename, first_line))
 
 
-def _parse_metrics_file(descriptor_type, major_version, minor_version, descriptor_file, validate):
+def _parse_metrics_file(descriptor_type, major_version, minor_version, descriptor_file, validate, document_handler):
   # Parses descriptor files from metrics, yielding individual descriptors. This
   # throws a TypeError if the descriptor_type or version isn't recognized.
   import stem.descriptor.server_descriptor
@@ -193,24 +220,24 @@ def _parse_metrics_file(descriptor_type, major_version, minor_version, descripto
   elif descriptor_type == "network-status-2" and major_version == 1:
     document_type = stem.descriptor.networkstatus.NetworkStatusDocumentV2
 
-    for desc in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type, validate = validate):
+    for desc in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type, validate = validate, document_handler = document_handler):
       yield desc
   elif descriptor_type == "dir-key-certificate-3" and major_version == 1:
     yield stem.descriptor.networkstatus.KeyCertificate(descriptor_file.read(), validate = validate)
   elif descriptor_type in ("network-status-consensus-3", "network-status-vote-3") and major_version == 1:
     document_type = stem.descriptor.networkstatus.NetworkStatusDocumentV3
 
-    for desc in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type, validate = validate):
+    for desc in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type, validate = validate, document_handler = document_handler):
       yield desc
   elif descriptor_type == "network-status-microdesc-consensus-3" and major_version == 1:
     document_type = stem.descriptor.networkstatus.NetworkStatusDocumentV3
 
-    for desc in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type, is_microdescriptor = True, validate = validate):
+    for desc in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type, is_microdescriptor = True, validate = validate, document_handler = document_handler):
       yield desc
   elif descriptor_type == "bridge-network-status" and major_version == 1:
     document_type = stem.descriptor.networkstatus.BridgeNetworkStatusDocument
 
-    for desc in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type, validate = validate):
+    for desc in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type, validate = validate, document_handler = document_handler):
       yield desc
   else:
     raise TypeError("Unrecognized metrics descriptor format. type: '%s', version: '%i.%i'" % (descriptor_type, major_version, minor_version))
diff --git a/stem/descriptor/networkstatus.py b/stem/descriptor/networkstatus.py
index 0d60de1..9f3730c 100644
--- a/stem/descriptor/networkstatus.py
+++ b/stem/descriptor/networkstatus.py
@@ -15,43 +15,18 @@ dir-spec. Documents can be obtained from a few sources...
 
 Of these, the router status entry section can be quite large (on the order of
 hundreds of kilobytes). As such we provide a couple of methods for reading
-network status documents...
-
-* :class:`stem.descriptor.networkstatus.NetworkStatusDocumentV3` constructor
-
-If read time and memory aren't a concern then you can simply use the document
-constructor. Router entries are assigned to its 'routers' attribute...
-
-::
-
-  from stem.descriptor.networkstatus import NetworkStatusDocumentV3
-
-  # Reads the full consensus into memory twice (both for the parsed and
-  # unparsed contents).
-
-  consensus_file = open('.tor/cached-consensus', 'r')
-  consensus = NetworkStatusDocumentV3(consensus_file.read())
-  consensus_file.close()
-
-  for router in consensus.routers:
-    print router.nickname
-
-* :func:`stem.descriptor.parse_file`
-
-Alternatively, the :func:`~stem.descriptor.parse_file` function provides an
-iterator for a document's routers. Those routers refer to a 'thin' document,
-which doesn't have a 'routers' attribute. This allows for lower memory usage
-and upfront runtime.
+network status documents through :func:`~stem.descriptor.__init__.parse_file`.
+For more information see :data:`~stem.descriptor.__init__.DocumentHandler`...
 
 ::
 
-  from stem.descriptor import parse_file
+  from stem.descriptor import parse_file, DocumentHandler
 
-  with open('.tor/cached-consensus', 'r') as consensus_file:
+  with open('.tor/cached-consensus', 'rb') as consensus_file:
     # Processes the routers as we read them in. The routers refer to a document
     # with an unset 'routers' attribute.
 
-    for router in parse_file(consensus_file, 'network-status-consensus-3 1.0'):
+    for router in parse_file(consensus_file, 'network-status-consensus-3 1.0', document_handler = DocumentHandler.ENTRIES):
       print router.nickname
 
 **Module Overview:**
@@ -167,7 +142,7 @@ BANDWIDTH_WEIGHT_ENTRIES = (
 )
 
 
-def _parse_file(document_file, document_type = None, validate = True, is_microdescriptor = False):
+def _parse_file(document_file, document_type = None, validate = True, is_microdescriptor = False, document_handler = stem.descriptor.DocumentHandler.ENTRIES):
   """
   Parses a network status and iterates over the RouterStatusEntry in it. The
   document that these instances reference have an empty 'routers' attribute to
@@ -179,6 +154,8 @@ def _parse_file(document_file, document_type = None, validate = True, is_microde
     **True**, skips these checks otherwise
   :param bool is_microdescriptor: **True** if this is for a microdescriptor
     consensus, **False** otherwise
+  :param stem.descriptor.__init__.DocumentHandler document_handler: method in
+    which to parse :class:`~stem.descriptor.networkstatus.NetworkStatusDocument`
 
   :returns: :class:`stem.descriptor.networkstatus.NetworkStatusDocument` object
 
@@ -193,17 +170,6 @@ def _parse_file(document_file, document_type = None, validate = True, is_microde
   if document_type is None:
     document_type = NetworkStatusDocumentV3
 
-  # getting the document without the routers section
-
-  header = stem.descriptor._read_until_keywords((ROUTERS_START, FOOTER_START, V2_FOOTER_START), document_file)
-
-  routers_start = document_file.tell()
-  stem.descriptor._read_until_keywords((FOOTER_START, V2_FOOTER_START), document_file, skip = True)
-  routers_end = document_file.tell()
-
-  footer = document_file.readlines()
-  document_content = "".join(header + footer)
-
   if document_type == NetworkStatusDocumentV2:
     document_type = NetworkStatusDocumentV2
     router_type = stem.descriptor.router_status_entry.RouterStatusEntryV2
@@ -218,18 +184,38 @@ def _parse_file(document_file, document_type = None, validate = True, is_microde
   else:
     raise ValueError("Document type %i isn't recognized (only able to parse v2, v3, and bridge)" % document_type)
 
-  desc_iterator = stem.descriptor.router_status_entry._parse_file(
-    document_file,
-    validate,
-    entry_class = router_type,
-    entry_keyword = ROUTERS_START,
-    start_position = routers_start,
-    end_position = routers_end,
-    extra_args = (document_type(document_content, validate),),
-  )
-
-  for desc in desc_iterator:
-    yield desc
+  if document_handler == stem.descriptor.DocumentHandler.DOCUMENT:
+    yield document_type(document_file.read(), validate)
+    return
+
+  # getting the document without the routers section
+
+  header = stem.descriptor._read_until_keywords((ROUTERS_START, FOOTER_START, V2_FOOTER_START), document_file)
+
+  routers_start = document_file.tell()
+  stem.descriptor._read_until_keywords((FOOTER_START, V2_FOOTER_START), document_file, skip = True)
+  routers_end = document_file.tell()
+
+  footer = document_file.readlines()
+  document_content = "".join(header + footer)
+
+  if document_handler == stem.descriptor.DocumentHandler.BARE_DOCUMENT:
+    yield document_type(document_content, validate)
+  elif document_handler == stem.descriptor.DocumentHandler.ENTRIES:
+    desc_iterator = stem.descriptor.router_status_entry._parse_file(
+      document_file,
+      validate,
+      entry_class = router_type,
+      entry_keyword = ROUTERS_START,
+      start_position = routers_start,
+      end_position = routers_end,
+      extra_args = (document_type(document_content, validate),),
+    )
+
+    for desc in desc_iterator:
+      yield desc
+  else:
+    raise ValueError("Unrecognized document_handler: %s" % document_handler)
 
 
 class NetworkStatusDocument(stem.descriptor.Descriptor):
diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py
index f00e0aa..21ab049 100644
--- a/stem/descriptor/reader.py
+++ b/stem/descriptor/reader.py
@@ -255,9 +255,11 @@ class DescriptorReader(object):
     be read, this is unbounded if zero
   :param str persistence_path: if set we will load and save processed file
     listings from this path, errors are ignored
+  :param stem.descriptor.__init__.DocumentHandler document_handler: method in
+    which to parse :class:`~stem.descriptor.networkstatus.NetworkStatusDocument`
   """
 
-  def __init__(self, target, validate = True, follow_links = False, buffer_size = 100, persistence_path = None):
+  def __init__(self, target, validate = True, follow_links = False, buffer_size = 100, persistence_path = None, document_handler = stem.descriptor.DocumentHandler.ENTRIES):
     if isinstance(target, str):
       self._targets = [target]
     else:
@@ -266,6 +268,7 @@ class DescriptorReader(object):
     self._validate = validate
     self._follow_links = follow_links
     self._persistence_path = persistence_path
+    self._document_handler = document_handler
     self._read_listeners = []
     self._skip_listeners = []
     self._processed_files = {}
@@ -514,7 +517,7 @@ class DescriptorReader(object):
       self._notify_read_listeners(target)
 
       with open(target, 'rb') as target_file:
-        for desc in stem.descriptor.parse_file(target_file, validate = self._validate, path = target):
+        for desc in stem.descriptor.parse_file(target_file, validate = self._validate, path = target, document_handler = self._document_handler):
           if self._is_stopped.isSet():
             return
 
@@ -542,7 +545,7 @@ class DescriptorReader(object):
         if tar_entry.isfile():
           entry = tar_file.extractfile(tar_entry)
 
-          for desc in stem.descriptor.parse_file(entry, validate = self._validate, path = target):
+          for desc in stem.descriptor.parse_file(entry, validate = self._validate, path = target, document_handler = self._document_handler):
             if self._is_stopped.isSet():
               return
 
diff --git a/test/settings.cfg b/test/settings.cfg
index 6ad4b59..a5abc91 100644
--- a/test/settings.cfg
+++ b/test/settings.cfg
@@ -157,7 +157,7 @@ target.torrc RUN_PTRACE   => PORT, PTRACE
 pyflakes.ignore stem/prereq.py => 'RSA' imported but unused
 pyflakes.ignore stem/prereq.py => 'asn1' imported but unused
 pyflakes.ignore stem/prereq.py => 'long_to_bytes' imported but unused
-pyflakes.ignore stem/descriptor/__init__.py => redefinition of unused 'OrderedDict' from line 35
+pyflakes.ignore stem/descriptor/__init__.py => redefinition of unused 'OrderedDict' from line 54
 pyflakes.ignore stem/util/str_tools.py => redefinition of function '_to_bytes' from line 54
 pyflakes.ignore stem/util/str_tools.py => redefinition of function '_to_unicode' from line 60
 pyflakes.ignore test/mocking.py => undefined name 'builtins'
diff --git a/test/unit/descriptor/networkstatus/document_v3.py b/test/unit/descriptor/networkstatus/document_v3.py
index b1cc74e..74c7fe4 100644
--- a/test/unit/descriptor/networkstatus/document_v3.py
+++ b/test/unit/descriptor/networkstatus/document_v3.py
@@ -130,6 +130,29 @@ class TestNetworkStatusDocument(unittest.TestCase):
       for router in stem.descriptor.parse_file(consensus_file, 'network-status-consensus-3 1.0'):
         self.assertEqual('caerSidi', router.nickname)
 
+  def test_handlers(self):
+    """
+    Try parsing a document with DocumentHandler.DOCUMENT and
+    DocumentHandler.BARE_DOCUMENT.
+    """
+
+    # Simple sanity check that they provide the right type, and that the
+    # document includes or excludes the router status entries as appropriate.
+
+    entry1 = get_router_status_entry_v3({'s': "Fast"})
+    entry2 = get_router_status_entry_v3({'s': "Valid"})
+    content = get_network_status_document_v3(routers = (entry1, entry2), content = True)
+
+    descriptors = list(stem.descriptor.parse_file(StringIO.StringIO(content), 'network-status-consensus-3 1.0', document_handler = stem.descriptor.DocumentHandler.DOCUMENT))
+    self.assertEqual(1, len(descriptors))
+    self.assertTrue(isinstance(descriptors[0], NetworkStatusDocumentV3))
+    self.assertEqual(2, len(descriptors[0].routers))
+
+    descriptors = list(stem.descriptor.parse_file(StringIO.StringIO(content), 'network-status-consensus-3 1.0', document_handler = stem.descriptor.DocumentHandler.BARE_DOCUMENT))
+    self.assertEqual(1, len(descriptors))
+    self.assertTrue(isinstance(descriptors[0], NetworkStatusDocumentV3))
+    self.assertEqual(0, len(descriptors[0].routers))
+
   def test_parse_file(self):
     """
     Try parsing a document via the _parse_file() function.



More information about the tor-commits mailing list