commit 5818640e54542ff079b0a9328f82bb0c5897bc9a Author: Damian Johnson atagar@torproject.org Date: Thu Jan 17 09:28:56 2013 -0800
Only providing a single parse_file() function
Ok, our interface for parsing descriptors was damn confusing. Users had three options...
* stem.descriptor.reader
This was the method I had intended for users to always read descriptor files with, but folks keep finding parse_file() first...
* stem.descriptor.parse_file()
I didn't intend for this to be used by our users (I made it to support the DescriptorReader). However, it's pretty convenient for reading individual descriptor files.
* stem.descriptor.*.parse_file()
The parse_file() in individual descriptor classes definitely weren't intended for external users. They were made to support stem.descriptor.parse_file(), but turned out to be used most of all...
I'm making the last group private to discourage their usage and changing stem.descriptor.parse_file() to be both more capable and user friendly...
* users can now explicitly give it a descriptor type, as per the metrics types (https://metrics.torproject.org/formats.html#descriptortypes)
* the path is now optional (it was only required because I expected this to be used by the DescriptorReader)
* ... and better pydocs
This breaks backward compatibility, but should greatly help our usability going forward. Thanks to Aaron for pointing out the pain points. --- stem/control.py | 4 +- stem/descriptor/__init__.py | 62 ++++++++++++++++----- stem/descriptor/extrainfo_descriptor.py | 3 +- stem/descriptor/networkstatus.py | 28 ++++----- stem/descriptor/reader.py | 4 +- stem/descriptor/router_status_entry.py | 2 +- stem/descriptor/server_descriptor.py | 3 +- stem/response/events.py | 4 +- test/integ/descriptor/extrainfo_descriptor.py | 2 +- test/integ/descriptor/networkstatus.py | 15 +++-- test/integ/descriptor/server_descriptor.py | 2 +- test/settings.cfg | 2 +- test/unit/descriptor/networkstatus/document_v3.py | 10 ++-- test/unit/descriptor/server_descriptor.py | 4 +- 14 files changed, 87 insertions(+), 58 deletions(-)
diff --git a/stem/control.py b/stem/control.py index 10bc667..71339b2 100644 --- a/stem/control.py +++ b/stem/control.py @@ -1020,7 +1020,7 @@ class Controller(BaseController):
desc_content = self.get_info("desc/all-recent")
- for desc in stem.descriptor.server_descriptor.parse_file(StringIO.StringIO(desc_content)): + for desc in stem.descriptor.server_descriptor._parse_file(StringIO.StringIO(desc_content)): yield desc except Exception, exc: if default == UNDEFINED: @@ -1087,7 +1087,7 @@ class Controller(BaseController):
desc_content = self.get_info("ns/all")
- desc_iterator = stem.descriptor.router_status_entry.parse_file( + desc_iterator = stem.descriptor.router_status_entry._parse_file( StringIO.StringIO(desc_content), True, entry_class = stem.descriptor.router_status_entry.RouterStatusEntryV2, diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py index aabf37e..3a0f46e 100644 --- a/stem/descriptor/__init__.py +++ b/stem/descriptor/__init__.py @@ -5,7 +5,8 @@ Package for parsing and processing descriptor data.
::
- parse_file - Iterates over the descriptors in a file. + parse_file - Parses the descriptors in a file. + Descriptor - Common parent for all descriptor file types. |- get_path - location of the descriptor on disk if it came from a file |- get_unrecognized_lines - unparsed descriptor content @@ -39,12 +40,27 @@ PGP_BLOCK_START = re.compile("^-----BEGIN ([%s%s]+)-----$" % (KEYWORD_CHAR, WHIT PGP_BLOCK_END = "-----END %s-----"
-def parse_file(path, descriptor_file): +def parse_file(descriptor_file, descriptor_type = None, path = None): """ - Provides an iterator for the descriptors within a given file. + Simple function to read the descriptor contents from a file, providing an + iterator for its :class:`~stem.descriptor.Descriptor` contents. + + If you don't provide a **descriptor_type** argument then this automatically + tries to determine the descriptor type based on the following... + + * The filename if it matches something from tor's data directory. For + instance, tor's 'cached-descriptors' contains server descriptors. + + * The @type annotation on the first line. These are generally only found in + the descriptor archives from 'https://metrics.torproject.org'. + + This is a handy function for simple usage, but if you're reading multiple + descriptor files you might want to consider the + :class:`~stem.descriptor.reader.DescriptorReader`.
- :param str path: absolute path to the file's location on disk :param file descriptor_file: opened file with the descriptor contents + :param tuple descriptor_type: tuple of the form **(type, major_version, minor_version)** as per `https://metrics.torproject.org/formats.html#descriptortypes%60_ + :param str path: absolute path to the file's location on disk
:returns: iterator for :class:`stem.descriptor.Descriptor` instances in the file
@@ -65,16 +81,30 @@ def parse_file(path, descriptor_file):
# Cached descriptor handling. These contain multiple descriptors per file.
- filename, file_parser = os.path.basename(path), None + filename = '<undefined>' if path is None else os.path.basename(path) + file_parser = None + + if descriptor_type is not None: + if len(descriptor_type) != 3: + raise ValueError("The descriptor_type must be a tuple of the form (type, major_version, minor_version)") + + desc_type, major_version, minor_version = descriptor_type
- if filename == "cached-descriptors": - file_parser = stem.descriptor.server_descriptor.parse_file + try: + major_version = int(major_version) + minor_version = int(minor_version) + except ValueError: + raise ValueError("The descriptor_type's major and minor versions must be integers") + + file_parser = lambda f: _parse_metrics_file(desc_type, major_version, minor_version, f) + elif filename == "cached-descriptors": + file_parser = stem.descriptor.server_descriptor._parse_file elif filename == "cached-extrainfo": - file_parser = stem.descriptor.extrainfo_descriptor.parse_file + file_parser = stem.descriptor.extrainfo_descriptor._parse_file elif filename == "cached-consensus": - file_parser = stem.descriptor.networkstatus.parse_file + file_parser = stem.descriptor.networkstatus._parse_file elif filename == "cached-microdesc-consensus": - file_parser = lambda f: stem.descriptor.networkstatus.parse_file(f, is_microdescriptor = True) + file_parser = lambda f: stem.descriptor.networkstatus._parse_file(f, is_microdescriptor = True) else: # Metrics descriptor handling first_line, desc = descriptor_file.readline().strip(), None @@ -86,7 +116,9 @@ def parse_file(path, descriptor_file):
if file_parser: for desc in file_parser(descriptor_file): - desc._set_path(path) + if path is not None: + desc._set_path(path) + yield desc
return @@ -117,22 +149,22 @@ def _parse_metrics_file(descriptor_type, major_version, minor_version, descripto elif descriptor_type == "network-status-2" and major_version == 1: document_type = stem.descriptor.networkstatus.NetworkStatusDocumentV2
- for desc in stem.descriptor.networkstatus.parse_file(descriptor_file, document_type): + for desc in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type): yield desc elif descriptor_type in ("network-status-consensus-3", "network-status-vote-3") and major_version == 1: document_type = stem.descriptor.networkstatus.NetworkStatusDocumentV3
- for desc in stem.descriptor.networkstatus.parse_file(descriptor_file, document_type): + for desc in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type): yield desc elif descriptor_type == "network-status-microdesc-consensus-3" and major_version == 1: document_type = stem.descriptor.networkstatus.NetworkStatusDocumentV3
- for desc in stem.descriptor.networkstatus.parse_file(descriptor_file, document_type, is_microdescriptor = True): + for desc in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type, is_microdescriptor = True): yield desc elif descriptor_type == "bridge-network-status" and major_version == 1: document_type = stem.descriptor.networkstatus.BridgeNetworkStatusDocument
- for desc in stem.descriptor.networkstatus.parse_file(descriptor_file, document_type): + for desc in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type): yield desc else: raise TypeError("Unrecognized metrics descriptor format. type: '%s', version: '%i.%i'" % (descriptor_type, major_version, minor_version)) diff --git a/stem/descriptor/extrainfo_descriptor.py b/stem/descriptor/extrainfo_descriptor.py index 4b98c68..de6b5c9 100644 --- a/stem/descriptor/extrainfo_descriptor.py +++ b/stem/descriptor/extrainfo_descriptor.py @@ -23,7 +23,6 @@ Extra-info descriptors are available from a few sources...
::
- parse_file - Iterates over the extra-info descriptors in a file. ExtraInfoDescriptor - Tor extra-info descriptor. | |- RelayExtraInfoDescriptor - Extra-info descriptor for a relay. | +- BridgeExtraInfoDescriptor - Extra-info descriptor for a bridge. @@ -134,7 +133,7 @@ SINGLE_FIELDS = ( )
-def parse_file(descriptor_file, validate = True): +def _parse_file(descriptor_file, validate = True): """ Iterates over the extra-info descriptors in a file.
diff --git a/stem/descriptor/networkstatus.py b/stem/descriptor/networkstatus.py index ef7e4a5..2b80f85 100644 --- a/stem/descriptor/networkstatus.py +++ b/stem/descriptor/networkstatus.py @@ -36,30 +36,28 @@ constructor. Router entries are assigned to its 'routers' attribute... for router in consensus.routers: print router.nickname
-* :func:`stem.descriptor.networkstatus.parse_file` +* :func:`stem.descriptor.parse_file`
-Alternatively, the :func:`~stem.descriptor.networkstatus.parse_file` function -provides an iterator for a document's routers. Those routers refer to a 'thin' -document, which doesn't have a 'routers' attribute. This allows for lower -memory usage and upfront runtime. +Alternatively, the :func:`~stem.descriptor.parse_file` function provides an +iterator for a document's routers. Those routers refer to a 'thin' document, +which doesn't have a 'routers' attribute. This allows for lower memory usage +and upfront runtime.
::
- from stem.descriptor.networkstatus import parse_file + from stem.descriptor import parse_file
with open('.tor/cached-consensus', 'r') as consensus_file: # Processes the routers as we read them in. The routers refer to a document # with an unset 'routers' attribute.
- for router in parse_file(consensus_file): + for router in parse_file(consensus_file, ('network-status-consensus-3', 1, 0)): print router.nickname
**Module Overview:**
::
- parse_file - parses a network status file, providing an iterator for its routers - NetworkStatusDocument - Network status document |- NetworkStatusDocumentV2 - Version 2 network status document +- NetworkStatusDocumentV3 - Version 3 network status document @@ -166,7 +164,7 @@ BANDWIDTH_WEIGHT_ENTRIES = ( )
-def parse_file(document_file, document_type = None, validate = True, is_microdescriptor = False): +def _parse_file(document_file, document_type = None, validate = True, is_microdescriptor = False): """ Parses a network status and iterates over the RouterStatusEntry in it. The document that these instances reference have an empty 'routers' attribute to @@ -217,7 +215,7 @@ def parse_file(document_file, document_type = None, validate = True, is_microdes else: raise ValueError("Document type %i isn't recognized (only able to parse v2, v3, and bridge)" % document_type)
- desc_iterator = stem.descriptor.router_status_entry.parse_file( + desc_iterator = stem.descriptor.router_status_entry._parse_file( document_file, validate, entry_class = router_type, @@ -300,7 +298,7 @@ class NetworkStatusDocumentV2(NetworkStatusDocument): document_file = StringIO.StringIO(raw_content) document_content = "".join(stem.descriptor._read_until_keywords((ROUTERS_START, V2_FOOTER_START), document_file))
- self.routers = tuple(stem.descriptor.router_status_entry.parse_file( + self.routers = tuple(stem.descriptor.router_status_entry._parse_file( document_file, validate, entry_class = stem.descriptor.router_status_entry.RouterStatusEntryV2, @@ -483,7 +481,7 @@ class NetworkStatusDocumentV3(NetworkStatusDocument): else: self._unrecognized_lines += value
- self.directory_authorities = tuple(stem.descriptor.router_status_entry.parse_file( + self.directory_authorities = tuple(stem.descriptor.router_status_entry._parse_file( document_file, validate, entry_class = DirectoryAuthority, @@ -497,7 +495,7 @@ class NetworkStatusDocumentV3(NetworkStatusDocument): else: router_type = stem.descriptor.router_status_entry.RouterStatusEntryMicroV3
- self.routers = tuple(stem.descriptor.router_status_entry.parse_file( + self.routers = tuple(stem.descriptor.router_status_entry._parse_file( document_file, validate, entry_class = router_type, @@ -1365,7 +1363,7 @@ class BridgeNetworkStatusDocument(NetworkStatusDocument): elif validate: raise ValueError("Bridge network status documents must start with a 'published' line:\n%s" % raw_content)
- self.routers = tuple(stem.descriptor.router_status_entry.parse_file( + self.routers = tuple(stem.descriptor.router_status_entry._parse_file( document_file, validate, entry_class = stem.descriptor.router_status_entry.RouterStatusEntryV3, diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py index d3e9d2a..343b15e 100644 --- a/stem/descriptor/reader.py +++ b/stem/descriptor/reader.py @@ -505,7 +505,7 @@ class DescriptorReader(object): try: self._notify_read_listeners(target) with open(target) as target_file: - for desc in stem.descriptor.parse_file(target, target_file): + for desc in stem.descriptor.parse_file(target_file, path = target): if self._is_stopped.isSet(): return
@@ -533,7 +533,7 @@ class DescriptorReader(object): if tar_entry.isfile(): entry = tar_file.extractfile(tar_entry)
- for desc in stem.descriptor.parse_file(target, entry): + for desc in stem.descriptor.parse_file(entry, path = target): if self._is_stopped.isSet(): return
diff --git a/stem/descriptor/router_status_entry.py b/stem/descriptor/router_status_entry.py index aca72ed..94a91c2 100644 --- a/stem/descriptor/router_status_entry.py +++ b/stem/descriptor/router_status_entry.py @@ -23,7 +23,7 @@ import stem.descriptor import stem.exit_policy
-def parse_file(document_file, validate, entry_class, entry_keyword = "r", start_position = None, end_position = None, section_end_keywords = (), extra_args = ()): +def _parse_file(document_file, validate, entry_class, entry_keyword = "r", start_position = None, end_position = None, section_end_keywords = (), extra_args = ()): """ Reads a range of the document_file containing some number of entry_class instances. We deliminate the entry_class entries by the keyword on their diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py index b574ea0..1a10dd1 100644 --- a/stem/descriptor/server_descriptor.py +++ b/stem/descriptor/server_descriptor.py @@ -12,7 +12,6 @@ etc). This information is provided from a few sources...
::
- parse_file - Iterates over the server descriptors in a file. ServerDescriptor - Tor server descriptor. |- RelayDescriptor - Server descriptor for a relay. | +- is_valid - checks the signature against the descriptor content @@ -72,7 +71,7 @@ SINGLE_FIELDS = ( )
-def parse_file(descriptor_file, validate = True): +def _parse_file(descriptor_file, validate = True): """ Iterates over the server descriptors in a file.
diff --git a/stem/response/events.py b/stem/response/events.py index 4b199ce..563b685 100644 --- a/stem/response/events.py +++ b/stem/response/events.py @@ -566,7 +566,7 @@ class NetworkStatusEvent(Event): def _parse(self): content = str(self).lstrip("NS\n")
- self.desc = list(stem.descriptor.router_status_entry.parse_file( + self.desc = list(stem.descriptor.router_status_entry._parse_file( StringIO.StringIO(content), True, entry_class = stem.descriptor.router_status_entry.RouterStatusEntryV3, @@ -591,7 +591,7 @@ class NewConsensusEvent(Event): def _parse(self): content = str(self).lstrip("NEWCONSENSUS\n")
- self.desc = list(stem.descriptor.router_status_entry.parse_file( + self.desc = list(stem.descriptor.router_status_entry._parse_file( StringIO.StringIO(content), True, entry_class = stem.descriptor.router_status_entry.RouterStatusEntryV3, diff --git a/test/integ/descriptor/extrainfo_descriptor.py b/test/integ/descriptor/extrainfo_descriptor.py index 87776a5..5fc9b5a 100644 --- a/test/integ/descriptor/extrainfo_descriptor.py +++ b/test/integ/descriptor/extrainfo_descriptor.py @@ -151,7 +151,7 @@ k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw return
with open(descriptor_path) as descriptor_file: - for desc in stem.descriptor.extrainfo_descriptor.parse_file(descriptor_file): + for desc in stem.descriptor.extrainfo_descriptor._parse_file(descriptor_file): unrecognized_lines = desc.get_unrecognized_lines()
if desc.dir_v2_responses_unknown: diff --git a/test/integ/descriptor/networkstatus.py b/test/integ/descriptor/networkstatus.py index c5a5d77..e34960a 100644 --- a/test/integ/descriptor/networkstatus.py +++ b/test/integ/descriptor/networkstatus.py @@ -9,6 +9,7 @@ import os import resource import unittest
+import stem import stem.descriptor import stem.descriptor.networkstatus import stem.version @@ -42,7 +43,7 @@ class TestNetworkStatus(unittest.TestCase): with open(consensus_path) as descriptor_file: document_type = stem.descriptor.networkstatus.NetworkStatusDocumentV3
- for router in stem.descriptor.networkstatus.parse_file(descriptor_file, document_type): + for router in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type): count += 1
# We should have constant memory usage. Fail if we're using over 200 MB. @@ -53,7 +54,7 @@ class TestNetworkStatus(unittest.TestCase): # TODO: this should be a 'new capability' check later rather than # failing the tests for flag in router.flags: - if not flag in stem.descriptor.Flag: + if not flag in stem.Flag: raise ValueError("Unrecognized flag type: %s, found on relay %s (%s)" % (flag, router.fingerprint, router.nickname))
unrecognized_lines = router.get_unrecognized_lines() @@ -89,7 +90,7 @@ class TestNetworkStatus(unittest.TestCase): with open(consensus_path) as descriptor_file: document_type = stem.descriptor.networkstatus.NetworkStatusDocumentV3
- for router in stem.descriptor.networkstatus.parse_file(descriptor_file, document_type, is_microdescriptor = True): + for router in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type, is_microdescriptor = True): count += 1
if resource.getrusage(resource.RUSAGE_SELF).ru_maxrss > 200000: @@ -99,7 +100,7 @@ class TestNetworkStatus(unittest.TestCase): # TODO: this should be a 'new capability' check later rather than # failing the tests for flag in router.flags: - if not flag in stem.descriptor.Flag: + if not flag in stem.Flag: raise ValueError("Unrecognized flag type: %s, found on microdescriptor relay %s (%s)" % (flag, router.fingerprint, router.nickname))
unrecognized_lines = router.get_unrecognized_lines() @@ -117,7 +118,7 @@ class TestNetworkStatus(unittest.TestCase): consensus_path = test.integ.descriptor.get_resource("metrics_consensus")
with open(consensus_path) as descriptor_file: - descriptors = stem.descriptor.parse_file(consensus_path, descriptor_file) + descriptors = stem.descriptor.parse_file(descriptor_file, path = consensus_path)
router = next(descriptors) self.assertEquals("sumkledi", router.nickname) @@ -136,7 +137,7 @@ class TestNetworkStatus(unittest.TestCase): consensus_path = test.integ.descriptor.get_resource("bridge_network_status")
with open(consensus_path) as descriptor_file: - descriptors = stem.descriptor.parse_file(consensus_path, descriptor_file) + descriptors = stem.descriptor.parse_file(descriptor_file, path = consensus_path)
router = next(descriptors) self.assertEquals("Unnamed", router.nickname) @@ -327,7 +328,7 @@ TpQQk3nNQF8z6UIvdlvP+DnJV4izWVkQEZgUZgIVM0E= vote_path = test.integ.descriptor.get_resource("metrics_vote")
with open(vote_path) as descriptor_file: - descriptors = stem.descriptor.parse_file(vote_path, descriptor_file) + descriptors = stem.descriptor.parse_file(descriptor_file, path = vote_path)
router = next(descriptors) self.assertEquals("sumkledi", router.nickname) diff --git a/test/integ/descriptor/server_descriptor.py b/test/integ/descriptor/server_descriptor.py index e8657cc..dd7dcd7 100644 --- a/test/integ/descriptor/server_descriptor.py +++ b/test/integ/descriptor/server_descriptor.py @@ -159,7 +159,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4= return
with open(descriptor_path) as descriptor_file: - for desc in stem.descriptor.server_descriptor.parse_file(descriptor_file): + for desc in stem.descriptor.server_descriptor._parse_file(descriptor_file): # the following attributes should be deprecated, and not appear in the wild self.assertEquals(None, desc.read_history_end) self.assertEquals(None, desc.write_history_end) diff --git a/test/settings.cfg b/test/settings.cfg index 561c0c8..ebaf6c0 100644 --- a/test/settings.cfg +++ b/test/settings.cfg @@ -156,6 +156,6 @@ target.torrc RUN_PTRACE => PORT, PTRACE pyflakes.ignore stem/prereq.py => 'RSA' imported but unused pyflakes.ignore stem/prereq.py => 'asn1' imported but unused pyflakes.ignore stem/prereq.py => 'long_to_bytes' imported but unused -pyflakes.ignore stem/descriptor/__init__.py => redefinition of unused 'OrderedDict' from line 31 +pyflakes.ignore stem/descriptor/__init__.py => redefinition of unused 'OrderedDict' from line 32 pyflakes.ignore test/unit/response/events.py => 'from stem import *' used; unable to detect undefined names
diff --git a/test/unit/descriptor/networkstatus/document_v3.py b/test/unit/descriptor/networkstatus/document_v3.py index c68549c..c4180d7 100644 --- a/test/unit/descriptor/networkstatus/document_v3.py +++ b/test/unit/descriptor/networkstatus/document_v3.py @@ -18,7 +18,7 @@ from stem.descriptor.networkstatus import HEADER_STATUS_DOCUMENT_FIELDS, \ BANDWIDTH_WEIGHT_ENTRIES, \ DirectoryAuthority, \ NetworkStatusDocumentV3, \ - parse_file + _parse_file
from stem.descriptor.router_status_entry import \ RouterStatusEntryV3, \ @@ -123,15 +123,15 @@ class TestNetworkStatusDocument(unittest.TestCase): for router in consensus.routers: self.assertEqual('caerSidi', router.nickname)
- # second example: using parse_file + # second example: using _parse_file
with support_with(StringIO.StringIO(content)) as consensus_file: - for router in parse_file(consensus_file): + for router in _parse_file(consensus_file): self.assertEqual('caerSidi', router.nickname)
def test_parse_file(self): """ - Try parsing a document via the parse_file() function. + Try parsing a document via the _parse_file() function. """
entry1 = get_router_status_entry_v3({'s': "Fast"}) @@ -144,7 +144,7 @@ class TestNetworkStatusDocument(unittest.TestCase): expected_document = get_network_status_document_v3()
descriptor_file = StringIO.StringIO(content) - entries = list(parse_file(descriptor_file)) + entries = list(_parse_file(descriptor_file))
self.assertEquals(entry1, entries[0]) self.assertEquals(entry2, entries[1]) diff --git a/test/unit/descriptor/server_descriptor.py b/test/unit/descriptor/server_descriptor.py index 3ea4c5e..00fd96f 100644 --- a/test/unit/descriptor/server_descriptor.py +++ b/test/unit/descriptor/server_descriptor.py @@ -208,8 +208,8 @@ class TestServerDescriptor(unittest.TestCase): desc_text = sign_descriptor_content(desc_text) desc_text += "\ntrailing text that should be ignored, ho hum"
- # running parse_file should provide an iterator with a single descriptor - desc_iter = stem.descriptor.server_descriptor.parse_file(StringIO.StringIO(desc_text)) + # running _parse_file should provide an iterator with a single descriptor + desc_iter = stem.descriptor.server_descriptor._parse_file(StringIO.StringIO(desc_text)) desc_entries = list(desc_iter) self.assertEquals(1, len(desc_entries)) desc = desc_entries[0]
tor-commits@lists.torproject.org