commit 6c3717b65acc9d208ef3bf90b5b54f3983e507df Author: Ravi Chandra Padmala neenaoffline@gmail.com Date: Wed Aug 15 20:47:29 2012 +0530
Fixes to document parsing
One major change is that stem.descriptor.networkstatus.parse_file now returns a NetworkStatusDocument object instead of iterating over the router descriptors in the document --- stem/descriptor/__init__.py | 119 +++++++++++++++----------------- stem/descriptor/networkstatus.py | 18 +++--- test/integ/descriptor/networkstatus.py | 27 +------ 3 files changed, 68 insertions(+), 96 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py index d9ac21b..6563e4b 100644 --- a/stem/descriptor/__init__.py +++ b/stem/descriptor/__init__.py @@ -65,9 +65,17 @@ def parse_file(path, descriptor_file): elif filename == "cached-extrainfo": file_parser = stem.descriptor.extrainfo_descriptor.parse_file elif filename == "cached-consensus": - file_parser = stem.descriptor.networkstatus.parse_file + file_parser = lambda f: [stem.descriptor.networkstatus.parse_file(f)] elif filename == "cached-microdesc-consensus": - file_parser = lambda f: stem.descriptor.networkstatus.parse_file(f, True, "microdesc") + file_parser = lambda f: [stem.descriptor.networkstatus.parse_file(f, True, "microdesc")] + else: + # Metrics descriptor handling + first_line, desc = descriptor_file.readline().strip(), None + metrics_header_match = re.match("^@type (\S+) (\d+).(\d+)$", first_line) + + if metrics_header_match: + desc_type, major_version, minor_version = metrics_header_match.groups() + file_parser = lambda f: _parse_metrics_file(desc_type, int(major_version), int(minor_version), f)
if file_parser: for desc in file_parser(descriptor_file): @@ -76,47 +84,33 @@ def parse_file(path, descriptor_file):
return
- # Metrics descriptor handling. These contain a single descriptor per file. - - first_line, desc = descriptor_file.readline().strip(), None - metrics_header_match = re.match("^@type (\S+) (\d+).(\d+)$", first_line) - - if metrics_header_match: - # still doesn't necessarily mean that this is a descriptor, check if the - # header contents are recognized - - desc_type, major_version, minor_version = metrics_header_match.groups() - major_version, minor_version = int(major_version), int(minor_version) - - if desc_type == "server-descriptor" and major_version == 1: - desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_file.read()) - elif desc_type == "bridge-server-descriptor" and major_version == 1: - desc = stem.descriptor.server_descriptor.BridgeDescriptor(descriptor_file.read()) - elif desc_type == "extra-info" and major_version == 1: - desc = stem.descriptor.extrainfo_descriptor.RelayExtraInfoDescriptor(descriptor_file.read()) - elif desc_type == "bridge-extra-info" and major_version == 1: - # version 1.1 introduced a 'transport' field... - # https://trac.torproject.org/6257 - - desc = stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor(descriptor_file.read()) - elif desc_type in ("network-status-consensus-3", "network-status-vote-3") and major_version == 1: - desc = stem.descriptor.networkstatus.NetworkStatusDocument(descriptor_file.read()) - for desc in desc.router_descriptors: - desc._set_path(path) - yield desc - return - elif desc_type == "network-status-microdesc-consensus-3" and major_version == 1: - desc = stem.descriptor.networkstatus.MicrodescriptorConsensus(descriptor_file.read()) - - if desc: - desc._set_path(path) - yield desc - return - # Not recognized as a descriptor file.
raise TypeError("Unable to determine the descriptor's type. filename: '%s', first line: '%s'" % (filename, first_line))
+def _parse_metrics_file(descriptor_type, major_version, minor_version, descriptor_file): + # Parses descriptor files from metrics, yielding individual descriptors. This + # throws a TypeError if the descriptor_type or version isn't recognized. + import stem.descriptor.server_descriptor + import stem.descriptor.extrainfo_descriptor + import stem.descriptor.networkstatus + + if descriptor_type == "server-descriptor" and major_version == 1: + yield stem.descriptor.server_descriptor.RelayDescriptor(descriptor_file.read()) + elif descriptor_type == "bridge-server-descriptor" and major_version == 1: + yield stem.descriptor.server_descriptor.BridgeDescriptor(descriptor_file.read()) + elif descriptor_type == "extra-info" and major_version == 1: + yield stem.descriptor.extrainfo_descriptor.RelayExtraInfoDescriptor(descriptor_file.read()) + elif descriptor_type == "bridge-extra-info" and major_version == 1: + # version 1.1 introduced a 'transport' field... + # https://trac.torproject.org/6257 + + yield stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor(descriptor_file.read()) + elif descriptor_type in ("network-status-consensus-3", "network-status-vote-3") and major_version == 1: + yield stem.descriptor.networkstatus.parse_file(descriptor_file) + elif descriptor_type == "network-status-microdesc-consensus-3" and major_version == 1: + yield stem.descriptor.networkstatus.parse_file(descriptor_file, flavour = "microdesc") + class Descriptor(object): """ Common parent for all types of descriptors. @@ -177,19 +171,13 @@ def _peek_keyword(descriptor_file): :returns: keyword at the current offset of descriptor_file """
- last_position = descriptor_file.tell() - line = descriptor_file.readline() - if not line: return None - - if " " in line: - keyword = line.split(" ", 1)[0] - if keyword == "opt": - keyword = line.split(" ", 2)[1] - else: keyword = line.strip() + line = _peek_line(descriptor_file)
- descriptor_file.seek(last_position) + if line.startswith("opt "): + line = line[4:] + if not line: return None
- return keyword + return line.split(" ", 1)[0].rstrip("\n")
def _read_keyword_line(keyword, descriptor_file, validate = True, optional = False): """ @@ -200,8 +188,9 @@ def _read_keyword_line(keyword, descriptor_file, validate = True, optional = Fal Respects the opt keyword and returns the next keyword if the first is "opt".
:param str keyword: keyword the line must begin with - :param bool optional: if the current line must begin with the given keyword + :param file descriptor_file: file/file-like object containing descriptor data :param bool validate: validation is enabled + :param bool optional: if the current line must begin with the given keyword
:returns: the text after the keyword if the keyword matches the one provided, otherwise returns None or raises an exception
@@ -214,13 +203,14 @@ def _read_keyword_line(keyword, descriptor_file, validate = True, optional = Fal raise ValueError("Unexpected end of document") return None
- if line_matches_keyword(keyword, line): - line = descriptor_file.readline() - - if line == "opt " + keyword or line == keyword: return "" - elif line.startswith("opt "): return line.split(" ", 2)[2].rstrip("\n") - else: return line.split(" ", 1)[1].rstrip("\n") - elif line.startswith("opt"): + opt_line = False + if line.startswith("opt "): + line = line[4:] + opt_line = True + if re.match("^" + re.escape(keyword) + "($| )", line): + descriptor_file.readline() + return line[len(keyword):].strip() + elif opt_line and not optional: # if this is something new we don't recognize # ignore it and go to the next line descriptor_file.readline() @@ -239,8 +229,8 @@ def _read_keyword_line_str(keyword, lines, validate = True, optional = False):
:param str keyword: keyword the line must begin with :param list lines: list of strings to be read from - :param bool optional: if the current line must begin with the given keyword :param bool validate: validation is enabled + :param bool optional: if the current line must begin with the given keyword
:returns: the text after the keyword if the keyword matches the one provided, otherwise returns None or raises an exception
@@ -252,16 +242,17 @@ def _read_keyword_line_str(keyword, lines, validate = True, optional = False): raise ValueError("Unexpected end of document") return
+ opt_line = False + if lines[0].startswith("opt "): + line = line[4:] + opt_line = True if line_matches_keyword(keyword, lines[0]): line = lines.pop(0)
- if line == "opt " + keyword or line == keyword: return "" - elif line.startswith("opt "): return line.split(" ", 2)[2] - else: return line.split(" ", 1)[1] - elif line.startswith("opt "): + return line[len(keyword):].strip() + elif opt_line and not optional: # if this is something new we don't recognize yet # ignore it and go to the next line - lines.pop(0) return _read_keyword_line_str(keyword, lines, optional) elif not optional and validate: raise ValueError("Error parsing network status document: Expected %s, received: %s" % (keyword, lines[0])) diff --git a/stem/descriptor/networkstatus.py b/stem/descriptor/networkstatus.py index 7effc7e..f9d89a8 100644 --- a/stem/descriptor/networkstatus.py +++ b/stem/descriptor/networkstatus.py @@ -21,7 +21,7 @@ The documents can be obtained from any of the following sources...
nsdoc_file = open("/home/neena/.tor/cached-consensus") try: - consensus = stem.descriptor.networkstatus.NetworkStatusDocument(nsdoc_file.read()) + consensus = stem.descriptor.networkstatus.parse_file(nsdoc_file) except ValueError: print "Invalid cached-consensus file"
@@ -33,7 +33,9 @@ The documents can be obtained from any of the following sources...
parse_file - parses a network status file and provides a NetworkStatusDocument NetworkStatusDocument - Tor v3 network status document + +- MicrodescriptorConsensus - Microdescriptor flavoured consensus documents RouterDescriptor - Router descriptor; contains information about a Tor relay + +- RouterMicrodescriptor - Router microdescriptor; contains information that doesn't change frequently DirectorySignature - Network status document's directory signature DirectoryAuthority - Directory authority defined in a v3 network status document """ @@ -63,7 +65,7 @@ Flavour = stem.util.enum.Enum( ("NONE", ""), ("NS", "ns"), ("MICRODESCRIPTOR", "microdesc"), - ) +)
Flag = stem.util.enum.Enum( ("AUTHORITY", "Authority"), @@ -78,18 +80,16 @@ Flag = stem.util.enum.Enum( ("UNNAMED", "Unnamed"), ("V2DIR", "V2Dir"), ("VALID", "Valid"), - ) - -Flag = stem.util.enum.Enum(*[(flag.upper(), flag) for flag in ["Authority", "BadExit", "Exit", "Fast", "Guard", "HSDir", "Named", "Running", "Stable", "Unnamed", "V2Dir", "Valid"]]) +)
def parse_file(document_file, validate = True, flavour = Flavour.NONE): """ - Iterates over the router descriptors in a network status document. + Parses a network status document and provides a NetworkStatusDocument object.
:param file document_file: file with network status document content :param bool validate: checks the validity of the document's contents if True, skips these checks otherwise
- :returns: iterator for :class:`stem.descriptor.networkstatus.RouterDescriptor` instances in the file + :returns: :class:`stem.descriptor.networkstatus.NetworkStatusDocument` object
:raises: * ValueError if the contents is malformed and validate is True @@ -109,12 +109,12 @@ def parse_file(document_file, validate = True, flavour = Flavour.NONE): document = NetworkStatusDocument(document_data, validate) document_file.seek(r_offset) document.router_descriptors = _ns_router_desc_generator(document_file, document.vote_status == "vote", validate) - yield document + return document elif flavour == Flavour.MICRODESCRIPTOR: document = MicrodescriptorConsensus(document_data, validate) document_file.seek(r_offset) document.router_descriptors = _router_microdesc_generator(document_file, validate, document.known_flags) - yield document + return document
def _ns_router_desc_generator(document_file, vote, validate): while _peek_keyword(document_file) == "r": diff --git a/test/integ/descriptor/networkstatus.py b/test/integ/descriptor/networkstatus.py index 484e67d..bd326ad 100644 --- a/test/integ/descriptor/networkstatus.py +++ b/test/integ/descriptor/networkstatus.py @@ -39,7 +39,7 @@ class TestNetworkStatusDocument(unittest.TestCase):
count = 0 with open(descriptor_path) as descriptor_file: - for desc in stem.descriptor.networkstatus.parse_file(descriptor_file): + for desc in stem.descriptor.networkstatus.parse_file(descriptor_file).router_descriptors: if resource.getrusage(resource.RUSAGE_SELF).ru_maxrss > 200000: # if we're using > 200 MB we should fail self.fail() @@ -58,7 +58,7 @@ class TestNetworkStatusDocument(unittest.TestCase): with file(descriptor_path) as descriptor_file: desc = stem.descriptor.parse_file(descriptor_path, descriptor_file)
- router = next(desc) + router = next(next(desc).router_descriptors) self.assertEquals("sumkledi", router.nickname) self.assertEquals("ABPSI4nNUNC3hKPkBhyzHozozrU", router.identity) self.assertEquals("8mCr8Sl7RF4ENU4jb0FZFA/3do8", router.digest) @@ -150,7 +150,7 @@ I/TJmV928na7RLZe2mGHCAW3VQOvV+QkCfj05VZ8CsY= with file(descriptor_path) as descriptor_file: desc = stem.descriptor.parse_file(descriptor_path, descriptor_file)
- router = next(desc) + router = next(next(desc).router_descriptors) self.assertEquals("sumkledi", router.nickname) self.assertEquals("ABPSI4nNUNC3hKPkBhyzHozozrU", router.identity) self.assertEquals("B5n4BiALAF8B5AqafxohyYiuj7E", router.digest) @@ -273,28 +273,9 @@ class TestMicrodescriptorConsensus(unittest.TestCase):
count = 0 with open(descriptor_path) as descriptor_file: - for desc in next(stem.descriptor.networkstatus.parse_file(descriptor_file, True, flavour = Flavour.MICRODESCRIPTOR)).router_descriptors: + for desc in stem.descriptor.networkstatus.parse_file(descriptor_file, True, flavour = Flavour.MICRODESCRIPTOR).router_descriptors: assert desc.nickname # check that the router has a nickname count += 1
assert count > 100 # sanity check - assuming atleast 100 relays in the consensus - - def test_metrics_microdesc_consensus(self): - """ - Checks if consensus documents from Metrics are parsed properly. - """ - - descriptor_path = test.integ.descriptor.get_resource("metrics_microdesc_consensus") - - with file(descriptor_path) as descriptor_file: - desc = stem.descriptor.parse_file(descriptor_path, descriptor_file) - - router = next(next(desc).router_descriptors) - self.assertEquals("JapanAnon", router.nickname) - self.assertEquals("AGw/p8P246zRPQ3ZsQx9+pM8I3s", router.identity) - self.assertEquals("9LDw0XiFeLQDXK9t8ht4+MK9tWx6Jxp1RwP36eatRWs", router.digest) - self.assertEquals(_strptime("2012-07-18 15:55:42"), router.publication) - self.assertEquals("220.0.231.71", router.ip) - self.assertEquals(443, router.orport) - self.assertEquals(9030, router.dirport)