commit 13944b062ff26a5efa178ff8d5552ff1d049574b Author: Damian Johnson atagar@torproject.org Date: Sun May 6 20:08:53 2012 -0700
Skeleton for extra-info descriptors
Basic module for parsing extra-info descriptors. This doesn't actually do any parsing yet; it just turns the raw content into an ExtraInfoDescriptor instance. This also moves the shared helper code we'll need out of the server_descriptor module and into stem.descriptor so that both descriptor modules can use it.
What code there is here is exercised via the runner tests. --- stem/descriptor/__init__.py | 158 ++++++++++++++++++++++- stem/descriptor/extrainfo_descriptor.py | 111 ++++++++++++++++ stem/descriptor/server_descriptor.py | 159 +---------------------- test/integ/descriptor/data/extrainfo_descriptor | 12 ++ 4 files changed, 286 insertions(+), 154 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py index 2af1fb0..f3a5983 100644 --- a/stem/descriptor/__init__.py +++ b/stem/descriptor/__init__.py @@ -8,9 +8,16 @@ Descriptor - Common parent for all descriptor file types. +- __str__ - string that the descriptor was made from """
-__all__ = ["descriptor", "reader", "server_descriptor", "parse_file", "Descriptor"] +__all__ = ["descriptor", "reader", "extrainfo_descriptor", "server_descriptor", "parse_file", "Descriptor"]
import os +import re + +KEYWORD_CHAR = "a-zA-Z0-9-" +WHITESPACE = " \t" +KEYWORD_LINE = re.compile("^([%s]+)[%s]*(.*)$" % (KEYWORD_CHAR, WHITESPACE)) +PGP_BLOCK_START = re.compile("^-----BEGIN ([%s%s]+)-----$" % (KEYWORD_CHAR, WHITESPACE)) +PGP_BLOCK_END = "-----END %s-----"
def parse_file(path, descriptor_file): """ @@ -28,6 +35,7 @@ def parse_file(path, descriptor_file): IOError if unable to read from the descriptor_file """
+ import stem.descriptor.extrainfo_descriptor import stem.descriptor.server_descriptor
# The tor descriptor specifications do not provide a reliable method for @@ -44,6 +52,10 @@ def parse_file(path, descriptor_file): for desc in stem.descriptor.server_descriptor.parse_file(descriptor_file): desc._set_path(path) yield desc + elif filename == "cached-extrainfo" or first_line.startswith("extra-info "): + for desc in stem.descriptor.extrainfo_descriptor.parse_file(descriptor_file): + desc._set_path(path) + yield desc else: # unrecognized descriptor type raise TypeError("Unable to determine the descriptor's type. filename: '%s', first line: '%s'" % (filename, first_line)) @@ -85,3 +97,147 @@ class Descriptor: def __str__(self): return self._raw_contents
+def _read_until_keyword(keyword, descriptor_file, inclusive = False): + """ + Reads from the descriptor file until we get to the given keyword or reach the + end of the file. + + Arguments: + keyword (str) - keyword we want to read until + descriptor_file (file) - file with the descriptor content + inclusive (bool) - includes the line with the keyword if True + + Returns: + list with the lines until we find the keyword + """ + + content = [] + + while True: + last_position = descriptor_file.tell() + line = descriptor_file.readline() + if not line: break # EOF + + if " " in line: line_keyword = line.split(" ", 1)[0] + else: line_keyword = line.strip() + + if line_keyword == keyword: + if inclusive: content.append(line) + else: descriptor_file.seek(last_position) + + break + else: + content.append(line) + + return content + +def _get_pseudo_pgp_block(remaining_contents): + """ + Checks if given contents begins with a pseudo-Open-PGP-style block and, if + so, pops it off and provides it back to the caller. + + Arguments: + remaining_contents (list) - lines to be checked for a public key block + + Returns: + str with the armor wrapped contents or None if it doesn't exist + + Raises: + ValueError if the contents starts with a key block but it's malformed (for + instance, if it lacks an ending line) + """ + + if not remaining_contents: + return None # nothing left + + block_match = PGP_BLOCK_START.match(remaining_contents[0]) + + if block_match: + block_type = block_match.groups()[0] + block_lines = [] + + while True: + if not remaining_contents: + raise ValueError("Unterminated pgp style block") + + line = remaining_contents.pop(0) + block_lines.append(line) + + if line == PGP_BLOCK_END % block_type: + return "\n".join(block_lines) + else: + return None + +def _get_descriptor_components(raw_contents, validate, extra_keywords): + """ + Initial breakup of the server descriptor contents to make parsing easier. 
+ + A descriptor contains a series of 'keyword lines' which are simply a keyword + followed by an optional value. Lines can also be followed by a signature + block. + + To get a sublisting with just certain keywords use extra_keywords. This can + be useful if we care about their relative ordering with respect to each + other. For instance, we care about the ordering of 'accept' and 'reject' + entries because this influences the resulting exit policy, but for everything + else in server descriptors the order does not matter. + + Arguments: + raw_contents (str) - descriptor content provided by the relay + validate (bool) - checks the validity of the descriptor's content if + True, skips these checks otherwise + extra_keywords (list) - entity keywords to put into a separate listing with + ordering intact + + Returns: + tuple with the following attributes... + entries (dict) - keyword => (value, pgp key) entries + first_keyword (str) - keyword of the first line + last_keyword (str) - keyword of the last line + extra_entries (list) - lines containing entries matching extra_keywords + """ + + entries = {} + first_keyword = None + last_keyword = None + extra_entries = [] # entries with a keyword in extra_keywords + remaining_lines = raw_contents.split("\n") + + while remaining_lines: + line = remaining_lines.pop(0) + + # last line can be empty + if not line and not remaining_lines: continue + + # Some lines have an 'opt ' for backward compatability. They should be + # ignored. This prefix is being removed in... 
+ # https://trac.torproject.org/projects/tor/ticket/5124 + + if line.startswith("opt "): line = line[4:] + + line_match = KEYWORD_LINE.match(line) + + if not line_match: + if not validate: continue + raise ValueError("Line contains invalid characters: %s" % line) + + keyword, value = line_match.groups() + + if not first_keyword: first_keyword = keyword + last_keyword = keyword + + try: + block_contents = _get_pseudo_pgp_block(remaining_lines) + except ValueError, exc: + if not validate: continue + raise exc + + if keyword in extra_keywords: + extra_entries.append("%s %s" % (keyword, value)) + elif keyword in entries: + entries[keyword].append((value, block_contents)) + else: + entries[keyword] = [(value, block_contents)] + + return entries, first_keyword, last_keyword, extra_entries + diff --git a/stem/descriptor/extrainfo_descriptor.py b/stem/descriptor/extrainfo_descriptor.py new file mode 100644 index 0000000..891ba18 --- /dev/null +++ b/stem/descriptor/extrainfo_descriptor.py @@ -0,0 +1,111 @@ +""" +Parsing for Tor extra-info descriptors. These are published by relays whenever +their server descriptor is published and have a similar format. However, unlike +server descriptors these don't contain information that Tor clients require to +function and as such aren't fetched by default. + +Defined in section 2.2 of the dir-spec, extra-info descriptors contain +interesting but non-vital information such as usage statistics. These documents +cannot be requested of bridges. + +Extra-info descriptors are available from a few sources... + +- if you have 'DownloadExtraInfo 1' in your torrc... + - control port via 'GETINFO extra-info/digest/*' queries + - the 'cached-extrainfo' file in tor's data directory +- tor metrics, at https://metrics.torproject.org/data.html +- directory authorities and mirrors via their DirPort + +parse_file - Iterates over the extra-info descriptors in a file. +ExtraInfoDescriptor - Tor extra-info descriptor. 
+""" + +import stem.descriptor + +def parse_file(descriptor_file, validate = True): + """ + Iterates over the extra-info descriptors in a file. + + Arguments: + descriptor_file (file) - file with descriptor content + validate (bool) - checks the validity of the descriptor's content if + True, skips these checks otherwise + + Returns: + iterator for ExtraInfoDescriptor instances in the file + + Raises: + ValueError if the contents is malformed and validate is True + IOError if the file can't be read + """ + + while True: + extrainfo_content = stem.descriptor._read_until_keyword("router-signature", descriptor_file) + + # we've reached the 'router-signature', now include the pgp style block + block_end_prefix = stem.descriptor.PGP_BLOCK_END.split(' ', 1)[0] + extrainfo_content += stem.descriptor._read_until_keyword(block_end_prefix, descriptor_file, True) + + if extrainfo_content: + yield ExtraInfoDescriptor("".join(extrainfo_content), validate) + else: break # done parsing file + +class ExtraInfoDescriptor(stem.descriptor.Descriptor): + """ + Extra-info descriptor document. 
+ + Attributes: + nickname (str) - relay's nickname (*) + fingerprint (str) - fourty hex digits that make up the relay's fingerprint (*) + published (datetime.datetime) - time in GMT when the descriptor was generated (*) + geoip_db_digest (str) - sha1 of geoIP database file + + read_history (str) - read-history line, always unset + read_history_end (datetime.datetime) - end of the sampling interval + read_history_interval (int) - seconds per interval + read_history_values (list) - bytes read during each interval (*) + + write_history (str) - write-history line, always unset + write_history_end (datetime.datetime) - end of the sampling interval + write_history_interval (int) - seconds per interval + write_history_values (list) - bytes written during each interval (*) + + (*) required fields, others are left as None if undefined + """ + + def __init__(self, raw_contents, validate = True, annotations = None): + """ + Extra-info descriptor constructor, created from a relay's extra-info + content (as provided by "GETINFO extra-info/digest/*", cached contents, and + metrics). + + By default this validates the descriptor's content as it's parsed. This + validation can be disables to either improve performance or be accepting of + malformed data. 
+ + Arguments: + raw_contents (str) - extra-info content provided by the relay + validate (bool) - checks the validity of the extra-info descriptor if + True, skips these checks otherwise + + Raises: + ValueError if the contents is malformed and validate is True + """ + + stem.descriptor.Descriptor.__init__(self, raw_contents) + + self.nickname = None + self.fingerprint = None + self.published = None + self.geoip_db_digest = None + + self.read_history = None + self.read_history_end = None + self.read_history_interval = None + self.read_history_values = [] + + self.write_history = None + self.write_history_end = None + self.write_history_interval = None + self.write_history_values = [] + diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py index 551835d..b87ed2a 100644 --- a/stem/descriptor/server_descriptor.py +++ b/stem/descriptor/server_descriptor.py @@ -33,12 +33,6 @@ import stem.version import stem.util.connection import stem.util.tor_tools
-KEYWORD_CHAR = "a-zA-Z0-9-" -WHITESPACE = " \t" -KEYWORD_LINE = re.compile("^([%s]+)[%s]*(.*)$" % (KEYWORD_CHAR, WHITESPACE)) -PGP_BLOCK_START = re.compile("^-----BEGIN ([%s%s]+)-----$" % (KEYWORD_CHAR, WHITESPACE)) -PGP_BLOCK_END = "-----END %s-----" - # relay descriptors must have exactly one of the following REQUIRED_FIELDS = ( "router", @@ -127,56 +121,21 @@ def parse_file(descriptor_file, validate = True): # to the caller).
while True: - annotations = _read_until_keyword("router", descriptor_file) - descriptor_content = _read_until_keyword("router-signature", descriptor_file) + annotations = stem.descriptor._read_until_keyword("router", descriptor_file) + descriptor_content = stem.descriptor._read_until_keyword("router-signature", descriptor_file)
# we've reached the 'router-signature', now include the pgp style block - block_end_prefix = PGP_BLOCK_END.split(' ', 1)[0] - descriptor_content += _read_until_keyword(block_end_prefix, descriptor_file, True) + block_end_prefix = stem.descriptor.PGP_BLOCK_END.split(' ', 1)[0] + descriptor_content += stem.descriptor._read_until_keyword(block_end_prefix, descriptor_file, True)
if descriptor_content: # strip newlines from annotations annotations = map(str.strip, annotations)
descriptor_text = "".join(descriptor_content) - descriptor = RelayDescriptor(descriptor_text, validate, annotations) - yield descriptor + yield RelayDescriptor(descriptor_text, validate, annotations) else: break # done parsing descriptors
-def _read_until_keyword(keyword, descriptor_file, inclusive = False): - """ - Reads from the descriptor file until we get to the given keyword or reach the - end of the file. - - Arguments: - keyword (str) - keyword we want to read until - descriptor_file (file) - file with the descriptor content - inclusive (bool) - includes the line with the keyword if True - - Returns: - list with the lines until we find the keyword - """ - - content = [] - - while True: - last_position = descriptor_file.tell() - line = descriptor_file.readline() - if not line: break # EOF - - if " " in line: line_keyword = line.split(" ", 1)[0] - else: line_keyword = line.strip() - - if line_keyword == keyword: - if inclusive: content.append(line) - else: descriptor_file.seek(last_position) - - break - else: - content.append(line) - - return content - class ServerDescriptor(stem.descriptor.Descriptor): """ Common parent for server descriptors. @@ -297,7 +256,7 @@ class ServerDescriptor(stem.descriptor.Descriptor): # does not matter so breaking it into key / value pairs.
entries, first_keyword, last_keyword, self.exit_policy = \ - _get_descriptor_components(raw_contents, validate) + stem.descriptor._get_descriptor_components(raw_contents, validate, ("accept", "reject")) self._parse(entries, validate) if validate: self._check_constraints(entries, first_keyword, last_keyword)
@@ -816,109 +775,3 @@ class BridgeDescriptor(ServerDescriptor): def _first_keyword(self): return "router"
-def _get_descriptor_components(raw_contents, validate): - """ - Initial breakup of the server descriptor contents to make parsing easier. - - A descriptor contains a series of 'keyword lines' which are simply a keyword - followed by an optional value. Lines can also be followed by a signature - block. - - We care about the ordering of 'accept' and 'reject' entries because this - influences the resulting exit policy, but for everything else the order does - not matter so breaking it into key / value pairs. - - Arguments: - raw_contents (str) - descriptor content provided by the relay - validate (bool) - checks the validity of the descriptor's content if - True, skips these checks otherwise - - Returns: - tuple with the following attributes... - entries (dict) - keyword => (value, pgp key) entries - first_keyword (str) - keyword of the first line - last_keyword (str) - keyword of the last line - exit_policy (list) - lines containing the exit policy - """ - - entries = {} - first_keyword = None - last_keyword = None - exit_policy = [] - remaining_lines = raw_contents.split("\n") - - while remaining_lines: - line = remaining_lines.pop(0) - - # last line can be empty - if not line and not remaining_lines: continue - - # Some lines have an 'opt ' for backward compatability. They should be - # ignored. This prefix is being removed in... 
- # https://trac.torproject.org/projects/tor/ticket/5124 - - if line.startswith("opt "): line = line[4:] - - line_match = KEYWORD_LINE.match(line) - - if not line_match: - if not validate: continue - raise ValueError("Line contains invalid characters: %s" % line) - - keyword, value = line_match.groups() - - if not first_keyword: first_keyword = keyword - last_keyword = keyword - - try: - block_contents = _get_pseudo_pgp_block(remaining_lines) - except ValueError, exc: - if not validate: continue - raise exc - - if keyword in ("accept", "reject"): - exit_policy.append("%s %s" % (keyword, value)) - elif keyword in entries: - entries[keyword].append((value, block_contents)) - else: - entries[keyword] = [(value, block_contents)] - - return entries, first_keyword, last_keyword, exit_policy - -def _get_pseudo_pgp_block(remaining_contents): - """ - Checks if given contents begins with a pseudo-Open-PGP-style block and, if - so, pops it off and provides it back to the caller. - - Arguments: - remaining_contents (list) - lines to be checked for a public key block - - Returns: - str with the armor wrapped contents or None if it doesn't exist - - Raises: - ValueError if the contents starts with a key block but it's malformed (for - instance, if it lacks an ending line) - """ - - if not remaining_contents: - return None # nothing left - - block_match = PGP_BLOCK_START.match(remaining_contents[0]) - - if block_match: - block_type = block_match.groups()[0] - block_lines = [] - - while True: - if not remaining_contents: - raise ValueError("Unterminated pgp style block") - - line = remaining_contents.pop(0) - block_lines.append(line) - - if line == PGP_BLOCK_END % block_type: - return "\n".join(block_lines) - else: - return None - diff --git a/test/integ/descriptor/data/extrainfo_descriptor b/test/integ/descriptor/data/extrainfo_descriptor new file mode 100644 index 0000000..4525afe --- /dev/null +++ b/test/integ/descriptor/data/extrainfo_descriptor @@ -0,0 +1,12 @@ +extra-info 
NINJA B2289C3EAB83ECD6EB916A2F481A02E6B76A0A48 +published 2012-05-05 17:03:50 +write-history 2012-05-05 17:02:45 (900 s) 1082368,19456,50176,272384,485376,1850368,1132544,1790976,2459648,4091904,6310912,13701120,3209216,3871744,7873536,5440512,7287808,10561536,9979904,11247616,11982848,7590912,10611712,20728832,38534144,6839296,3173376,16678912 +read-history 2012-05-05 17:02:45 (900 s) 3309568,9216,41984,27648,123904,2004992,364544,576512,1607680,3808256,4672512,12783616,2938880,2562048,7348224,3574784,6488064,10954752,9359360,4438016,6286336,6438912,4502528,10720256,38165504,1524736,2336768,8186880 +dirreq-write-history 2012-05-05 17:02:45 (900 s) 0,0,0,227328,349184,382976,738304,1171456,850944,657408,1675264,987136,702464,1335296,587776,1941504,893952,533504,695296,6828032,6326272,1287168,6310912,10085376,1048576,5372928,894976,8610816 +dirreq-read-history 2012-05-05 17:02:45 (900 s) 0,0,0,0,33792,27648,48128,46080,60416,51200,63488,64512,45056,27648,37888,48128,57344,34816,46080,50176,37888,51200,25600,33792,39936,32768,28672,30720 +router-signature +-----BEGIN SIGNATURE----- +K5FSywk7qvw/boA4DQcqkls6Ize5vcBYfhQ8JnOeRQC9+uDxbnpm3qaYN9jZ8myj +k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw +7LZqklu+gVvhMKREpchVqlAwXkWR44VENm24Hs+mT3M= +-----END SIGNATURE-----