[tor-commits] [stem/master] Skeleton for extra-info descriptors

atagar at torproject.org atagar at torproject.org
Mon May 14 00:14:27 UTC 2012


commit 13944b062ff26a5efa178ff8d5552ff1d049574b
Author: Damian Johnson <atagar at torproject.org>
Date:   Sun May 6 20:08:53 2012 -0700

    Skeleton for extra-info descriptors
    
    Basic module for parsing extra-info descriptors. This doesn't actually do any
    parsing yet; it just turns the raw content into an ExtraInfoDescriptor
    instance. This abstracts the code we'll need out of the server_descriptor
    module so that we'll be able to use it here.
    
    What code there is here is exercised via the runner tests.
---
 stem/descriptor/__init__.py                     |  158 ++++++++++++++++++++++-
 stem/descriptor/extrainfo_descriptor.py         |  111 ++++++++++++++++
 stem/descriptor/server_descriptor.py            |  159 +----------------------
 test/integ/descriptor/data/extrainfo_descriptor |   12 ++
 4 files changed, 286 insertions(+), 154 deletions(-)

diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index 2af1fb0..f3a5983 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -8,9 +8,16 @@ Descriptor - Common parent for all descriptor file types.
   +- __str__ - string that the descriptor was made from
 """
 
-__all__ = ["descriptor", "reader", "server_descriptor", "parse_file", "Descriptor"]
+__all__ = ["descriptor", "reader", "extrainfo_descriptor", "server_descriptor", "parse_file", "Descriptor"]
 
 import os
+import re
+
+KEYWORD_CHAR    = "a-zA-Z0-9-"
+WHITESPACE      = " \t"
+KEYWORD_LINE    = re.compile("^([%s]+)[%s]*(.*)$" % (KEYWORD_CHAR, WHITESPACE))
+PGP_BLOCK_START = re.compile("^-----BEGIN ([%s%s]+)-----$" % (KEYWORD_CHAR, WHITESPACE))
+PGP_BLOCK_END   = "-----END %s-----"
 
 def parse_file(path, descriptor_file):
   """
@@ -28,6 +35,7 @@ def parse_file(path, descriptor_file):
     IOError if unable to read from the descriptor_file
   """
   
+  import stem.descriptor.extrainfo_descriptor
   import stem.descriptor.server_descriptor
   
   # The tor descriptor specifications do not provide a reliable method for
@@ -44,6 +52,10 @@ def parse_file(path, descriptor_file):
     for desc in stem.descriptor.server_descriptor.parse_file(descriptor_file):
       desc._set_path(path)
       yield desc
+  elif filename == "cached-extrainfo" or first_line.startswith("extra-info "):
+    for desc in stem.descriptor.extrainfo_descriptor.parse_file(descriptor_file):
+      desc._set_path(path)
+      yield desc
   else:
     # unrecognized descriptor type
     raise TypeError("Unable to determine the descriptor's type. filename: '%s', first line: '%s'" % (filename, first_line))
@@ -85,3 +97,147 @@ class Descriptor:
   def __str__(self):
     return self._raw_contents
 
+def _read_until_keyword(keyword, descriptor_file, inclusive = False):
+  """
+  Reads from the descriptor file until we get to the given keyword or reach the
+  end of the file.
+  
+  Arguments:
+    keyword (str)          - keyword we want to read until
+    descriptor_file (file) - file with the descriptor content
+    inclusive (bool)       - includes the line with the keyword if True
+  
+  Returns:
+    list with the lines until we find the keyword
+  """
+  
+  content = []
+  
+  while True:
+    last_position = descriptor_file.tell()
+    line = descriptor_file.readline()
+    if not line: break # EOF
+    
+    if " " in line: line_keyword = line.split(" ", 1)[0]
+    else: line_keyword = line.strip()
+    
+    if line_keyword == keyword:
+      if inclusive: content.append(line)
+      else: descriptor_file.seek(last_position)
+      
+      break
+    else:
+      content.append(line)
+  
+  return content
+
+def _get_pseudo_pgp_block(remaining_contents):
+  """
+  Checks if given contents begins with a pseudo-Open-PGP-style block and, if
+  so, pops it off and provides it back to the caller.
+  
+  Arguments:
+    remaining_contents (list) - lines to be checked for a public key block
+  
+  Returns:
+    str with the armor wrapped contents or None if it doesn't exist
+  
+  Raises:
+    ValueError if the contents starts with a key block but it's malformed (for
+    instance, if it lacks an ending line)
+  """
+  
+  if not remaining_contents:
+    return None # nothing left
+  
+  block_match = PGP_BLOCK_START.match(remaining_contents[0])
+  
+  if block_match:
+    block_type = block_match.groups()[0]
+    block_lines = []
+    
+    while True:
+      if not remaining_contents:
+        raise ValueError("Unterminated pgp style block")
+      
+      line = remaining_contents.pop(0)
+      block_lines.append(line)
+      
+      if line == PGP_BLOCK_END % block_type:
+        return "\n".join(block_lines)
+  else:
+    return None
+
+def _get_descriptor_components(raw_contents, validate, extra_keywords):
+  """
+  Initial breakup of the server descriptor contents to make parsing easier.
+  
+  A descriptor contains a series of 'keyword lines' which are simply a keyword
+  followed by an optional value. Lines can also be followed by a signature
+  block.
+  
+  To get a sublisting with just certain keywords use extra_keywords. This can
+  be useful if we care about their relative ordering with respect to each
+  other. For instance, we care about the ordering of 'accept' and 'reject'
+  entries because this influences the resulting exit policy, but for everything
+  else in server descriptors the order does not matter.
+  
+  Arguments:
+    raw_contents (str) - descriptor content provided by the relay
+    validate (bool)    - checks the validity of the descriptor's content if
+                         True, skips these checks otherwise
+    extra_keywords (list) - entity keywords to put into a separate listing with
+                         ordering intact
+  
+  Returns:
+    tuple with the following attributes...
+      entries (dict)      - keyword => (value, pgp key) entries
+      first_keyword (str) - keyword of the first line
+      last_keyword (str)  - keyword of the last line
+      extra_entries (list) - lines containing entries matching extra_keywords
+  """
+  
+  entries = {}
+  first_keyword = None
+  last_keyword = None
+  extra_entries = [] # entries with a keyword in extra_keywords
+  remaining_lines = raw_contents.split("\n")
+  
+  while remaining_lines:
+    line = remaining_lines.pop(0)
+    
+    # last line can be empty
+    if not line and not remaining_lines: continue
+    
+    # Some lines have an 'opt ' prefix for backward compatibility. It should be
+    # ignored. This prefix is being removed in...
+    # https://trac.torproject.org/projects/tor/ticket/5124
+    
+    if line.startswith("opt "): line = line[4:]
+    
+    line_match = KEYWORD_LINE.match(line)
+    
+    if not line_match:
+      if not validate: continue
+      raise ValueError("Line contains invalid characters: %s" % line)
+    
+    keyword, value = line_match.groups()
+    
+    if not first_keyword: first_keyword = keyword
+    last_keyword = keyword
+    
+    try:
+      block_contents = _get_pseudo_pgp_block(remaining_lines)
+    except ValueError, exc:
+      if not validate: continue
+      raise exc
+    
+    if keyword in extra_keywords:
+      extra_entries.append("%s %s" % (keyword, value))
+    elif keyword in entries:
+      entries[keyword].append((value, block_contents))
+    else:
+      entries[keyword] = [(value, block_contents)]
+  
+  return entries, first_keyword, last_keyword, extra_entries
+
diff --git a/stem/descriptor/extrainfo_descriptor.py b/stem/descriptor/extrainfo_descriptor.py
new file mode 100644
index 0000000..891ba18
--- /dev/null
+++ b/stem/descriptor/extrainfo_descriptor.py
@@ -0,0 +1,111 @@
+"""
+Parsing for Tor extra-info descriptors. These are published by relays whenever
+their server descriptor is published and have a similar format. However, unlike
+server descriptors these don't contain information that Tor clients require to
+function and as such aren't fetched by default.
+
+Defined in section 2.2 of the dir-spec, extra-info descriptors contain
+interesting but non-vital information such as usage statistics. These documents
+cannot be requested of bridges.
+
+Extra-info descriptors are available from a few sources...
+
+- if you have 'DownloadExtraInfo 1' in your torrc...
+  - control port via 'GETINFO extra-info/digest/*' queries
+  - the 'cached-extrainfo' file in tor's data directory
+- tor metrics, at https://metrics.torproject.org/data.html
+- directory authorities and mirrors via their DirPort
+
+parse_file - Iterates over the extra-info descriptors in a file.
+ExtraInfoDescriptor - Tor extra-info descriptor.
+"""
+
+import stem.descriptor
+
+def parse_file(descriptor_file, validate = True):
+  """
+  Iterates over the extra-info descriptors in a file.
+  
+  Arguments:
+    descriptor_file (file) - file with descriptor content
+    validate (bool)        - checks the validity of the descriptor's content if
+                             True, skips these checks otherwise
+  
+  Returns:
+    iterator for ExtraInfoDescriptor instances in the file
+  
+  Raises:
+    ValueError if the contents is malformed and validate is True
+    IOError if the file can't be read
+  """
+  
+  while True:
+    extrainfo_content = stem.descriptor._read_until_keyword("router-signature", descriptor_file)
+    
+    # we've reached the 'router-signature', now include the pgp style block
+    block_end_prefix = stem.descriptor.PGP_BLOCK_END.split(' ', 1)[0]
+    extrainfo_content += stem.descriptor._read_until_keyword(block_end_prefix, descriptor_file, True)
+    
+    if extrainfo_content:
+      yield ExtraInfoDescriptor("".join(extrainfo_content), validate)
+    else: break # done parsing file
+
+class ExtraInfoDescriptor(stem.descriptor.Descriptor):
+  """
+  Extra-info descriptor document.
+  
+  Attributes:
+    nickname (str)           - relay's nickname (*)
+    fingerprint (str)        - forty hex digits that make up the relay's fingerprint (*)
+    published (datetime.datetime) - time in GMT when the descriptor was generated (*)
+    geoip_db_digest (str)    - sha1 of geoIP database file
+    
+    read_history (str)       - read-history line, always unset
+    read_history_end (datetime.datetime) - end of the sampling interval
+    read_history_interval (int) - seconds per interval
+    read_history_values (list) - bytes read during each interval (*)
+    
+    write_history (str)      - write-history line, always unset
+    write_history_end (datetime.datetime) - end of the sampling interval
+    write_history_interval (int) - seconds per interval
+    write_history_values (list) - bytes written during each interval (*)
+    
+    (*) required fields, others are left as None if undefined
+  """
+  
+  def __init__(self, raw_contents, validate = True, annotations = None):
+    """
+    Extra-info descriptor constructor, created from a relay's extra-info
+    content (as provided by "GETINFO extra-info/digest/*", cached contents, and
+    metrics).
+    
+    By default this validates the descriptor's content as it's parsed. This
+    validation can be disabled to either improve performance or be accepting of
+    malformed data.
+    
+    Arguments:
+      raw_contents (str) - extra-info content provided by the relay
+      validate (bool)    - checks the validity of the extra-info descriptor if
+                           True, skips these checks otherwise
+    
+    Raises:
+      ValueError if the contents is malformed and validate is True
+    """
+    
+    stem.descriptor.Descriptor.__init__(self, raw_contents)
+    
+    self.nickname = None
+    self.fingerprint = None
+    self.published = None
+    self.geoip_db_digest = None
+    
+    self.read_history = None
+    self.read_history_end = None
+    self.read_history_interval = None
+    self.read_history_values = []
+    
+    self.write_history = None
+    self.write_history_end = None
+    self.write_history_interval = None
+    self.write_history_values = []
+
diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py
index 551835d..b87ed2a 100644
--- a/stem/descriptor/server_descriptor.py
+++ b/stem/descriptor/server_descriptor.py
@@ -33,12 +33,6 @@ import stem.version
 import stem.util.connection
 import stem.util.tor_tools
 
-KEYWORD_CHAR    = "a-zA-Z0-9-"
-WHITESPACE      = " \t"
-KEYWORD_LINE    = re.compile("^([%s]+)[%s]*(.*)$" % (KEYWORD_CHAR, WHITESPACE))
-PGP_BLOCK_START = re.compile("^-----BEGIN ([%s%s]+)-----$" % (KEYWORD_CHAR, WHITESPACE))
-PGP_BLOCK_END   = "-----END %s-----"
-
 # relay descriptors must have exactly one of the following
 REQUIRED_FIELDS = (
   "router",
@@ -127,56 +121,21 @@ def parse_file(descriptor_file, validate = True):
   # to the caller).
   
   while True:
-    annotations = _read_until_keyword("router", descriptor_file)
-    descriptor_content = _read_until_keyword("router-signature", descriptor_file)
+    annotations = stem.descriptor._read_until_keyword("router", descriptor_file)
+    descriptor_content = stem.descriptor._read_until_keyword("router-signature", descriptor_file)
     
     # we've reached the 'router-signature', now include the pgp style block
-    block_end_prefix = PGP_BLOCK_END.split(' ', 1)[0]
-    descriptor_content += _read_until_keyword(block_end_prefix, descriptor_file, True)
+    block_end_prefix = stem.descriptor.PGP_BLOCK_END.split(' ', 1)[0]
+    descriptor_content += stem.descriptor._read_until_keyword(block_end_prefix, descriptor_file, True)
     
     if descriptor_content:
       # strip newlines from annotations
       annotations = map(str.strip, annotations)
       
       descriptor_text = "".join(descriptor_content)
-      descriptor = RelayDescriptor(descriptor_text, validate, annotations)
-      yield descriptor
+      yield RelayDescriptor(descriptor_text, validate, annotations)
     else: break # done parsing descriptors
 
-def _read_until_keyword(keyword, descriptor_file, inclusive = False):
-  """
-  Reads from the descriptor file until we get to the given keyword or reach the
-  end of the file.
-  
-  Arguments:
-    keyword (str)          - keyword we want to read until
-    descriptor_file (file) - file with the descriptor content
-    inclusive (bool)       - includes the line with the keyword if True
-  
-  Returns:
-    list with the lines until we find the keyword
-  """
-  
-  content = []
-  
-  while True:
-    last_position = descriptor_file.tell()
-    line = descriptor_file.readline()
-    if not line: break # EOF
-    
-    if " " in line: line_keyword = line.split(" ", 1)[0]
-    else: line_keyword = line.strip()
-    
-    if line_keyword == keyword:
-      if inclusive: content.append(line)
-      else: descriptor_file.seek(last_position)
-      
-      break
-    else:
-      content.append(line)
-  
-  return content
-
 class ServerDescriptor(stem.descriptor.Descriptor):
   """
   Common parent for server descriptors.
@@ -297,7 +256,7 @@ class ServerDescriptor(stem.descriptor.Descriptor):
     # does not matter so breaking it into key / value pairs.
     
     entries, first_keyword, last_keyword, self.exit_policy = \
-      _get_descriptor_components(raw_contents, validate)
+      stem.descriptor._get_descriptor_components(raw_contents, validate, ("accept", "reject"))
     self._parse(entries, validate)
     if validate: self._check_constraints(entries, first_keyword, last_keyword)
   
@@ -816,109 +775,3 @@ class BridgeDescriptor(ServerDescriptor):
   def _first_keyword(self):
     return "router"
 
-def _get_descriptor_components(raw_contents, validate):
-  """
-  Initial breakup of the server descriptor contents to make parsing easier.
-  
-  A descriptor contains a series of 'keyword lines' which are simply a keyword
-  followed by an optional value. Lines can also be followed by a signature
-  block.
-  
-  We care about the ordering of 'accept' and 'reject' entries because this
-  influences the resulting exit policy, but for everything else the order does
-  not matter so breaking it into key / value pairs.
-  
-  Arguments:
-    raw_contents (str) - descriptor content provided by the relay
-    validate (bool)    - checks the validity of the descriptor's content if
-                         True, skips these checks otherwise
-  
-  Returns:
-    tuple with the following attributes...
-      entries (dict)      - keyword => (value, pgp key) entries
-      first_keyword (str) - keyword of the first line
-      last_keyword (str)  - keyword of the last line
-      exit_policy (list)  - lines containing the exit policy
-  """
-  
-  entries = {}
-  first_keyword = None
-  last_keyword = None
-  exit_policy = []
-  remaining_lines = raw_contents.split("\n")
-  
-  while remaining_lines:
-    line = remaining_lines.pop(0)
-    
-    # last line can be empty
-    if not line and not remaining_lines: continue
-    
-    # Some lines have an 'opt ' for backward compatability. They should be
-    # ignored. This prefix is being removed in...
-    # https://trac.torproject.org/projects/tor/ticket/5124
-    
-    if line.startswith("opt "): line = line[4:]
-    
-    line_match = KEYWORD_LINE.match(line)
-    
-    if not line_match:
-      if not validate: continue
-      raise ValueError("Line contains invalid characters: %s" % line)
-    
-    keyword, value = line_match.groups()
-    
-    if not first_keyword: first_keyword = keyword
-    last_keyword = keyword
-    
-    try:
-      block_contents = _get_pseudo_pgp_block(remaining_lines)
-    except ValueError, exc:
-      if not validate: continue
-      raise exc
-    
-    if keyword in ("accept", "reject"):
-      exit_policy.append("%s %s" % (keyword, value))
-    elif keyword in entries:
-      entries[keyword].append((value, block_contents))
-    else:
-      entries[keyword] = [(value, block_contents)]
-  
-  return entries, first_keyword, last_keyword, exit_policy
-
-def _get_pseudo_pgp_block(remaining_contents):
-  """
-  Checks if given contents begins with a pseudo-Open-PGP-style block and, if
-  so, pops it off and provides it back to the caller.
-  
-  Arguments:
-    remaining_contents (list) - lines to be checked for a public key block
-  
-  Returns:
-    str with the armor wrapped contents or None if it doesn't exist
-  
-  Raises:
-    ValueError if the contents starts with a key block but it's malformed (for
-    instance, if it lacks an ending line)
-  """
-  
-  if not remaining_contents:
-    return None # nothing left
-  
-  block_match = PGP_BLOCK_START.match(remaining_contents[0])
-  
-  if block_match:
-    block_type = block_match.groups()[0]
-    block_lines = []
-    
-    while True:
-      if not remaining_contents:
-        raise ValueError("Unterminated pgp style block")
-      
-      line = remaining_contents.pop(0)
-      block_lines.append(line)
-      
-      if line == PGP_BLOCK_END % block_type:
-        return "\n".join(block_lines)
-  else:
-    return None
-
diff --git a/test/integ/descriptor/data/extrainfo_descriptor b/test/integ/descriptor/data/extrainfo_descriptor
new file mode 100644
index 0000000..4525afe
--- /dev/null
+++ b/test/integ/descriptor/data/extrainfo_descriptor
@@ -0,0 +1,12 @@
+extra-info NINJA B2289C3EAB83ECD6EB916A2F481A02E6B76A0A48
+published 2012-05-05 17:03:50
+write-history 2012-05-05 17:02:45 (900 s) 1082368,19456,50176,272384,485376,1850368,1132544,1790976,2459648,4091904,6310912,13701120,3209216,3871744,7873536,5440512,7287808,10561536,9979904,11247616,11982848,7590912,10611712,20728832,38534144,6839296,3173376,16678912
+read-history 2012-05-05 17:02:45 (900 s) 3309568,9216,41984,27648,123904,2004992,364544,576512,1607680,3808256,4672512,12783616,2938880,2562048,7348224,3574784,6488064,10954752,9359360,4438016,6286336,6438912,4502528,10720256,38165504,1524736,2336768,8186880
+dirreq-write-history 2012-05-05 17:02:45 (900 s) 0,0,0,227328,349184,382976,738304,1171456,850944,657408,1675264,987136,702464,1335296,587776,1941504,893952,533504,695296,6828032,6326272,1287168,6310912,10085376,1048576,5372928,894976,8610816
+dirreq-read-history 2012-05-05 17:02:45 (900 s) 0,0,0,0,33792,27648,48128,46080,60416,51200,63488,64512,45056,27648,37888,48128,57344,34816,46080,50176,37888,51200,25600,33792,39936,32768,28672,30720
+router-signature
+-----BEGIN SIGNATURE-----
+K5FSywk7qvw/boA4DQcqkls6Ize5vcBYfhQ8JnOeRQC9+uDxbnpm3qaYN9jZ8myj
+k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
+7LZqklu+gVvhMKREpchVqlAwXkWR44VENm24Hs+mT3M=
+-----END SIGNATURE-----





More information about the tor-commits mailing list