[tor-commits] [stem/master] Add support for network status parsing

atagar at torproject.org atagar at torproject.org
Sat Oct 13 18:35:44 UTC 2012


commit a9f35d5e86bffc4b04cb525736a23a7e72009efc
Author: Ravi Chandra Padmala <neenaoffline at gmail.com>
Date:   Thu Jul 19 16:46:09 2012 +0200

    Add support for network status parsing
---
 stem/descriptor/networkstatus_descriptor.py |  635 +++++++++++++++++++++++++++
 1 files changed, 635 insertions(+), 0 deletions(-)

diff --git a/stem/descriptor/networkstatus_descriptor.py b/stem/descriptor/networkstatus_descriptor.py
new file mode 100644
index 0000000..4bcf9a9
--- /dev/null
+++ b/stem/descriptor/networkstatus_descriptor.py
@@ -0,0 +1,635 @@
+"""
+Parsing for Tor network status documents. Currently supports parsing v3 network
+status documents (both votes and consensuses).
+
+The network status documents also contain a list of router descriptors,
+directory authorities, signatures etc.
+
+The votes and consensuses can be obtained from any of the following sources...
+
+* the 'cached-consensus' file in tor's data directory
+* tor metrics, at https://metrics.torproject.org/data.html
+* directory authorities and mirrors via their DirPort
+
+**Module Overview:**
+
+::
+
+  parse_file - parses a network status file and provides a NetworkStatusDocument
+  NetworkStatusDocument - Tor v3 network status document
+    +- MicrodescriptorConsensus - Tor microdescriptor consensus document
+  RouterDescriptor - Router descriptor; contains information about a Tor relay
+    +- RouterMicrodescriptor - Router microdescriptor; contains information that doesn't change often
+  DirectorySignature
+  DirectoryAuthority
+"""
+
+import re
+import base64
+import hashlib
+import datetime
+
+import stem.prereq
+import stem.descriptor
+import stem.descriptor.extrainfo_descriptor
+import stem.version
+import stem.exit_policy
+import stem.util.log as log
+import stem.util.connection
+import stem.util.tor_tools
+
+_bandwidth_weights_regex = re.compile(" ".join([r"W%s=\d+" % weight for weight in ["bd",
+  "be", "bg", "bm", "db", "eb", "ed", "ee", "eg", "em", "gb", "gd", "gg", "gm", "mb", "md", "me", "mg", "mm"]])) # every weight, in this order
+
+def parse_file(document_file, validate = True):
+  """
+  Parses a network status document file, and returns a NetworkStatusDocument
+  object.
+
+  :param file document_file: file with network status document content
+  :param bool validate: checks the validity of the document's contents if True, skips these checks otherwise
+
+  :returns: NetworkStatusDocument object created by parsing the file
+
+  :raises:
+    * ValueError if the contents are malformed and validate is True
+    * IOError if the file can't be read
+  """
+
+  data = document_file.read() # read exactly once, a second read() would provide an empty string
+
+  # skip the leading "@type" annotation line if the file has Metrics metadata
+  if data.startswith("@type network-status-consensus-3 1.0\n") or data.startswith("@type network-status-vote-3 1.0\n"):
+    data = data[data.find("\n") + 1:]
+
+  return NetworkStatusDocument(data, validate)
+
+class DocumentParser(object):
+  """
+  Helper class to parse documents.
+
+  :var str line: line currently being parsed, None once the content is exhausted
+  :var list lines: list of remaining lines to be parsed
+  """
+
+  def __init__(self, raw_content, validate):
+    """
+    Create a new DocumentParser.
+
+    :param str raw_content: content to be parsed
+    :param bool validate: if False, treats every keyword line as optional
+    """
+
+    self._raw_content = raw_content
+    self.lines = raw_content.split("\n")
+    self.validate = validate
+    self.line = self.lines.pop(0)
+
+  def peek_keyword(self):
+    """
+    Returns the first keyword in the current line. Respects the opt keyword and
+    returns the actual keyword if the first is "opt".
+
+    :returns: the first keyword of the current line, None if there is no line left
+    """
+
+    if self.line:
+      if self.line.startswith("opt "):
+        return self.line.split(" ")[1]
+      return self.line.split(" ")[0]
+
+  def read_keyword_line(self, keyword, optional = False):
+    """
+    Returns the text following the keyword if the current line begins with it.
+
+    If it doesn't match, a ValueError is raised if optional is False and if the
+    DocumentParser was created with validation enabled. If not, None is returned.
+
+    Respects the opt keyword, a leading "opt " is skipped when matching the line.
+
+    :param str keyword: keyword the line must begin with
+    :param bool optional: if False the current line must begin with the given keyword
+
+    :returns: the text after the keyword if the keyword matches the one provided, otherwise returns None or raises an exception
+
+    :raises: ValueError if a non-optional keyword doesn't match when validation is enabled
+    """
+
+    keyword_regex = re.compile("(opt )?" + re.escape(keyword) + "($| )")
+
+    if not self.line:
+      if not optional and self.validate:
+        raise ValueError("Unexpected end of document")
+      return
+
+    if keyword_regex.match(self.line):
+      line = self.read_line()
+
+      # drop the "opt " prefix and the keyword itself, leaving just the arguments
+      if line == "opt " + keyword or line == keyword: return ""
+      elif line.startswith("opt "): return line.split(" ", 2)[2]
+      else: return line.split(" ", 1)[1]
+    elif self.line.startswith("opt "):
+      # if this was something new introduced at some point in the future
+      # ignore it and go to the next line
+      self.read_line()
+      return self.read_keyword_line(keyword, optional)
+    elif not optional and self.validate:
+      raise ValueError("Error parsing network status document: Expected %s, received: %s" % (keyword, self.line))
+
+  def read_line(self):
+    """
+    Returns the current line and shifts the parser to the next line.
+
+    :returns: the current line if it exists, None otherwise
+    """
+
+    if self.line:
+      current, self.line = self.line, (self.lines.pop(0) if self.lines else None)
+      return current
+
+  def read_block(self, keyword):
+    """
+    Returns a keyword block that begins with "-----BEGIN keyword-----\\n" and
+    ends with "-----END keyword-----\\n". Both of these marker lines are consumed.
+
+    :param str keyword: keyword block that must be read
+
+    :returns: the data in the keyword block, an empty string if the block is missing
+    """
+
+    lines = []
+
+    if self.line == "-----BEGIN " + keyword + "-----":
+      self.read_line()
+      while self.line and self.line != "-----END " + keyword + "-----":
+        lines.append(self.read_line())
+      self.read_line() # pop the END line so callers don't see it as trailing data
+    return "\n".join(lines)
+
+  def read_until(self, terminals = ()):
+    """
+    Returns the data in the parser until a line that begins with one of the keywords in terminals is found.
+
+    :param list terminals: list of strings at which we should stop reading and return the data
+
+    :returns: the lines that were read joined by newlines, None if the content is already exhausted
+    """
+
+    if self.line is None: return
+    lines, self.line = [self.line], (self.lines.pop(0) if self.lines else None)
+    while self.line and not self.line.split(" ")[0] in terminals:
+      lines.append(self.line)
+      self.line = self.lines.pop(0) if self.lines else None
+
+    return "\n".join(lines)
+
+  def remaining(self):
+    """
+    Returns the data remaining in the parser.
+
+    :returns: a list of all unparsed lines
+    """
+
+    if self.line:
+      lines, self.lines = self.lines, []
+      lines.insert(0, self.line)
+      return lines
+    else:
+      return []
+
+def _strptime(string, validate = True, optional = False):
+  # Parses a "YYYY-MM-DD HH:MM:SS" timestamp, providing None only for an unparsable optional value when not validating.
+  try: return datetime.datetime.strptime(string, "%Y-%m-%d %H:%M:%S")
+  except (TypeError, ValueError): # TypeError arises when string is None because the line was missing
+    if validate or not optional: raise ValueError("Invalid timestamp: %r" % (string,))
+
+class NetworkStatusDocument(stem.descriptor.Descriptor):
+  """
+  A v3 network status document.
+
+  This could be a v3 consensus or vote document.
+
+  :var bool validated: **\*** whether the document is validated
+  :var str network_status_version: **\*** a document format version. For v3 documents this is "3"
+  :var str vote_status: **\*** status of the vote. Is either "vote" or "consensus"
+  :var list consensus_methods: A list of supported consensus generation methods (integers)
+  :var datetime published: time when the document was published
+  :var int consensus_method: consensus method used to generate a consensus
+  :var datetime valid_after: **\*** time when the consensus becomes valid
+  :var datetime fresh_until: **\*** time until when the consensus is considered to be fresh
+  :var datetime valid_until: **\*** time until when the consensus is valid
+  :var int vote_delay: **\*** number of seconds allowed for collecting votes from all authorities
+  :var int dist_delay: number of seconds allowed for collecting signatures from all authorities
+  :var list client_versions: list of recommended Tor client versions
+  :var list server_versions: list of recommended Tor server versions
+  :var list known_flags: **\*** list of known router flags
+  :var dict params: dict of parameter(str) => value(int) mappings
+  :var list router_descriptors: **\*** list of RouterDescriptor objects defined in the document
+  :var list directory_authorities: **\*** list of DirectoryAuthority objects that have generated this document
+  :var dict bandwidth_weights: dict of weight(str) => value(int) mappings
+  :var list directory_signatures: **\*** list of signatures this document has
+
+  **\*** attribute is either required when we're parsed with validation or has a default value, others are left as None if undefined
+  """
+
+  def __init__(self, raw_content, validate = True):
+    """
+    Parse a v3 network status document and provide a new NetworkStatusDocument object.
+
+    :param str raw_content: raw network status document data
+    :param bool validate: True if the document is to be validated, False otherwise
+
+    :raises: ValueError if the document is invalid
+    """
+
+    super(NetworkStatusDocument, self).__init__(raw_content)
+
+    self.router_descriptors = []
+    self.directory_authorities = []
+    self.directory_signatures = []
+    self.validated = validate
+
+    self.network_status_version = None
+    self.vote_status = None
+    self.consensus_methods = []
+    self.published = None
+    self.consensus_method = None
+    self.valid_after = None
+    self.fresh_until = None
+    self.valid_until = None
+    self.vote_delay = None
+    self.dist_delay = None
+    self.client_versions = []
+    self.server_versions = []
+    self.known_flags = []
+    self.params = {}
+    self.bandwidth_weights = {}
+
+    self._parse(raw_content)
+
+  def _generate_router(self, raw_content, vote, validate):
+    return RouterDescriptor(raw_content, vote, validate)
+
+  def _validate_network_status_version(self):
+    return self.network_status_version == "3"
+
+  def get_unrecognized_lines(self):
+    """
+    Returns any unrecognized trailing lines.
+
+    :returns: a list of unrecognized trailing lines
+    """
+
+    return self._unrecognized_lines
+
+  def _parse(self, raw_content):
+    # preamble
+    validate = self.validated
+    doc_parser = DocumentParser(raw_content, validate)
+
+    read_keyword = doc_parser.read_keyword_line # provides None for a missing line, so results are checked before use
+    self.network_status_version = read_keyword("network-status-version")
+    self.vote_status = read_keyword("vote-status")
+    if validate and not self._validate_network_status_version():
+      raise ValueError("Invalid network-status-version: %s" % self.network_status_version)
+
+    vote = self.vote_status == "vote" # unrecognized document types are read as a consensus when not validating
+    if validate and self.vote_status not in ("vote", "consensus"):
+      raise ValueError("Unrecognized document type specified in vote-status")
+
+    if vote:
+      methods = read_keyword("consensus-methods", True)
+      if methods: self.consensus_methods = [int(method) for method in methods.split(" ")]
+      self.published = _strptime(read_keyword("published", True), validate, True)
+    else:
+      method = read_keyword("consensus-method", True)
+      if method: self.consensus_method = int(method)
+
+    self.valid_after = _strptime(read_keyword("valid-after"), validate)
+    self.fresh_until = _strptime(read_keyword("fresh-until"), validate)
+    self.valid_until = _strptime(read_keyword("valid-until"), validate)
+    voting_delay = read_keyword("voting-delay")
+    if voting_delay: self.vote_delay, self.dist_delay = [int(delay) for delay in voting_delay.split(" ")]
+
+    client_versions = read_keyword("client-versions", True)
+    if client_versions: self.client_versions = [stem.version.Version(entry) for entry in client_versions.split(",")]
+    server_versions = read_keyword("server-versions", True)
+    if server_versions: self.server_versions = [stem.version.Version(entry) for entry in server_versions.split(",")]
+    known_flags = read_keyword("known-flags")
+    if known_flags: self.known_flags = known_flags.split(" ")
+    params = read_keyword("params", True)
+    if params: self.params = dict([(param.split("=")[0], int(param.split("=")[1])) for param in params.split(" ")])
+
+    # authority section
+    while doc_parser.line and doc_parser.line.startswith("dir-source "):
+      dirauth_data = doc_parser.read_until(["dir-source", "r"])
+      self.directory_authorities.append(DirectoryAuthority(dirauth_data, vote, validate))
+
+    # router descriptors
+    while doc_parser.line and doc_parser.line.startswith("r "):
+      router_data = doc_parser.read_until(["r", "directory-footer", "directory-signature"])
+      self.router_descriptors.append(self._generate_router(router_data, vote, validate))
+
+    # footer section, the directory-footer line is expected from consensus method 9 onward
+    if (self.consensus_method is not None and self.consensus_method >= 9) or (vote and any([method >= 9 for method in self.consensus_methods])):
+      if doc_parser.line == "directory-footer":
+        doc_parser.read_line()
+      elif validate:
+        raise ValueError("Network status document missing directory-footer")
+
+    if not vote:
+      bandwidth_weights = read_keyword("bandwidth-weights", True)
+      if bandwidth_weights and _bandwidth_weights_regex.match(bandwidth_weights):
+        self.bandwidth_weights = dict([(weight.split("=")[0], int(weight.split("=")[1])) for weight in bandwidth_weights.split(" ")])
+      elif bandwidth_weights is not None and validate:
+        raise ValueError("Invalid bandwidth-weights line")
+
+    while doc_parser.line and doc_parser.line.startswith("directory-signature "):
+      signature_data = doc_parser.read_until(["directory-signature"])
+      self.directory_signatures.append(DirectorySignature(signature_data, validate))
+
+    self._unrecognized_lines = doc_parser.remaining()
+    if validate and self._unrecognized_lines: raise ValueError("Unrecognized trailing data")
+
+class DirectoryAuthority(stem.descriptor.Descriptor):
+  """
+  Contains directory authority information obtained from v3 network status
+  documents.
+
+  :var str nickname: directory authority's nickname
+  :var str identity: uppercase hex fingerprint of the authority's identity key
+  :var str address: hostname
+  :var str ip: current IP address
+  :var int dirport: current directory port
+  :var int orport: current orport
+  :var str contact: directory authority's contact information
+  :var str legacy_dir_key: fingerprint of an obsolete identity key, only read when vote is True
+  :var str vote_digest: digest of the authority that contributed to the consensus, only read when vote is False
+  """
+
+  def __init__(self, raw_content, vote = True, validate = True):
+    """
+    Parse a directory authority entry in a v3 network status document and
+    provide a DirectoryAuthority object.
+
+    :param str raw_content: raw directory authority entry information
+    :param bool vote: True to read a legacy-dir-key line after the contact, False to read a vote-digest line
+    :param bool validate: True if the document is to be validated, False otherwise
+
+    :raises: ValueError if the raw data is invalid
+    """
+
+    super(DirectoryAuthority, self).__init__(raw_content)
+    parser = DocumentParser(raw_content, validate)
+    self.legacy_dir_key = self.vote_digest = None # just one of these is read below, so both get a default
+
+    dir_source = (parser.read_keyword_line("dir-source") or "").split(" ")
+    if len(dir_source) != 6: raise ValueError("Malformed dir-source line in directory authority information")
+    self.nickname, self.identity, self.address, self.ip = dir_source[:4]
+    self.dirport, self.orport = int(dir_source[4]), int(dir_source[5])
+
+    self.contact = parser.read_keyword_line("contact")
+    if vote: self.legacy_dir_key = parser.read_keyword_line("legacy-dir-key", True)
+    else: self.vote_digest = parser.read_keyword_line("vote-digest", True)
+    if parser.remaining() and validate:
+      raise ValueError("Unrecognized trailing data in directory authority information")
+
+class DirectorySignature(stem.descriptor.Descriptor):
+  """
+  Contains directory signature information described in a v3 network status
+  document.
+
+  :var str identity: signature identity
+  :var str key_digest: signature key digest
+  :var str method: method used to generate the signature, None if the line doesn't include one
+  :var str signature: the signature data
+  """
+
+  def __init__(self, raw_content, validate = True):
+    """
+    Parse a directory signature entry in a v3 network status document and
+    provide a DirectorySignature object.
+
+    :param str raw_content: raw directory signature entry information
+    :param bool validate: True if the document is to be validated, False otherwise
+
+    :raises: ValueError if the raw data is invalid
+    """
+
+    super(DirectorySignature, self).__init__(raw_content)
+    parser = DocumentParser(raw_content, validate)
+    self.method = self.identity = self.key_digest = None # defaults, so every attribute exists for either line format
+    signature_line = (parser.read_keyword_line("directory-signature") or "").split(" ")
+
+    if len(signature_line) == 2:
+      self.identity, self.key_digest = signature_line
+    elif len(signature_line) == 3: # for microdescriptor consensuses
+      self.method, self.identity, self.key_digest = signature_line
+    elif validate: raise ValueError("Malformed directory-signature line")
+    self.signature = parser.read_block("SIGNATURE")
+    if parser.remaining() and validate:
+      raise ValueError("Unrecognized trailing data in directory signature")
+
+class RouterDescriptor(stem.descriptor.Descriptor):
+  """
+  Router descriptor object. Parses and stores router information in a router
+  entry read from a v3 network status document.
+
+  :var str nickname: **\*** router's nickname
+  :var str identity: **\*** router's identity
+  :var str digest: **\*** router's digest
+  :var datetime publication: **\*** router's publication
+  :var str ip: **\*** router's IP address
+  :var int orport: **\*** router's ORPort
+  :var int dirport: **\*** router's DirPort, None if the entry lists it as zero
+
+  :var bool is_valid: **\*** router is valid
+  :var bool is_guard: **\*** router is suitable for use as an entry guard
+  :var bool is_named: **\*** router is named
+  :var bool is_unnamed: **\*** router is unnamed
+  :var bool is_running: **\*** router is running and currently usable
+  :var bool is_stable: **\*** router is stable, i.e., it's suitable for long-lived circuits
+  :var bool is_exit: **\*** router is an exit router
+  :var bool is_fast: **\*** router is Fast, i.e., it's usable for high-bandwidth circuits
+  :var bool is_authority: **\*** router is a directory authority
+  :var bool supports_v2dir: **\*** router supports v2dir
+  :var bool supports_v3dir: **\*** router supports v3dir
+  :var bool is_hsdir: **\*** router is a hidden service directory
+  :var bool is_badexit: **\*** router is marked a bad exit
+  :var bool is_baddirectory: **\*** router is a bad directory
+
+  :var :class:`stem.version.Version`,str version: Version of the Tor protocol this router is running
+
+  :var int bandwidth: router's claimed bandwidth
+  :var int measured_bandwidth: router's measured bandwidth
+
+  :var :class:`stem.exit_policy.MicrodescriptorExitPolicy` exit_policy: router's exit policy
+
+  :var list microdescriptor_hashes: (methods list, algorithm => digest dict) tuples for the "m" lines, only read from votes
+
+  **\*** attribute is either required when we're parsed with validation or has a default value, others are left as None if undefined
+  """
+
+  def __init__(self, raw_contents, vote = True, validate = True):
+    """
+    Parse a router descriptor in a v3 network status document and provide a new
+    RouterDescriptor object.
+
+    :param str raw_contents: router descriptor content to be parsed
+    :param bool vote: True if "m" lines should be read, False otherwise
+    :param bool validate: whether the router descriptor should be validated
+    """
+
+    super(RouterDescriptor, self).__init__(raw_contents)
+
+    self.nickname = None
+    self.identity = None
+    self.digest = None
+    self.publication = None
+    self.ip = None
+    self.orport = None
+    self.dirport = None
+
+    self.is_valid = None
+    self.is_guard = None
+    self.is_named = None
+    self.is_unnamed = None
+    self.is_running = None
+    self.is_stable = None
+    self.is_exit = None
+    self.is_fast = None
+    self.is_authority = None
+    self.supports_v2dir = None
+    self.supports_v3dir = None
+    self.is_hsdir = None
+    self.is_badexit = None
+    self.is_baddirectory = None
+
+    self.version = None
+
+    self.bandwidth = None
+    self.measured_bandwidth = None
+
+    self.exit_policy = None
+    self.microdescriptor_hashes = []
+    self._unrecognized_lines = []
+    self._parse(raw_contents, vote, validate)
+
+  def _parse(self, raw_content, vote, validate):
+    """
+    :param str raw_content: router descriptor content to be parsed
+    :param bool vote: "m" lines are only read if this is True
+    :param bool validate: checks the validity of descriptor content if True
+    :raises: ValueError if an error occurs in validation
+    """
+
+    parser = DocumentParser(raw_content, validate)
+    seen_keywords = set()
+    peek_check_kw = lambda keyword: keyword == parser.peek_keyword()
+
+    r = parser.read_keyword_line("r")
+    # r mauer BD7xbfsCFku3+tgybEZsg8Yjhvw itcuKQ6PuPLJ7m/Oi928WjO2j8g 2012-06-22 13:19:32 80.101.105.103 9001 0
+    # "r" SP nickname SP identity SP digest SP publication SP IP SP ORPort SP DirPort NL
+    seen_keywords.add("r")
+    values = r.split(" ") if r else []
+    if len(values) >= 8:
+      self.nickname, self.identity, self.digest = values[0], values[1], values[2]
+      self.publication = _strptime(" ".join((values[3], values[4])), validate)
+      self.ip, self.orport, self.dirport = values[5], int(values[6]), int(values[7])
+      if self.dirport == 0: self.dirport = None
+    elif validate: raise ValueError("Invalid router descriptor: empty or malformed 'r' line")
+
+    while parser.line:
+      if peek_check_kw("s"):
+        if "s" in seen_keywords: raise ValueError("Invalid router descriptor: 's' line appears twice")
+        line = parser.read_keyword_line("s")
+        if not line: continue
+        seen_keywords.add("s")
+        # s Named Running Stable Valid
+        # A series of space-separated status flags, in *lexical order*
+        flags = line.split(" ")
+        flag_map = {
+          "Valid": "is_valid",
+          "Guard": "is_guard",
+          "Named": "is_named",
+          "Unnamed": "is_unnamed",
+          "Running": "is_running",
+          "Stable": "is_stable",
+          "Exit": "is_exit",
+          "Fast": "is_fast",
+          "Authority": "is_authority",
+          "V2Dir": "supports_v2dir",
+          "V3Dir": "supports_v3dir",
+          "HSDir": "is_hsdir",
+          "BadExit": "is_badexit",
+          "BadDirectory": "is_baddirectory",
+        }
+        for flag in flags:
+          if flag in flag_map: setattr(self, flag_map[flag], True) # flags missing from the map are skipped
+        if self.is_unnamed: self.is_named = False
+        elif self.is_named: self.is_unnamed = False
+
+      elif peek_check_kw("v"):
+        if "v" in seen_keywords: raise ValueError("Invalid router descriptor: 'v' line appears twice")
+        line = parser.read_keyword_line("v", True)
+        seen_keywords.add("v")
+        # v Tor 0.2.2.35
+        if line:
+          if line.startswith("Tor "):
+            self.version = stem.version.Version(line[4:])
+          else:
+            self.version = line
+        elif validate: raise ValueError("Invalid router descriptor: empty 'v' line" )
+
+      elif peek_check_kw("w"):
+        if "w" in seen_keywords: raise ValueError("Invalid router descriptor: 'w' line appears twice")
+        w = parser.read_keyword_line("w", True)
+        # "w" SP "Bandwidth=" INT [SP "Measured=" INT] NL
+        seen_keywords.add("w")
+        if w:
+          values = w.split(" ")
+          if len(values) <= 2 and len(values) > 0:
+            key, value = values[0].split("=")
+            if key == "Bandwidth": self.bandwidth = int(value)
+            elif validate: raise ValueError("Router descriptor contains invalid 'w' line: expected Bandwidth, read " + key)
+
+            if len(values) == 2:
+              key, value = values[1].split("=")
+              if key == "Measured": self.measured_bandwidth = int(value) # the split above already dropped the "="
+              elif validate: raise ValueError("Router descriptor contains invalid 'w' line: expected Measured, read " + key)
+          elif validate: raise ValueError("Router descriptor contains invalid 'w' line")
+        elif validate: raise ValueError("Router descriptor contains empty 'w' line")
+
+      elif peek_check_kw("p"):
+        if "p" in seen_keywords: raise ValueError("Invalid router descriptor: 'p' line appears twice")
+        p = parser.read_keyword_line("p", True)
+        seen_keywords.add("p")
+        # "p" SP ("accept" / "reject") SP PortList NL
+        if p:
+          self.exit_policy = stem.exit_policy.MicrodescriptorExitPolicy(p)
+        elif validate: raise ValueError("Router descriptor contains empty 'p' line")
+
+      elif vote and peek_check_kw("m"):
+        # microdescriptor hashes: "m" SP methods 1*(SP algorithm "=" digest) NL
+        m = parser.read_keyword_line("m", True)
+        methods, digests = m.split(" ", 1)
+        method_list = methods.split(",")
+        digest_dict = dict([digest.split("=", 1) for digest in digests.split(" ")])
+        self.microdescriptor_hashes.append((method_list, digest_dict))
+
+      elif validate:
+        raise ValueError("Router descriptor contains unrecognized trailing lines: %s" % parser.line)
+
+      else:
+        self._unrecognized_lines.append(parser.read_line()) # ignore unrecognized lines if we aren't validating
+
+  def get_unrecognized_lines(self):
+    """
+    Returns any unrecognized lines.
+
+    :returns: a list of unrecognized lines
+    """
+
+    return self._unrecognized_lines
+  





More information about the tor-commits mailing list