[tor-commits] [stem/master] Move extrainfo descriptor parsing to helpers

Sun Jan 25 22:37:34 UTC 2015

commit b717bef5bb2a0d8a37fb154df28501af574c04df
Author: Damian Johnson <atagar at torproject.org>
Date:   Wed Jan 14 10:40:27 2015 -0800

    Move extrainfo descriptor parsing to helpers
    
    Intermediate point so we can do the same pattern of lazy loading I did for
    server descriptors.
---
 stem/descriptor/extrainfo_descriptor.py      |  772 ++++++++++++++------------
 test/unit/descriptor/extrainfo_descriptor.py |   28 +-
 2 files changed, 431 insertions(+), 369 deletions(-)

diff --git a/stem/descriptor/extrainfo_descriptor.py b/stem/descriptor/extrainfo_descriptor.py
index d6e6102..408cc92 100644
--- a/stem/descriptor/extrainfo_descriptor.py
+++ b/stem/descriptor/extrainfo_descriptor.py
@@ -69,6 +69,7 @@ Extra-info descriptors are available from a few sources...
   ===================== ===========
 """
 
+import functools
 import hashlib
 import re
 
@@ -225,6 +226,349 @@ def _parse_timestamp_and_interval(keyword, content):
     raise ValueError("%s line's timestamp wasn't parsable: %s" % (keyword, line))
 
 
+def _value(line, entries):
+  return entries[line][0][0]
+
+
+def _values(line, entries):
+  return [entry[0] for entry in entries[line]]
+
+
+def _parse_extra_info_line(descriptor, entries):
+  # "extra-info" Nickname Fingerprint
+
+  value = _value('extra-info', entries)
+  extra_info_comp = value.split()
+
+  if len(extra_info_comp) < 2:
+    raise ValueError('Extra-info line must have two values: extra-info %s' % value)
+  elif not stem.util.tor_tools.is_valid_nickname(extra_info_comp[0]):
+    raise ValueError("Extra-info line entry isn't a valid nickname: %s" % extra_info_comp[0])
+  elif not stem.util.tor_tools.is_valid_fingerprint(extra_info_comp[1]):
+    raise ValueError('Tor relay fingerprints consist of forty hex digits: %s' % extra_info_comp[1])
+
+  descriptor.nickname = extra_info_comp[0]
+  descriptor.fingerprint = extra_info_comp[1]
+
+
+def _parse_geoip_db_digest_line(descriptor, entries):
+  # "geoip-db-digest" Digest
+
+  value = _value('geoip-db-digest', entries)
+
+  if not stem.util.tor_tools.is_hex_digits(value, 40):
+    raise ValueError('Geoip digest line had an invalid sha1 digest: geoip-db-digest %s' % value)
+
+  descriptor.geoip_db_digest = value
+
+
+def _parse_geoip6_db_digest_line(descriptor, entries):
+  # "geoip6-db-digest" Digest
+
+  value = _value('geoip6-db-digest', entries)
+
+  if not stem.util.tor_tools.is_hex_digits(value, 40):
+    raise ValueError('Geoip v6 digest line had an invalid sha1 digest: geoip6-db-digest %s' % value)
+
+  descriptor.geoip6_db_digest = value
+
+
+def _parse_transport_line(descriptor, entries):
+  # "transport" transportname address:port [arglist]
+  # Everything after the transportname is scrubbed in published bridge
+  # descriptors, so we'll never see it in practice.
+  #
+  # These entries really only make sense for bridges, but have been seen
+  # on non-bridges in the wild when the relay operator configured it this
+  # way.
+
+  for value in _values('transport', entries):
+    name, address, port, args = None, None, None, None
+
+    if ' ' not in value:
+      # scrubbed
+      name = value
+    else:
+      # not scrubbed
+      value_comp = value.split()
+
+      if len(value_comp) < 1:
+        raise ValueError('Transport line is missing its transport name: transport %s' % value)
+      elif len(value_comp) < 2:
+        raise ValueError('Transport line is missing its address:port value: transport %s' % value)
+      elif ':' not in value_comp[1]:
+        raise ValueError("Transport line's address:port entry is missing a colon: transport %s" % value)
+
+      name = value_comp[0]
+      address, port_str = value_comp[1].split(':', 1)
+
+      if not stem.util.connection.is_valid_ipv4_address(address) or \
+             stem.util.connection.is_valid_ipv6_address(address):
+        raise ValueError('Transport line has a malformed address: transport %s' % value)
+      elif not stem.util.connection.is_valid_port(port_str):
+        raise ValueError('Transport line has a malformed port: transport %s' % value)
+
+      port = int(port_str)
+      args = value_comp[2:] if len(value_comp) >= 3 else []
+
+    descriptor.transport[name] = (address, port, args)
+
+
+def _parse_cell_circuits_per_decline_line(descriptor, entries):
+  # "cell-circuits-per-decile" num
+
+  value = _value('cell-circuits-per-decile', entries)
+
+  if not value.isdigit():
+    raise ValueError('Non-numeric cell-circuits-per-decile value: %s' % value)
+  elif int(value) < 0:
+    raise ValueError('Negative cell-circuits-per-decile value: %s' % value)
+
+  descriptor.cell_circuits_per_decile = int(value)
+
+
+def _parse_dirreq_line(keyword, recognized_counts_attr, unrecognized_counts_attr, descriptor, entries):
+  value = _value(keyword, entries)
+
+  recognized_counts = {}
+  unrecognized_counts = {}
+
+  is_response_stats = keyword in ('dirreq-v2-resp', 'dirreq-v3-resp')
+  key_set = DirResponse if is_response_stats else DirStat
+
+  key_type = 'STATUS' if is_response_stats else 'STAT'
+  error_msg = '%s lines should contain %s=COUNT mappings: %s %s' % (keyword, key_type, keyword, value)
+
+  if value:
+    for entry in value.split(','):
+      if '=' not in entry:
+        raise ValueError(error_msg)
+
+      status, count = entry.split('=', 1)
+
+      if count.isdigit():
+        if status in key_set:
+          recognized_counts[status] = int(count)
+        else:
+          unrecognized_counts[status] = int(count)
+      else:
+        raise ValueError(error_msg)
+
+  setattr(descriptor, recognized_counts_attr, recognized_counts)
+  setattr(descriptor, unrecognized_counts_attr, unrecognized_counts)
+
+
+def _parse_dirreq_share_line(keyword, attribute, descriptor, entries):
+  value = _value(keyword, entries)
+
+  if not value.endswith('%'):
+    raise ValueError('%s lines should be a percentage: %s %s' % (keyword, keyword, value))
+  elif float(value[:-1]) < 0:
+    raise ValueError('Negative percentage value: %s %s' % (keyword, value))
+
+  # bug means it might be above 100%: https://lists.torproject.org/pipermail/tor-dev/2012-June/003679.html
+
+  setattr(descriptor, attribute, float(value[:-1]) / 100)
+
+
+def _parse_cell_line(keyword, attribute, descriptor, entries):
+  # "<keyword>" num,...,num
+
+  value = _value(keyword, entries)
+  entries, exc = [], None
+
+  if value:
+    for entry in value.split(','):
+      try:
+        # Values should be positive but as discussed in ticket #5849
+        # there was a bug around this. It was fixed in tor 0.2.2.1.
+
+        entries.append(float(entry))
+      except ValueError:
+        exc = ValueError('Non-numeric entry in %s listing: %s %s' % (keyword, keyword, value))
+
+  setattr(descriptor, attribute, entries)
+
+  if exc:
+    raise exc
+
+
+def _parse_timestamp_line(keyword, attribute, descriptor, entries):
+  # "<keyword>" YYYY-MM-DD HH:MM:SS
+
+  value = _value(keyword, entries)
+
+  try:
+    setattr(descriptor, attribute, stem.util.str_tools._parse_timestamp(value))
+  except ValueError:
+    raise ValueError("Timestamp on %s line wasn't parsable: %s %s" % (keyword, keyword, value))
+
+
+def _parse_timestamp_and_interval_line(keyword, end_attribute, interval_attribute, descriptor, entries):
+  # "<keyword>" YYYY-MM-DD HH:MM:SS (NSEC s)
+
+  timestamp, interval, _ = _parse_timestamp_and_interval(keyword, _value(keyword, entries))
+  setattr(descriptor, end_attribute, timestamp)
+  setattr(descriptor, interval_attribute, interval)
+
+
+def _parse_conn_bi_direct_line(descriptor, entries):
+  # "conn-bi-direct" YYYY-MM-DD HH:MM:SS (NSEC s) BELOW,READ,WRITE,BOTH
+
+  value = _value('conn-bi-direct', entries)
+  timestamp, interval, remainder = _parse_timestamp_and_interval('conn-bi-direct', value)
+  stats = remainder.split(',')
+
+  if len(stats) != 4 or not (stats[0].isdigit() and stats[1].isdigit() and stats[2].isdigit() and stats[3].isdigit()):
+    raise ValueError('conn-bi-direct line should end with four numeric values: conn-bi-direct %s' % value)
+
+  descriptor.conn_bi_direct_end = timestamp
+  descriptor.conn_bi_direct_interval = interval
+  descriptor.conn_bi_direct_below = int(stats[0])
+  descriptor.conn_bi_direct_read = int(stats[1])
+  descriptor.conn_bi_direct_write = int(stats[2])
+  descriptor.conn_bi_direct_both = int(stats[3])
+
+
+def _parse_history_line(keyword, end_attribute, interval_attribute, values_attribute, descriptor, entries):
+  # "<keyword>" YYYY-MM-DD HH:MM:SS (NSEC s) NUM,NUM,NUM,NUM,NUM...
+
+  value = _value(keyword, entries)
+  timestamp, interval, remainder = _parse_timestamp_and_interval(keyword, value)
+  history_values = []
+
+  if remainder:
+    try:
+      history_values = [int(entry) for entry in remainder.split(',')]
+    except ValueError:
+      raise ValueError('%s line has non-numeric values: %s %s' % (keyword, keyword, value))
+
+  setattr(descriptor, end_attribute, timestamp)
+  setattr(descriptor, interval_attribute, interval)
+  setattr(descriptor, values_attribute, history_values)
+
+
+def _parse_port_count_line(keyword, attribute, descriptor, entries):
+  # "<keyword>" port=N,port=N,...
+
+  value, port_mappings = _value(keyword, entries), {}
+  error_msg = 'Entries in %s line should only be PORT=N entries: %s %s' % (keyword, keyword, value)
+
+  if value:
+    for entry in value.split(','):
+      if '=' not in entry:
+        raise ValueError(error_msg)
+
+      port, stat = entry.split('=', 1)
+
+      if (port == 'other' or stem.util.connection.is_valid_port(port)) and stat.isdigit():
+        if port != 'other':
+          port = int(port)
+
+        port_mappings[port] = int(stat)
+      else:
+        raise ValueError(error_msg)
+
+  setattr(descriptor, attribute, port_mappings)
+
+
+def _parse_geoip_to_count_line(keyword, attribute, descriptor, entries):
+  # "<keyword>" CC=N,CC=N,...
+  #
+  # The maxmind geoip (https://www.maxmind.com/app/iso3166) has numeric
+  # locale codes for some special values, for instance...
+  #   A1,"Anonymous Proxy"
+  #   A2,"Satellite Provider"
+  #   ??,"Unknown"
+
+  value, locale_usage = _value(keyword, entries), {}
+  error_msg = 'Entries in %s line should only be CC=N entries: %s %s' % (keyword, keyword, value)
+
+  if value:
+    for entry in value.split(','):
+      if '=' not in entry:
+        raise ValueError(error_msg)
+
+      locale, count = entry.split('=', 1)
+
+      if _locale_re.match(locale) and count.isdigit():
+        locale_usage[locale] = int(count)
+      else:
+        raise ValueError(error_msg)
+
+  setattr(descriptor, attribute, locale_usage)
+
+
+def _parse_bridge_ip_versions_line(descriptor, entries):
+  value, ip_versions = _value('bridge-ip-versions', entries), {}
+
+  if value:
+    for entry in value.split(','):
+      if '=' not in entry:
+        raise stem.ProtocolError("The bridge-ip-versions should be a comma separated listing of '<protocol>=<count>' mappings: bridge-ip-versions %s" % value)
+
+      protocol, count = entry.split('=', 1)
+
+      if not count.isdigit():
+        raise stem.ProtocolError('IP protocol count was non-numeric (%s): bridge-ip-versions %s' % (count, value))
+
+      ip_versions[protocol] = int(count)
+
+  descriptor.ip_versions = ip_versions
+
+
+def _parse_bridge_ip_transports_line(descriptor, entries):
+  value, ip_transports = _value('bridge-ip-transports', entries), {}
+
+  if value:
+    for entry in value.split(','):
+      if '=' not in entry:
+        raise stem.ProtocolError("The bridge-ip-transports should be a comma separated listing of '<protocol>=<count>' mappings: bridge-ip-transports %s" % value)
+
+      protocol, count = entry.split('=', 1)
+
+      if not count.isdigit():
+        raise stem.ProtocolError('Transport count was non-numeric (%s): bridge-ip-transports %s' % (count, value))
+
+      ip_transports[protocol] = int(count)
+
+  descriptor.ip_transports = ip_transports
+
+
+_parse_dirreq_v2_resp_line = functools.partial(_parse_dirreq_line, 'dirreq-v2-resp', 'dir_v2_responses', 'dir_v2_responses_unknown')
+_parse_dirreq_v3_resp_line = functools.partial(_parse_dirreq_line, 'dirreq-v3-resp', 'dir_v3_responses', 'dir_v3_responses_unknown')
+_parse_dirreq_v2_direct_dl_line = functools.partial(_parse_dirreq_line, 'dirreq-v2-direct-dl', 'dir_v2_direct_dl', 'dir_v2_direct_dl_unknown')
+_parse_dirreq_v3_direct_dl_line = functools.partial(_parse_dirreq_line, 'dirreq-v3-direct-dl', 'dir_v3_direct_dl', 'dir_v3_direct_dl_unknown')
+_parse_dirreq_v2_tunneled_dl_line = functools.partial(_parse_dirreq_line, 'dirreq-v2-tunneled-dl', 'dir_v2_tunneled_dl', 'dir_v2_tunneled_dl_unknown')
+_parse_dirreq_v3_tunneled_dl_line = functools.partial(_parse_dirreq_line, 'dirreq-v3-tunneled-dl', 'dir_v3_tunneled_dl', 'dir_v3_tunneled_dl_unknown')
+_parse_dirreq_v2_share_line = functools.partial(_parse_dirreq_share_line, 'dirreq-v2-share', 'dir_v2_share')
+_parse_dirreq_v3_share_line = functools.partial(_parse_dirreq_share_line, 'dirreq-v3-share', 'dir_v3_share')
+_parse_cell_processed_cells_line = functools.partial(_parse_cell_line, 'cell-processed-cells', 'cell_processed_cells')
+_parse_cell_queued_cells_line = functools.partial(_parse_cell_line, 'cell-queued-cells', 'cell_queued_cells')
+_parse_cell_time_in_queue_line = functools.partial(_parse_cell_line, 'cell-time-in-queue', 'cell_time_in_queue')
+_parse_published_line = functools.partial(_parse_timestamp_line, 'published', 'published')
+_parse_geoip_start_time_line = functools.partial(_parse_timestamp_line, 'geoip-start-time', 'geoip_start_time')
+_parse_cell_stats_end_line = functools.partial(_parse_timestamp_and_interval_line, 'cell-stats-end', 'cell_stats_end', 'cell_stats_interval')
+_parse_entry_stats_end_line = functools.partial(_parse_timestamp_and_interval_line, 'entry-stats-end', 'entry_stats_end', 'entry_stats_interval')
+_parse_exit_stats_end_line = functools.partial(_parse_timestamp_and_interval_line, 'exit-stats-end', 'exit_stats_end', 'exit_stats_interval')
+_parse_bridge_stats_end_line = functools.partial(_parse_timestamp_and_interval_line, 'bridge-stats-end', 'bridge_stats_end', 'bridge_stats_interval')
+_parse_dirreq_stats_end_line = functools.partial(_parse_timestamp_and_interval_line, 'dirreq-stats-end', 'dir_stats_end', 'dir_stats_interval')
+_parse_read_history_line = functools.partial(_parse_history_line, 'read-history', 'read_history_end', 'read_history_interval', 'read_history_values')
+_parse_write_history_line = functools.partial(_parse_history_line, 'write-history', 'write_history_end', 'write_history_interval', 'write_history_values')
+_parse_dirreq_read_history_line = functools.partial(_parse_history_line, 'dirreq-read-history', 'dir_read_history_end', 'dir_read_history_interval', 'dir_read_history_values')
+_parse_dirreq_write_history_line = functools.partial(_parse_history_line, 'dirreq-write-history', 'dir_write_history_end', 'dir_write_history_interval', 'dir_write_history_values')
+_parse_exit_kibibytes_written_line = functools.partial(_parse_port_count_line, 'exit-kibibytes-written', 'exit_kibibytes_written')
+_parse_exit_kibibytes_read_line = functools.partial(_parse_port_count_line, 'exit-kibibytes-read', 'exit_kibibytes_read')
+_parse_exit_streams_opened_line = functools.partial(_parse_port_count_line, 'exit-streams-opened', 'exit_streams_opened')
+_parse_dirreq_v2_ips_line = functools.partial(_parse_geoip_to_count_line, 'dirreq-v2-ips', 'dir_v2_ips')
+_parse_dirreq_v3_ips_line = functools.partial(_parse_geoip_to_count_line, 'dirreq-v3-ips', 'dir_v3_ips')
+_parse_dirreq_v2_reqs_line = functools.partial(_parse_geoip_to_count_line, 'dirreq-v2-reqs', 'dir_v2_requests')
+_parse_dirreq_v3_reqs_line = functools.partial(_parse_geoip_to_count_line, 'dirreq-v3-reqs', 'dir_v3_requests')
+_parse_geoip_client_origins_line = functools.partial(_parse_geoip_to_count_line, 'geoip-client-origins', 'geoip_client_origins')
+_parse_entry_ips_line = functools.partial(_parse_geoip_to_count_line, 'entry-ips', 'entry_ips')
+_parse_bridge_ips_line = functools.partial(_parse_geoip_to_count_line, 'bridge-ips', 'bridge_ips')
+
+
 class ExtraInfoDescriptor(Descriptor):
   """
   Extra-info descriptor document.
@@ -465,376 +809,94 @@ class ExtraInfoDescriptor(Descriptor):
       value, _, _ = values[0]
       line = '%s %s' % (keyword, value)  # original line
 
-      if keyword == 'extra-info':
-        # "extra-info" Nickname Fingerprint
-        extra_info_comp = value.split()
-
-        if len(extra_info_comp) < 2:
-          if not validate:
-            continue
-
-          raise ValueError('Extra-info line must have two values: %s' % line)
-
-        if validate:
-          if not stem.util.tor_tools.is_valid_nickname(extra_info_comp[0]):
-            raise ValueError("Extra-info line entry isn't a valid nickname: %s" % extra_info_comp[0])
-          elif not stem.util.tor_tools.is_valid_fingerprint(extra_info_comp[1]):
-            raise ValueError('Tor relay fingerprints consist of forty hex digits: %s' % extra_info_comp[1])
-
-        self.nickname = extra_info_comp[0]
-        self.fingerprint = extra_info_comp[1]
-      elif keyword == 'geoip-db-digest':
-        # "geoip-db-digest" Digest
-
-        if validate and not stem.util.tor_tools.is_hex_digits(value, 40):
-          raise ValueError('Geoip digest line had an invalid sha1 digest: %s' % line)
-
-        self.geoip_db_digest = value
-      elif keyword == 'geoip6-db-digest':
-        # "geoip6-db-digest" Digest
-
-        if validate and not stem.util.tor_tools.is_hex_digits(value, 40):
-          raise ValueError('Geoip v6 digest line had an invalid sha1 digest: %s' % line)
-
-        self.geoip6_db_digest = value
-      elif keyword == 'transport':
-        # "transport" transportname address:port [arglist]
-        # Everything after the transportname is scrubbed in published bridge
-        # descriptors, so we'll never see it in practice.
-        #
-        # These entries really only make sense for bridges, but have been seen
-        # on non-bridges in the wild when the relay operator configured it this
-        # way.
-
-        for transport_value, _, _ in values:
-          name, address, port, args = None, None, None, None
-
-          if ' ' not in transport_value:
-            # scrubbed
-            name = transport_value
-          else:
-            # not scrubbed
-            value_comp = transport_value.split()
-
-            if len(value_comp) < 1:
-              raise ValueError('Transport line is missing its transport name: %s' % line)
-            else:
-              name = value_comp[0]
-
-            if len(value_comp) < 2:
-              raise ValueError('Transport line is missing its address:port value: %s' % line)
-            elif ':' not in value_comp[1]:
-              raise ValueError("Transport line's address:port entry is missing a colon: %s" % line)
-            else:
-              address, port_str = value_comp[1].split(':', 1)
-
-              if not stem.util.connection.is_valid_ipv4_address(address) or \
-                     stem.util.connection.is_valid_ipv6_address(address):
-                raise ValueError('Transport line has a malformed address: %s' % line)
-              elif not stem.util.connection.is_valid_port(port_str):
-                raise ValueError('Transport line has a malformed port: %s' % line)
-
-              port = int(port_str)
-
-            if len(value_comp) >= 3:
-              args = value_comp[2:]
-            else:
-              args = []
-
-          self.transport[name] = (address, port, args)
-      elif keyword == 'cell-circuits-per-decile':
-        # "cell-circuits-per-decile" num
-
-        if not value.isdigit():
-          if validate:
-            raise ValueError('Non-numeric cell-circuits-per-decile value: %s' % line)
-          else:
-            continue
-
-        stat = int(value)
-
-        if validate and stat < 0:
-          raise ValueError('Negative cell-circuits-per-decile value: %s' % line)
-
-        self.cell_circuits_per_decile = stat
-      elif keyword in ('dirreq-v2-resp', 'dirreq-v3-resp', 'dirreq-v2-direct-dl', 'dirreq-v3-direct-dl', 'dirreq-v2-tunneled-dl', 'dirreq-v3-tunneled-dl'):
-        recognized_counts = {}
-        unrecognized_counts = {}
-
-        is_response_stats = keyword in ('dirreq-v2-resp', 'dirreq-v3-resp')
-        key_set = DirResponse if is_response_stats else DirStat
-
-        key_type = 'STATUS' if is_response_stats else 'STAT'
-        error_msg = '%s lines should contain %s=COUNT mappings: %s' % (keyword, key_type, line)
-
-        if value:
-          for entry in value.split(','):
-            if '=' not in entry:
-              if validate:
-                raise ValueError(error_msg)
-              else:
-                continue
-
-            status, count = entry.split('=', 1)
-
-            if count.isdigit():
-              if status in key_set:
-                recognized_counts[status] = int(count)
-              else:
-                unrecognized_counts[status] = int(count)
-            elif validate:
-              raise ValueError(error_msg)
-
-        if keyword == 'dirreq-v2-resp':
-          self.dir_v2_responses = recognized_counts
-          self.dir_v2_responses_unknown = unrecognized_counts
+      try:
+        if keyword == 'extra-info':
+          _parse_extra_info_line(self, entries)
+        elif keyword == 'geoip-db-digest':
+          _parse_geoip_db_digest_line(self, entries)
+        elif keyword == 'geoip6-db-digest':
+          _parse_geoip6_db_digest_line(self, entries)
+        elif keyword == 'transport':
+          _parse_transport_line(self, entries)
+        elif keyword == 'cell-circuits-per-decile':
+          _parse_cell_circuits_per_decline_line(self, entries)
+        elif keyword == 'dirreq-v2-resp':
+          _parse_dirreq_v2_resp_line(self, entries)
         elif keyword == 'dirreq-v3-resp':
-          self.dir_v3_responses = recognized_counts
-          self.dir_v3_responses_unknown = unrecognized_counts
+          _parse_dirreq_v3_resp_line(self, entries)
         elif keyword == 'dirreq-v2-direct-dl':
-          self.dir_v2_direct_dl = recognized_counts
-          self.dir_v2_direct_dl_unknown = unrecognized_counts
+          _parse_dirreq_v2_direct_dl_line(self, entries)
         elif keyword == 'dirreq-v3-direct-dl':
-          self.dir_v3_direct_dl = recognized_counts
-          self.dir_v3_direct_dl_unknown = unrecognized_counts
+          _parse_dirreq_v3_direct_dl_line(self, entries)
         elif keyword == 'dirreq-v2-tunneled-dl':
-          self.dir_v2_tunneled_dl = recognized_counts
-          self.dir_v2_tunneled_dl_unknown = unrecognized_counts
+          _parse_dirreq_v2_tunneled_dl_line(self, entries)
         elif keyword == 'dirreq-v3-tunneled-dl':
-          self.dir_v3_tunneled_dl = recognized_counts
-          self.dir_v3_tunneled_dl_unknown = unrecognized_counts
-      elif keyword in ('dirreq-v2-share', 'dirreq-v3-share'):
-        # "<keyword>" num%
-
-        try:
-          if not value.endswith('%'):
-            raise ValueError()
-
-          percentage = float(value[:-1]) / 100
-
-          # Bug lets these be above 100%, however they're soon going away...
-          # https://lists.torproject.org/pipermail/tor-dev/2012-June/003679.html
-
-          if validate and percentage < 0:
-            raise ValueError('Negative percentage value: %s' % line)
-
-          if keyword == 'dirreq-v2-share':
-            self.dir_v2_share = percentage
-          elif keyword == 'dirreq-v3-share':
-            self.dir_v3_share = percentage
-        except ValueError as exc:
-          if validate:
-            raise ValueError("Value can't be parsed as a percentage: %s" % line)
-      elif keyword in ('cell-processed-cells', 'cell-queued-cells', 'cell-time-in-queue'):
-        # "<keyword>" num,...,num
-
-        entries = []
-
-        if value:
-          for entry in value.split(','):
-            try:
-              # Values should be positive but as discussed in ticket #5849
-              # there was a bug around this. It was fixed in tor 0.2.2.1.
-
-              entries.append(float(entry))
-            except ValueError:
-              if validate:
-                raise ValueError('Non-numeric entry in %s listing: %s' % (keyword, line))
-
-        if keyword == 'cell-processed-cells':
-          self.cell_processed_cells = entries
+          _parse_dirreq_v3_tunneled_dl_line(self, entries)
+        elif keyword == 'dirreq-v2-share':
+          _parse_dirreq_v2_share_line(self, entries)
+        elif keyword == 'dirreq-v3-share':
+          _parse_dirreq_v3_share_line(self, entries)
+        elif keyword == 'cell-processed-cells':
+          _parse_cell_processed_cells_line(self, entries)
         elif keyword == 'cell-queued-cells':
-          self.cell_queued_cells = entries
+          _parse_cell_queued_cells_line(self, entries)
         elif keyword == 'cell-time-in-queue':
-          self.cell_time_in_queue = entries
-      elif keyword in ('published', 'geoip-start-time'):
-        # "<keyword>" YYYY-MM-DD HH:MM:SS
-
-        try:
-          timestamp = stem.util.str_tools._parse_timestamp(value)
-
-          if keyword == 'published':
-            self.published = timestamp
-          elif keyword == 'geoip-start-time':
-            self.geoip_start_time = timestamp
-        except ValueError:
-          if validate:
-            raise ValueError("Timestamp on %s line wasn't parsable: %s" % (keyword, line))
-      elif keyword in ('cell-stats-end', 'entry-stats-end', 'exit-stats-end', 'bridge-stats-end', 'dirreq-stats-end'):
-        # "<keyword>" YYYY-MM-DD HH:MM:SS (NSEC s)
-
-        try:
-          timestamp, interval, _ = _parse_timestamp_and_interval(keyword, value)
-
-          if keyword == 'cell-stats-end':
-            self.cell_stats_end = timestamp
-            self.cell_stats_interval = interval
-          elif keyword == 'entry-stats-end':
-            self.entry_stats_end = timestamp
-            self.entry_stats_interval = interval
-          elif keyword == 'exit-stats-end':
-            self.exit_stats_end = timestamp
-            self.exit_stats_interval = interval
-          elif keyword == 'bridge-stats-end':
-            self.bridge_stats_end = timestamp
-            self.bridge_stats_interval = interval
-          elif keyword == 'dirreq-stats-end':
-            self.dir_stats_end = timestamp
-            self.dir_stats_interval = interval
-        except ValueError as exc:
-          if validate:
-            raise exc
-      elif keyword == 'conn-bi-direct':
-        # "conn-bi-direct" YYYY-MM-DD HH:MM:SS (NSEC s) BELOW,READ,WRITE,BOTH
-
-        try:
-          timestamp, interval, remainder = _parse_timestamp_and_interval(keyword, value)
-          stats = remainder.split(',')
-
-          if len(stats) != 4 or not \
-            (stats[0].isdigit() and stats[1].isdigit() and stats[2].isdigit() and stats[3].isdigit()):
-            raise ValueError('conn-bi-direct line should end with four numeric values: %s' % line)
-
-          self.conn_bi_direct_end = timestamp
-          self.conn_bi_direct_interval = interval
-          self.conn_bi_direct_below = int(stats[0])
-          self.conn_bi_direct_read = int(stats[1])
-          self.conn_bi_direct_write = int(stats[2])
-          self.conn_bi_direct_both = int(stats[3])
-        except ValueError as exc:
-          if validate:
-            raise exc
-      elif keyword in ('read-history', 'write-history', 'dirreq-read-history', 'dirreq-write-history'):
-        # "<keyword>" YYYY-MM-DD HH:MM:SS (NSEC s) NUM,NUM,NUM,NUM,NUM...
-        try:
-          timestamp, interval, remainder = _parse_timestamp_and_interval(keyword, value)
-          history_values = []
-
-          if remainder:
-            try:
-              history_values = [int(entry) for entry in remainder.split(",")]
-            except ValueError:
-              raise ValueError('%s line has non-numeric values: %s' % (keyword, line))
-
-          if keyword == 'read-history':
-            self.read_history_end = timestamp
-            self.read_history_interval = interval
-            self.read_history_values = history_values
-          elif keyword == 'write-history':
-            self.write_history_end = timestamp
-            self.write_history_interval = interval
-            self.write_history_values = history_values
-          elif keyword == 'dirreq-read-history':
-            self.dir_read_history_end = timestamp
-            self.dir_read_history_interval = interval
-            self.dir_read_history_values = history_values
-          elif keyword == 'dirreq-write-history':
-            self.dir_write_history_end = timestamp
-            self.dir_write_history_interval = interval
-            self.dir_write_history_values = history_values
-        except ValueError as exc:
-          if validate:
-            raise exc
-      elif keyword in ('exit-kibibytes-written', 'exit-kibibytes-read', 'exit-streams-opened'):
-        # "<keyword>" port=N,port=N,...
-
-        port_mappings = {}
-        error_msg = 'Entries in %s line should only be PORT=N entries: %s' % (keyword, line)
-
-        if value:
-          for entry in value.split(','):
-            if '=' not in entry:
-              if validate:
-                raise ValueError(error_msg)
-              else:
-                continue
-
-            port, stat = entry.split('=', 1)
-
-            if (port == 'other' or stem.util.connection.is_valid_port(port)) and stat.isdigit():
-              if port != 'other':
-                port = int(port)
-              port_mappings[port] = int(stat)
-            elif validate:
-              raise ValueError(error_msg)
-
-        if keyword == 'exit-kibibytes-written':
-          self.exit_kibibytes_written = port_mappings
+          _parse_cell_time_in_queue_line(self, entries)
+        elif keyword == 'published':
+          _parse_published_line(self, entries)
+        elif keyword == 'geoip-start-time':
+          _parse_geoip_start_time_line(self, entries)
+        elif keyword == 'cell-stats-end':
+          _parse_cell_stats_end_line(self, entries)
+        elif keyword == 'entry-stats-end':
+          _parse_entry_stats_end_line(self, entries)
+        elif keyword == 'exit-stats-end':
+          _parse_exit_stats_end_line(self, entries)
+        elif keyword == 'bridge-stats-end':
+          _parse_bridge_stats_end_line(self, entries)
+        elif keyword == 'dirreq-stats-end':
+          _parse_dirreq_stats_end_line(self, entries)
+        elif keyword == 'conn-bi-direct':
+          _parse_conn_bi_direct_line(self, entries)
+        elif keyword == 'read-history':
+          _parse_read_history_line(self, entries)
+        elif keyword == 'write-history':
+          _parse_write_history_line(self, entries)
+        elif keyword == 'dirreq-read-history':
+          _parse_dirreq_read_history_line(self, entries)
+        elif keyword == 'dirreq-write-history':
+          _parse_dirreq_write_history_line(self, entries)
+        elif keyword == 'exit-kibibytes-written':
+          _parse_exit_kibibytes_written_line(self, entries)
         elif keyword == 'exit-kibibytes-read':
-          self.exit_kibibytes_read = port_mappings
+          _parse_exit_kibibytes_read_line(self, entries)
         elif keyword == 'exit-streams-opened':
-          self.exit_streams_opened = port_mappings
-      elif keyword in ('dirreq-v2-ips', 'dirreq-v3-ips', 'dirreq-v2-reqs', 'dirreq-v3-reqs', 'geoip-client-origins', 'entry-ips', 'bridge-ips'):
-        # "<keyword>" CC=N,CC=N,...
-        #
-        # The maxmind geoip (https://www.maxmind.com/app/iso3166) has numeric
-        # locale codes for some special values, for instance...
-        #   A1,"Anonymous Proxy"
-        #   A2,"Satellite Provider"
-        #   ??,"Unknown"
-
-        locale_usage = {}
-        error_msg = 'Entries in %s line should only be CC=N entries: %s' % (keyword, line)
-
-        if value:
-          for entry in value.split(','):
-            if '=' not in entry:
-              if validate:
-                raise ValueError(error_msg)
-              else:
-                continue
-
-            locale, count = entry.split('=', 1)
-
-            if _locale_re.match(locale) and count.isdigit():
-              locale_usage[locale] = int(count)
-            elif validate:
-              raise ValueError(error_msg)
-
-        if keyword == 'dirreq-v2-ips':
-          self.dir_v2_ips = locale_usage
+          _parse_exit_streams_opened_line(self, entries)
+        elif keyword == 'dirreq-v2-ips':
+          _parse_dirreq_v2_ips_line(self, entries)
         elif keyword == 'dirreq-v3-ips':
-          self.dir_v3_ips = locale_usage
+          _parse_dirreq_v3_ips_line(self, entries)
         elif keyword == 'dirreq-v2-reqs':
-          self.dir_v2_requests = locale_usage
+          _parse_dirreq_v2_reqs_line(self, entries)
         elif keyword == 'dirreq-v3-reqs':
-          self.dir_v3_requests = locale_usage
+          _parse_dirreq_v3_reqs_line(self, entries)
         elif keyword == 'geoip-client-origins':
-          self.geoip_client_origins = locale_usage
+          _parse_geoip_client_origins_line(self, entries)
         elif keyword == 'entry-ips':
-          self.entry_ips = locale_usage
+          _parse_entry_ips_line(self, entries)
         elif keyword == 'bridge-ips':
-          self.bridge_ips = locale_usage
-      elif keyword == 'bridge-ip-versions':
-        self.ip_versions = {}
-
-        if value:
-          for entry in value.split(','):
-            if '=' not in entry:
-              raise stem.ProtocolError("The bridge-ip-versions should be a comma separated listing of '<protocol>=<count>' mappings: %s" % line)
-
-            protocol, count = entry.split('=', 1)
-
-            if not count.isdigit():
-              raise stem.ProtocolError('IP protocol count was non-numeric (%s): %s' % (count, line))
-
-            self.ip_versions[protocol] = int(count)
-      elif keyword == 'bridge-ip-transports':
-        self.ip_transports = {}
-
-        if value:
-          for entry in value.split(','):
-            if '=' not in entry:
-              raise stem.ProtocolError("The bridge-ip-transports should be a comma separated listing of '<protocol>=<count>' mappings: %s" % line)
-
-            protocol, count = entry.split('=', 1)
-
-            if not count.isdigit():
-              raise stem.ProtocolError('Transport count was non-numeric (%s): %s' % (count, line))
-
-            self.ip_transports[protocol] = int(count)
-      else:
-        self._unrecognized_lines.append(line)
+          _parse_bridge_ips_line(self, entries)
+        elif keyword == 'bridge-ip-versions':
+          _parse_bridge_ip_versions_line(self, entries)
+        elif keyword == 'bridge-ip-transports':
+          _parse_bridge_ip_transports_line(self, entries)
+        else:
+          self._unrecognized_lines.append(line)
+      except ValueError as exc:
+        if validate:
+          raise exc
+        else:
+          continue
 
   def digest(self):
     """
diff --git a/test/unit/descriptor/extrainfo_descriptor.py b/test/unit/descriptor/extrainfo_descriptor.py
index 7e67019..525ff06 100644
--- a/test/unit/descriptor/extrainfo_descriptor.py
+++ b/test/unit/descriptor/extrainfo_descriptor.py
@@ -200,10 +200,10 @@ k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
 
     for entry in test_entries:
       desc_text = get_relay_extrainfo_descriptor({'geoip-db-digest': entry}, content = True)
-      self._expect_invalid_attr(desc_text, 'geoip_db_digest', entry)
+      self._expect_invalid_attr(desc_text, 'geoip_db_digest')
 
       desc_text = get_relay_extrainfo_descriptor({'geoip6-db-digest': entry}, content = True)
-      self._expect_invalid_attr(desc_text, 'geoip6_db_digest', entry)
+      self._expect_invalid_attr(desc_text, 'geoip6_db_digest')
 
   def test_cell_circuits_per_decile(self):
     """
@@ -257,8 +257,8 @@ k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
       for entry in test_entries:
         desc_text = get_relay_extrainfo_descriptor({keyword: entry}, content = True)
         desc = self._expect_invalid_attr(desc_text)
-        self.assertEqual({}, getattr(desc, attr))
-        self.assertEqual({}, getattr(desc, unknown_attr))
+        self.assertEqual(None, getattr(desc, attr))
+        self.assertEqual(None, getattr(desc, unknown_attr))
 
   def test_dir_stat_lines(self):
     """
@@ -299,8 +299,8 @@ k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
       for entry in test_entries:
         desc_text = get_relay_extrainfo_descriptor({keyword: entry}, content = True)
         desc = self._expect_invalid_attr(desc_text)
-        self.assertEqual({}, getattr(desc, attr))
-        self.assertEqual({}, getattr(desc, unknown_attr))
+        self.assertEqual(None, getattr(desc, attr))
+        self.assertEqual(None, getattr(desc, unknown_attr))
 
   def test_conn_bi_direct(self):
     """
@@ -360,15 +360,15 @@ k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
         self.assertEqual(expected_value, getattr(desc, attr))
 
       test_entries = (
-        ('', None),
-        (' ', None),
-        ('100', None),
-        ('-5%', -0.05),
+        (''),
+        (' '),
+        ('100'),
+        ('-5%'),
       )
 
-      for entry, expected in test_entries:
+      for entry in test_entries:
         desc_text = get_relay_extrainfo_descriptor({keyword: entry}, content = True)
-        self._expect_invalid_attr(desc_text, attr, expected)
+        self._expect_invalid_attr(desc_text, attr)
 
   def test_number_list_lines(self):
     """
@@ -525,7 +525,7 @@ k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
 
       for entry in test_entries:
         desc_text = get_relay_extrainfo_descriptor({keyword: entry}, content = True)
-        self._expect_invalid_attr(desc_text, attr, {})
+        self._expect_invalid_attr(desc_text, attr)
 
   def test_locale_mapping_lines(self):
     """
@@ -554,7 +554,7 @@ k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
 
       for entry in test_entries:
         desc_text = get_relay_extrainfo_descriptor({keyword: entry}, content = True)
-        self._expect_invalid_attr(desc_text, attr, {})
+        self._expect_invalid_attr(desc_text, attr)
 
   def test_minimal_bridge_descriptor(self):
     """