[tor-commits] [stem/master] Explicit non-ascii content validation

Sat Mar 26 23:54:16 UTC 2016

commit 27a654e2e0495c7ac701d792e7db7ed47d4e9753
Author: Damian Johnson <atagar at torproject.org>
Date:   Sat Mar 26 16:14:01 2016 -0700

    Explicit non-ascii content validation
    
    Few hours ago a relay started publishing a malformed extrainfo descriptor...
    
      https://trac.torproject.org/projects/tor/ticket/18656
    
    This is fine, bugs happen. This is why we check for malformed content. But
    this one cost me a few hours since the non-ascii content then caused DocTor to
    choke, providing me a useless stacktrace...
    
      Traceback (most recent call last):
        File "./descriptor_checker.py", line 99, in <module>
          main()
        File "./descriptor_checker.py", line 57, in main
          log.warn("Unable to retrieve the %s: %s" % (descriptor_type, query.error))
      UnicodeEncodeError: 'ascii' codec can't encode character u'\ufffd' in position 76: ordinal not in range(128)
    
    Instead now it provides...
    
      source: http://86.59.21.38:80/tor/extra/all.z
      time: 03/26/2016 16:30
      error: 'dirreq-v3-reqs' line had non-ascii content: S?=4026597208,S?=4026597208,S?=4026597208,S?=4026597208,S?=4026597208,S?=4026597208,??=4026591624,6?=4026537520,6?=4026537520,6?=4026537520,us=8
    
    Non-ascii strings are toxic for systems they're in. They break printing,
    logging, exception handling, and anything else that touches them unless they're
    handled specially.
    
    Changing Stem to explicitly validate that content is ascii, and provide the
    user with escaped strings in those exceptions.
---
 docs/change_log.rst                                |  1 +
 stem/descriptor/__init__.py                        | 11 +++++++-
 stem/descriptor/hidden_service_descriptor.py       |  2 +-
 stem/descriptor/server_descriptor.py               |  2 +-
 test/integ/process.py                              |  2 +-
 .../descriptor/data/extrainfo_nonascii_v3_reqs     | 30 ++++++++++++++++++++++
 test/unit/descriptor/extrainfo_descriptor.py       | 13 ++++++++++
 test/unit/descriptor/reader.py                     |  2 +-
 8 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/docs/change_log.rst b/docs/change_log.rst
index 76143bb..4317307 100644
--- a/docs/change_log.rst
+++ b/docs/change_log.rst
@@ -73,6 +73,7 @@ The following are only available within Stem's `git repository
   * Deprecated :func:`~stem.descriptor.remote.DescriptorDownloader.get_microdescriptors` as it was never implemented in tor (:trac:`9271`)
   * Fixed parsing of server descriptor's *allow-single-hop-exits* and *caches-extra-info* lines
   * Bracketed IPv6 addresses were mistreated as being invalid content
+  * Better validation for non-ascii descriptor content
   * Updated dannenberg's v3ident (:trac:`17906`)
 
  * **Utilities**
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index dc7c1df..65e6ed4 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -42,6 +42,7 @@ import copy
 import hashlib
 import os
 import re
+import string
 import tarfile
 
 import stem.prereq
@@ -777,7 +778,7 @@ def _get_pseudo_pgp_block(remaining_contents):
     return None
 
 
-def _get_descriptor_components(raw_contents, validate, extra_keywords = ()):
+def _get_descriptor_components(raw_contents, validate, extra_keywords = (), non_ascii_fields = ()):
   """
   Initial breakup of the server descriptor contents to make parsing easier.
 
@@ -796,6 +797,7 @@ def _get_descriptor_components(raw_contents, validate, extra_keywords = ()):
     True, skips these checks otherwise
   :param list extra_keywords: entity keywords to put into a separate listing
     with ordering intact
+  :param list non_ascii_fields: fields containing non-ascii content
 
   :returns:
     **collections.OrderedDict** with the 'keyword => (value, pgp key) entries'
@@ -857,6 +859,13 @@ def _get_descriptor_components(raw_contents, validate, extra_keywords = ()):
 
       raise
 
+    if validate and keyword not in non_ascii_fields:
+      try:
+        value.decode('ascii')
+      except UnicodeError:
+        replaced = ''.join([(char if char in string.printable else '?') for char in value])
+        raise ValueError("'%s' line had non-ascii content: %s" % (keyword, replaced))
+
     if keyword in extra_keywords:
       extra_entries.append('%s %s' % (keyword, value))
     else:
diff --git a/stem/descriptor/hidden_service_descriptor.py b/stem/descriptor/hidden_service_descriptor.py
index de7f0e0..43a3e4f 100644
--- a/stem/descriptor/hidden_service_descriptor.py
+++ b/stem/descriptor/hidden_service_descriptor.py
@@ -227,7 +227,7 @@ class HiddenServiceDescriptor(Descriptor):
 
   def __init__(self, raw_contents, validate = False):
     super(HiddenServiceDescriptor, self).__init__(raw_contents, lazy_load = not validate)
-    entries = _get_descriptor_components(raw_contents, validate)
+    entries = _get_descriptor_components(raw_contents, validate, non_ascii_fields = ('introduction-points'))
 
     if validate:
       for keyword in REQUIRED_FIELDS:
diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py
index f096499..5749832 100644
--- a/stem/descriptor/server_descriptor.py
+++ b/stem/descriptor/server_descriptor.py
@@ -553,7 +553,7 @@ class ServerDescriptor(Descriptor):
     # influences the resulting exit policy, but for everything else the order
     # does not matter so breaking it into key / value pairs.
 
-    entries, self._unparsed_exit_policy = _get_descriptor_components(stem.util.str_tools._to_unicode(raw_contents), validate, ('accept', 'reject'))
+    entries, self._unparsed_exit_policy = _get_descriptor_components(stem.util.str_tools._to_unicode(raw_contents), validate, extra_keywords = ('accept', 'reject'), non_ascii_fields = ('contact', 'platform'))
 
     if validate:
       self._parse(entries, validate)
diff --git a/test/integ/process.py b/test/integ/process.py
index 1cce8c7..d8e6b82 100644
--- a/test/integ/process.py
+++ b/test/integ/process.py
@@ -382,7 +382,7 @@ class TestProcess(unittest.TestCase):
     runtime = time.time() - start_time
 
     if not (runtime > 2 and runtime < 3):
-      self.fail('Test should have taken 2-3 seconds, took %i instead' % runtime)
+      self.fail('Test should have taken 2-3 seconds, took %0.1f instead' % runtime)
 
   @require_version(stem.version.Requirement.TAKEOWNERSHIP)
   @only_run_once
diff --git a/test/unit/descriptor/data/extrainfo_nonascii_v3_reqs b/test/unit/descriptor/data/extrainfo_nonascii_v3_reqs
new file mode 100644
index 0000000..c69d59e
--- /dev/null
+++ b/test/unit/descriptor/data/extrainfo_nonascii_v3_reqs
@@ -0,0 +1,30 @@
+extra-info newton C871C91489886D5E2E94C13EA1A5FDC4B6DC5204
+identity-ed25519
+-----BEGIN ED25519 CERT-----
+AQQABjBoAS6cmIIt3t0HjlInkuk+asDf7yAuTXhg20Gdy4uesKH3AQAgBAC6lLwz
+7+yzoMFGDKic6l7kOjyGlV3H01GWtErUWy0PIquP5kMVHnrvdhsBwX0zwSA19Bo+
+tdKPIKrMV+89QBGX2KSa7RRXqIdnGBnGdodglrvMFPGgO/LYHOb9YogFJgA=
+-----END ED25519 CERT-----
+published 2016-03-26 07:47:53
+write-history 2016-03-26 05:42:08 (14400 s) 682447872,1737454592,1207077888,1278699520,1157439488,635725824
+read-history 2016-03-26 05:42:08 (14400 s) 672482304,1680056320,1198174208,1233503232,1105783808,620540928
+dirreq-write-history 2016-03-26 05:42:08 (14400 s) 1960960,46080,2144256,218112,737280,3273728
+dirreq-read-history 2016-03-26 05:42:08 (14400 s) 322560,13312,323584,8192,46080,440320
+geoip-db-digest 6346E26E2BC96F8511588CE2695E9B0339A75D32
+geoip6-db-digest 43CCB43DBC653D8CC16396A882C5F116A6004F0C
+dirreq-stats-end 2016-03-25 19:53:29 (86400 s)
+dirreq-v3-ips us=8
+dirreq-v3-reqs Sÿ=4026597208,Sÿ=4026597208,Sÿ=4026597208,Sÿ=4026597208,Sÿ=4026597208,Sÿ=4026597208,¥þ=4026591624,6=4026537520,6=4026537520,6=4026537520,us=8
+dirreq-v3-resp ok=8,not-enough-sigs=0,unavailable=0,not-found=0,not-modified=0,busy=0
+dirreq-v3-direct-dl complete=0,timeout=0,running=0
+dirreq-v3-tunneled-dl complete=8,timeout=4,running=0
+hidserv-stats-end 2016-03-25 19:53:29 (86400 s)
+hidserv-rend-relayed-cells 5238 delta_f=2048 epsilon=0.30 bin_size=1024
+hidserv-dir-onions-seen -21 delta_f=8 epsilon=0.30 bin_size=8
+router-sig-ed25519 TeDQDiRfI/k31H+fxciE8q02/ddm/qIi4H7wpQ9A8wRwWBoFUBdwrjZu+pACiglomOuc/I2uSt0++/pmZulZDg
+router-signature
+-----BEGIN SIGNATURE-----
+OVkz0CKUwWSXjhUMEsYATzcfT+D24QR8w6IjUtb6Mcz5P5CMggqmSFhL6SzUFXWS
+JwipCWcd2/1NsJ7/JLyXKcG0Ak6FPNP3arweah8EwWkVo7rvscagI+u+Uq2FqtO/
+OTNjnriDWTZdSA9FOcQueBZi7Qf7xwA0LEjtadtCjUI=
+-----END SIGNATURE-----
diff --git a/test/unit/descriptor/extrainfo_descriptor.py b/test/unit/descriptor/extrainfo_descriptor.py
index fa68e62..ce898d3 100644
--- a/test/unit/descriptor/extrainfo_descriptor.py
+++ b/test/unit/descriptor/extrainfo_descriptor.py
@@ -166,6 +166,19 @@ k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
     self.assertEqual('7DSOQz9eGgjDX6GT7qcrVViK8yqJD4aoEnuhdAgYtgA', desc.router_digest_sha256)
     self.assertEqual([], desc.get_unrecognized_lines())
 
+  def test_nonascii_v3_reqs(self):
+    """
+    Malformed descriptor with non-ascii content for the 'dirreq-v3-reqs' line.
+    """
+
+    with open(get_resource('extrainfo_nonascii_v3_reqs'), 'rb') as descriptor_file:
+      try:
+        next(stem.descriptor.parse_file(descriptor_file, 'extra-info 1.0', validate = True))
+        self.fail("validation should've raised an exception")
+      except ValueError as exc:
+        expected = "'dirreq-v3-reqs' line had non-ascii content: S?=4026597208,S?=4026597208,S?=4026597208,S?=4026597208,S?=4026597208,S?=4026597208,??=4026591624,6?=4026537520,6?=4026537520,6?=4026537520,us=8"
+        self.assertEqual(expected, str(exc))
+
   def test_minimal_extrainfo_descriptor(self):
     """
     Basic sanity check that we can parse an extrainfo descriptor with minimal
diff --git a/test/unit/descriptor/reader.py b/test/unit/descriptor/reader.py
index 8f617f9..f21ea75 100644
--- a/test/unit/descriptor/reader.py
+++ b/test/unit/descriptor/reader.py
@@ -466,7 +466,7 @@ class TestDescriptorReader(unittest.TestCase):
     reader = stem.descriptor.reader.DescriptorReader(DESCRIPTOR_TEST_DATA)
     reader.register_skip_listener(skip_listener.listener)
 
-    expected_skip_files = ('riddle', 'tiny.png', 'vote', 'new_metrics_type', 'cached-microdesc-consensus_with_carriage_returns')
+    expected_skip_files = ('riddle', 'tiny.png', 'vote', 'new_metrics_type', 'cached-microdesc-consensus_with_carriage_returns', 'extrainfo_nonascii_v3_reqs')
 
     with reader:
       list(reader)  # iterates over all of the descriptors