[tor-commits] [stem/master] Swapping contact and platform attributes to bytes

atagar at torproject.org atagar at torproject.org
Sat Mar 23 03:36:13 UTC 2013


commit c7f34c929a4f091b0527bec30d340472b55eacab
Author: Damian Johnson <atagar at torproject.org>
Date:   Fri Mar 22 19:49:06 2013 -0700

    Swapping contact and platform attributes to bytes
    
    The 'contact' and 'platform' lines in a server descriptor can have any
    arbitrary byte content. This isn't necessarily unicode, so we can't provide
    that to our callers without potentially mangling the data.
    
    Non-unicode contact lines are surprisingly common (fourteen instances in one
    cached-descriptors file, around 0.4%).
---
 stem/descriptor/__init__.py                |   27 +++++++++++++++++++++++++++
 stem/descriptor/server_descriptor.py       |   27 ++++++++++++++++-----------
 test/integ/descriptor/server_descriptor.py |   18 +++++++++---------
 test/unit/descriptor/server_descriptor.py  |    8 ++++----
 4 files changed, 56 insertions(+), 24 deletions(-)

diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index 40c1049..04ee7dc 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -318,6 +318,33 @@ class Descriptor(object):
       return self._raw_contents
 
 
+def _get_bytes_field(keyword, content):
+  """
+  Provides the value corresponding to the given keyword. This is handy to fetch
+  values specifically allowed to be arbitrary bytes prior to converting to
+  unicode.
+
+  :param str keyword: line to look up
+  :param bytes content: content to look through
+
+  :returns: **bytes** value on the given line, **None** if the line doesn't
+    exist
+
+  :raises: **ValueError** if the content isn't bytes
+  """
+
+  if not isinstance(content, bytes):
+    raise ValueError("Content must be bytes, got a %s" % type(content))
+
+  line_match = re.search(stem.util.str_tools._to_bytes("^(opt )?%s(?:[%s]+(.*))?$" % (keyword, WHITESPACE)), content, re.MULTILINE)
+
+  if line_match:
+    value = line_match.groups()[1]
+    return b"" if value is None else value
+  else:
+    return None
+
+
 def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_first = False, skip = False, end_position = None, include_ending_keyword = False):
   """
   Reads from the descriptor file until we get to one of the given keywords or reach the
diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py
index a02eb28..000b864 100644
--- a/stem/descriptor/server_descriptor.py
+++ b/stem/descriptor/server_descriptor.py
@@ -152,11 +152,11 @@ class ServerDescriptor(stem.descriptor.Descriptor):
   :var int socks_port: **\*** port used as client (deprecated, always **None**)
   :var int dir_port: **\*** port used for descriptor mirroring
 
-  :var str platform: line with operating system and tor version
+  :var bytes platform: line with operating system and tor version
   :var stem.version.Version tor_version: version of tor
   :var str operating_system: operating system
   :var int uptime: uptime when published in seconds
-  :var str contact: contact information
+  :var bytes contact: contact information
   :var stem.exit_policy.ExitPolicy exit_policy: **\*** stated exit policy
   :var stem.exit_policy.MicroExitPolicy exit_policy_v6: **\*** exit policy for IPv6
   :var set family: **\*** nicknames or fingerprints of declared family
@@ -209,6 +209,13 @@ class ServerDescriptor(stem.descriptor.Descriptor):
     """
 
     super(ServerDescriptor, self).__init__(raw_contents)
+
+    # Only a few things can be arbitrary bytes according to the dir-spec, so
+    # parsing them separately.
+
+    self.platform = stem.descriptor._get_bytes_field("platform", raw_contents)
+    self.contact = stem.descriptor._get_bytes_field("contact", raw_contents)
+
     raw_contents = stem.util.str_tools._to_unicode(raw_contents)
 
     self.nickname = None
@@ -220,11 +227,9 @@ class ServerDescriptor(stem.descriptor.Descriptor):
     self.socks_port = None
     self.dir_port = None
 
-    self.platform = None
     self.tor_version = None
     self.operating_system = None
     self.uptime = None
-    self.contact = None
     self.exit_policy = None
     self.exit_policy_v6 = stem.exit_policy.MicroExitPolicy("reject 1-65535")
     self.family = set()
@@ -405,16 +410,16 @@ class ServerDescriptor(stem.descriptor.Descriptor):
       elif keyword == "platform":
         # "platform" string
 
-        self.platform = value
-
-        # This line can contain any arbitrary data, but tor seems to report its
-        # version followed by the os like the following...
-        # platform Tor 0.2.2.35 (git-73ff13ab3cc9570d) on Linux x86_64
+        # The platform attribute was set earlier. This line can contain any
+        # arbitrary data, but tor seems to report its version followed by the
+        # os like the following...
+        #
+        #   platform Tor 0.2.2.35 (git-73ff13ab3cc9570d) on Linux x86_64
         #
         # There's no guarantee that we'll be able to pick these out the
         # version, but might as well try to save our caller the effort.
 
-        platform_match = re.match("^Tor (\S*).* on (.*)$", self.platform)
+        platform_match = re.match("^Tor (\S*).* on (.*)$", value)
 
         if platform_match:
           version_str, self.operating_system = platform_match.groups()
@@ -490,7 +495,7 @@ class ServerDescriptor(stem.descriptor.Descriptor):
 
           raise ValueError("Uptime line must have an integer value: %s" % value)
       elif keyword == "contact":
-        self.contact = value
+        pass  # parsed as a bytes field earlier
       elif keyword == "protocols":
         protocols_match = re.match("^Link (.*) Circuit (.*)$", value)
 
diff --git a/test/integ/descriptor/server_descriptor.py b/test/integ/descriptor/server_descriptor.py
index 9217a57..696634a 100644
--- a/test/integ/descriptor/server_descriptor.py
+++ b/test/integ/descriptor/server_descriptor.py
@@ -66,12 +66,12 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
     self.assertEquals(9001, desc.or_port)
     self.assertEquals(None, desc.socks_port)
     self.assertEquals(None, desc.dir_port)
-    self.assertEquals("Tor 0.2.1.30 on Linux x86_64", desc.platform)
+    self.assertEquals(b"Tor 0.2.1.30 on Linux x86_64", desc.platform)
     self.assertEquals(stem.version.Version("0.2.1.30"), desc.tor_version)
     self.assertEquals("Linux x86_64", desc.operating_system)
     self.assertEquals(588217, desc.uptime)
     self.assertEquals(datetime.datetime(2012, 3, 1, 17, 15, 27), desc.published)
-    self.assertEquals("www.atagar.com/contact", desc.contact)
+    self.assertEquals(b"www.atagar.com/contact", desc.contact)
     self.assertEquals(["1", "2"], desc.link_protocols)
     self.assertEquals(["1"], desc.circuit_protocols)
     self.assertEquals(False, desc.hibernating)
@@ -120,7 +120,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
     self.assertEquals(8000, desc.or_port)
     self.assertEquals(None, desc.socks_port)
     self.assertEquals(None, desc.dir_port)
-    self.assertEquals("Tor 0.1.0.14 on FreeBSD i386", desc.platform)
+    self.assertEquals(b"Tor 0.1.0.14 on FreeBSD i386", desc.platform)
     self.assertEquals(stem.version.Version("0.1.0.14"), desc.tor_version)
     self.assertEquals("FreeBSD i386", desc.operating_system)
     self.assertEquals(64820, desc.uptime)
@@ -198,7 +198,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
 
     descriptor_file = open(get_resource("non-ascii_descriptor"), 'rb')
 
-    expected_contact = b"2048R/F171EC1F Johan Bl\xc3\xa5b\xc3\xa4ck \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf".decode("utf-8", "replace")
+    expected_contact = b"2048R/F171EC1F Johan Bl\xc3\xa5b\xc3\xa4ck \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf"
 
     desc = next(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
     self.assertEquals("torrelay389752132", desc.nickname)
@@ -207,7 +207,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
     self.assertEquals(9001, desc.or_port)
     self.assertEquals(None, desc.socks_port)
     self.assertEquals(None, desc.dir_port)
-    self.assertEquals("Tor 0.2.2.35 (git-4f42b0a93422f70e) on Linux x86_64", desc.platform)
+    self.assertEquals(b"Tor 0.2.2.35 (git-4f42b0a93422f70e) on Linux x86_64", desc.platform)
     self.assertEquals(stem.version.Version("0.2.2.35"), desc.tor_version)
     self.assertEquals("Linux x86_64", desc.operating_system)
     self.assertEquals(3103848, desc.uptime)
@@ -250,8 +250,8 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
     # the contact info block is huge so just checking the start and end,
     # including some of the embedded carriage returns
 
-    contact_start = "jie1 at pacbell dot net -----BEGIN PGP PUBLIC KEY BLOCK-----\rVersion:"
-    contact_end = "YFRk3NhCY=\r=Xaw3\r-----END PGP PUBLIC KEY BLOCK-----"
+    contact_start = b"jie1 at pacbell dot net -----BEGIN PGP PUBLIC KEY BLOCK-----\rVersion:"
+    contact_end = b"YFRk3NhCY=\r=Xaw3\r-----END PGP PUBLIC KEY BLOCK-----"
 
     self.assertTrue(desc.contact.startswith(contact_start))
     self.assertTrue(desc.contact.endswith(contact_end))
@@ -295,12 +295,12 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
     self.assertEquals(9001, desc.or_port)
     self.assertEquals(None, desc.socks_port)
     self.assertEquals(None, desc.dir_port)
-    self.assertEquals("Tor 0.2.3.12-alpha (git-800942b4176ca31c) on Linux x86_64", desc.platform)
+    self.assertEquals(b"Tor 0.2.3.12-alpha (git-800942b4176ca31c) on Linux x86_64", desc.platform)
     self.assertEquals(stem.version.Version("0.2.3.12-alpha"), desc.tor_version)
     self.assertEquals("Linux x86_64", desc.operating_system)
     self.assertEquals(186, desc.uptime)
     self.assertEquals(datetime.datetime(2012, 3, 22, 17, 34, 38), desc.published)
-    self.assertEquals("somebody", desc.contact)
+    self.assertEquals(b"somebody", desc.contact)
     self.assertEquals(["1", "2"], desc.link_protocols)
     self.assertEquals(["1"], desc.circuit_protocols)
     self.assertEquals(False, desc.hibernating)
diff --git a/test/unit/descriptor/server_descriptor.py b/test/unit/descriptor/server_descriptor.py
index 7e7828d..e8fe6c0 100644
--- a/test/unit/descriptor/server_descriptor.py
+++ b/test/unit/descriptor/server_descriptor.py
@@ -47,7 +47,7 @@ class TestServerDescriptor(unittest.TestCase):
     """
 
     desc = get_relay_server_descriptor({"opt": "contact www.atagar.com/contact/"})
-    self.assertEquals("www.atagar.com/contact/", desc.contact)
+    self.assertEquals(b"www.atagar.com/contact/", desc.contact)
 
   def test_unrecognized_line(self):
     """
@@ -136,12 +136,12 @@ class TestServerDescriptor(unittest.TestCase):
 
     desc_text = get_relay_server_descriptor({"platform": ""}, content = True)
     desc = RelayDescriptor(desc_text, validate = False)
-    self.assertEquals("", desc.platform)
+    self.assertEquals(b"", desc.platform)
 
     # does the same but with 'platform ' replaced with 'platform'
     desc_text = desc_text.replace(b"platform ", b"platform")
     desc = RelayDescriptor(desc_text, validate = False)
-    self.assertEquals("", desc.platform)
+    self.assertEquals(b"", desc.platform)
 
   def test_protocols_no_circuit_versions(self):
     """
@@ -234,7 +234,7 @@ class TestServerDescriptor(unittest.TestCase):
 
     desc_text = get_relay_server_descriptor({"<replace>": ""}, content = True)
     desc_text = desc_text.replace(b"<replace>", b"contact foo\ncontact bar")
-    self._expect_invalid_attr(desc_text, "contact", "foo")
+    self._expect_invalid_attr(desc_text, "contact", b"foo")
 
   def test_missing_required_attr(self):
     """





More information about the tor-commits mailing list