commit c7f34c929a4f091b0527bec30d340472b55eacab Author: Damian Johnson atagar@torproject.org Date: Fri Mar 22 19:49:06 2013 -0700
Swapping contact and platform attributes to bytes
The 'contact' and 'platform' lines in a server descriptor can contain arbitrary byte content. That content isn't necessarily valid unicode, so we can't decode it for our callers without potentially mangling the data.
Non-unicode contact lines are surprisingly common (fourteen instances in a single cached-descriptors file, around 0.4%). --- stem/descriptor/__init__.py | 27 +++++++++++++++++++++++++++ stem/descriptor/server_descriptor.py | 27 ++++++++++++++++----------- test/integ/descriptor/server_descriptor.py | 18 +++++++++--------- test/unit/descriptor/server_descriptor.py | 8 ++++---- 4 files changed, 56 insertions(+), 24 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py index 40c1049..04ee7dc 100644 --- a/stem/descriptor/__init__.py +++ b/stem/descriptor/__init__.py @@ -318,6 +318,33 @@ class Descriptor(object): return self._raw_contents
+def _get_bytes_field(keyword, content): + """ + Provides the value corresponding to the given keyword. This is handy to fetch + values specifically allowed to be arbitrary bytes prior to converting to + unicode. + + :param str keyword: line to look up + :param bytes content: content to look through + + :returns: **bytes** value on the given line, **None** if the line doesn't + exist + + :raises: **ValueError** if the content isn't bytes + """ + + if not isinstance(content, bytes): + raise ValueError("Content must be bytes, got a %s" % type(content)) + + line_match = re.search(stem.util.str_tools._to_bytes("^(opt )?%s(?:[%s]+(.*))?$" % (keyword, WHITESPACE)), content, re.MULTILINE) + + if line_match: + value = line_match.groups()[1] + return b"" if value is None else value + else: + return None + + def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_first = False, skip = False, end_position = None, include_ending_keyword = False): """ Reads from the descriptor file until we get to one of the given keywords or reach the diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py index a02eb28..000b864 100644 --- a/stem/descriptor/server_descriptor.py +++ b/stem/descriptor/server_descriptor.py @@ -152,11 +152,11 @@ class ServerDescriptor(stem.descriptor.Descriptor): :var int socks_port: ***** port used as client (deprecated, always **None**) :var int dir_port: ***** port used for descriptor mirroring
- :var str platform: line with operating system and tor version + :var bytes platform: line with operating system and tor version :var stem.version.Version tor_version: version of tor :var str operating_system: operating system :var int uptime: uptime when published in seconds - :var str contact: contact information + :var bytes contact: contact information :var stem.exit_policy.ExitPolicy exit_policy: ***** stated exit policy :var stem.exit_policy.MicroExitPolicy exit_policy_v6: ***** exit policy for IPv6 :var set family: ***** nicknames or fingerprints of declared family @@ -209,6 +209,13 @@ class ServerDescriptor(stem.descriptor.Descriptor): """
super(ServerDescriptor, self).__init__(raw_contents) + + # Only a few things can be arbitrary bytes according to the dir-spec, so + # parsing them separately. + + self.platform = stem.descriptor._get_bytes_field("platform", raw_contents) + self.contact = stem.descriptor._get_bytes_field("contact", raw_contents) + raw_contents = stem.util.str_tools._to_unicode(raw_contents)
self.nickname = None @@ -220,11 +227,9 @@ class ServerDescriptor(stem.descriptor.Descriptor): self.socks_port = None self.dir_port = None
- self.platform = None self.tor_version = None self.operating_system = None self.uptime = None - self.contact = None self.exit_policy = None self.exit_policy_v6 = stem.exit_policy.MicroExitPolicy("reject 1-65535") self.family = set() @@ -405,16 +410,16 @@ class ServerDescriptor(stem.descriptor.Descriptor): elif keyword == "platform": # "platform" string
- self.platform = value - - # This line can contain any arbitrary data, but tor seems to report its - # version followed by the os like the following... - # platform Tor 0.2.2.35 (git-73ff13ab3cc9570d) on Linux x86_64 + # The platform attribute was set earlier. This line can contain any + # arbitrary data, but tor seems to report its version followed by the + # os like the following... + # + # platform Tor 0.2.2.35 (git-73ff13ab3cc9570d) on Linux x86_64 # # There's no guarantee that we'll be able to pick these out the # version, but might as well try to save our caller the effort.
- platform_match = re.match("^Tor (\S*).* on (.*)$", self.platform) + platform_match = re.match("^Tor (\S*).* on (.*)$", value)
if platform_match: version_str, self.operating_system = platform_match.groups() @@ -490,7 +495,7 @@ class ServerDescriptor(stem.descriptor.Descriptor):
raise ValueError("Uptime line must have an integer value: %s" % value) elif keyword == "contact": - self.contact = value + pass # parsed as a bytes field earlier elif keyword == "protocols": protocols_match = re.match("^Link (.*) Circuit (.*)$", value)
diff --git a/test/integ/descriptor/server_descriptor.py b/test/integ/descriptor/server_descriptor.py index 9217a57..696634a 100644 --- a/test/integ/descriptor/server_descriptor.py +++ b/test/integ/descriptor/server_descriptor.py @@ -66,12 +66,12 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4= self.assertEquals(9001, desc.or_port) self.assertEquals(None, desc.socks_port) self.assertEquals(None, desc.dir_port) - self.assertEquals("Tor 0.2.1.30 on Linux x86_64", desc.platform) + self.assertEquals(b"Tor 0.2.1.30 on Linux x86_64", desc.platform) self.assertEquals(stem.version.Version("0.2.1.30"), desc.tor_version) self.assertEquals("Linux x86_64", desc.operating_system) self.assertEquals(588217, desc.uptime) self.assertEquals(datetime.datetime(2012, 3, 1, 17, 15, 27), desc.published) - self.assertEquals("www.atagar.com/contact", desc.contact) + self.assertEquals(b"www.atagar.com/contact", desc.contact) self.assertEquals(["1", "2"], desc.link_protocols) self.assertEquals(["1"], desc.circuit_protocols) self.assertEquals(False, desc.hibernating) @@ -120,7 +120,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4= self.assertEquals(8000, desc.or_port) self.assertEquals(None, desc.socks_port) self.assertEquals(None, desc.dir_port) - self.assertEquals("Tor 0.1.0.14 on FreeBSD i386", desc.platform) + self.assertEquals(b"Tor 0.1.0.14 on FreeBSD i386", desc.platform) self.assertEquals(stem.version.Version("0.1.0.14"), desc.tor_version) self.assertEquals("FreeBSD i386", desc.operating_system) self.assertEquals(64820, desc.uptime) @@ -198,7 +198,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
descriptor_file = open(get_resource("non-ascii_descriptor"), 'rb')
- expected_contact = b"2048R/F171EC1F Johan Bl\xc3\xa5b\xc3\xa4ck \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf".decode("utf-8", "replace") + expected_contact = b"2048R/F171EC1F Johan Bl\xc3\xa5b\xc3\xa4ck \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf"
desc = next(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0")) self.assertEquals("torrelay389752132", desc.nickname) @@ -207,7 +207,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4= self.assertEquals(9001, desc.or_port) self.assertEquals(None, desc.socks_port) self.assertEquals(None, desc.dir_port) - self.assertEquals("Tor 0.2.2.35 (git-4f42b0a93422f70e) on Linux x86_64", desc.platform) + self.assertEquals(b"Tor 0.2.2.35 (git-4f42b0a93422f70e) on Linux x86_64", desc.platform) self.assertEquals(stem.version.Version("0.2.2.35"), desc.tor_version) self.assertEquals("Linux x86_64", desc.operating_system) self.assertEquals(3103848, desc.uptime) @@ -250,8 +250,8 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4= # the contact info block is huge so just checking the start and end, # including some of the embedded carriage returns
- contact_start = "jie1 at pacbell dot net -----BEGIN PGP PUBLIC KEY BLOCK-----\rVersion:" - contact_end = "YFRk3NhCY=\r=Xaw3\r-----END PGP PUBLIC KEY BLOCK-----" + contact_start = b"jie1 at pacbell dot net -----BEGIN PGP PUBLIC KEY BLOCK-----\rVersion:" + contact_end = b"YFRk3NhCY=\r=Xaw3\r-----END PGP PUBLIC KEY BLOCK-----"
self.assertTrue(desc.contact.startswith(contact_start)) self.assertTrue(desc.contact.endswith(contact_end)) @@ -295,12 +295,12 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4= self.assertEquals(9001, desc.or_port) self.assertEquals(None, desc.socks_port) self.assertEquals(None, desc.dir_port) - self.assertEquals("Tor 0.2.3.12-alpha (git-800942b4176ca31c) on Linux x86_64", desc.platform) + self.assertEquals(b"Tor 0.2.3.12-alpha (git-800942b4176ca31c) on Linux x86_64", desc.platform) self.assertEquals(stem.version.Version("0.2.3.12-alpha"), desc.tor_version) self.assertEquals("Linux x86_64", desc.operating_system) self.assertEquals(186, desc.uptime) self.assertEquals(datetime.datetime(2012, 3, 22, 17, 34, 38), desc.published) - self.assertEquals("somebody", desc.contact) + self.assertEquals(b"somebody", desc.contact) self.assertEquals(["1", "2"], desc.link_protocols) self.assertEquals(["1"], desc.circuit_protocols) self.assertEquals(False, desc.hibernating) diff --git a/test/unit/descriptor/server_descriptor.py b/test/unit/descriptor/server_descriptor.py index 7e7828d..e8fe6c0 100644 --- a/test/unit/descriptor/server_descriptor.py +++ b/test/unit/descriptor/server_descriptor.py @@ -47,7 +47,7 @@ class TestServerDescriptor(unittest.TestCase): """
desc = get_relay_server_descriptor({"opt": "contact www.atagar.com/contact/"}) - self.assertEquals("www.atagar.com/contact/", desc.contact) + self.assertEquals(b"www.atagar.com/contact/", desc.contact)
def test_unrecognized_line(self): """ @@ -136,12 +136,12 @@ class TestServerDescriptor(unittest.TestCase):
desc_text = get_relay_server_descriptor({"platform": ""}, content = True) desc = RelayDescriptor(desc_text, validate = False) - self.assertEquals("", desc.platform) + self.assertEquals(b"", desc.platform)
# does the same but with 'platform ' replaced with 'platform' desc_text = desc_text.replace(b"platform ", b"platform") desc = RelayDescriptor(desc_text, validate = False) - self.assertEquals("", desc.platform) + self.assertEquals(b"", desc.platform)
def test_protocols_no_circuit_versions(self): """ @@ -234,7 +234,7 @@ class TestServerDescriptor(unittest.TestCase):
desc_text = get_relay_server_descriptor({"<replace>": ""}, content = True) desc_text = desc_text.replace(b"<replace>", b"contact foo\ncontact bar") - self._expect_invalid_attr(desc_text, "contact", "foo") + self._expect_invalid_attr(desc_text, "contact", b"foo")
def test_missing_required_attr(self): """
tor-commits@lists.torproject.org