[tor-commits] [stem/master] Using binary mode when reading descriptors

atagar at torproject.org atagar at torproject.org
Sat Feb 2 18:20:50 UTC 2013


commit 3afa4346d102d99eab93929be738cdb594105d9b
Author: Damian Johnson <atagar at torproject.org>
Date:   Thu Jan 31 09:44:18 2013 -0800

    Using binary mode when reading descriptors
    
    Now Damian, repleat after me: text mode is bad.
    
    In python 2.x text mode and binary mode seem to be indistinguishable, but in
    python 3 there's one tiny little difference: text mode is around 33x slower.
    The integ test that read the cached-consensus took over five minutes (by
    comparison to ten seconds with python 2.7), and in one case simply hung for
    twenty minutes before I killed it.
    
    I'm not aware of any disadvantage to using binary mode, so opting for that.
---
 stem/descriptor/__init__.py                   |   11 +++++-
 stem/descriptor/reader.py                     |    7 +---
 stem/descriptor/server_descriptor.py          |    2 +-
 stem/util/str_tools.py                        |    1 +
 test/integ/descriptor/__init__.py             |   15 -------
 test/integ/descriptor/extrainfo_descriptor.py |    8 ++--
 test/integ/descriptor/networkstatus.py        |   18 ++++----
 test/integ/descriptor/server_descriptor.py    |   55 ++++++++-----------------
 test/unit/descriptor/server_descriptor.py     |    3 +-
 9 files changed, 45 insertions(+), 75 deletions(-)

diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index bebcdcc..788b442 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -83,7 +83,7 @@ def parse_file(descriptor_file, descriptor_type = None, path = None, validate =
   tordnsel 1.0                          **unsupported**
   ===================================== =====
 
-  If you're using python 3 then beware of the open() function's universal
+  If you're using **python 3** then beware of the open() function's universal
   newline translation. By default open() converts all common line endings (NL,
   CR, and CRNL) into NL. In some edge cases this can cause us to misparse
   content. To disable newline translation set the **newline** to an empty
@@ -93,6 +93,15 @@ def parse_file(descriptor_file, descriptor_type = None, path = None, validate =
 
     my_descriptor_file = open(descrptor_path, newline='')
 
+  What's more, python 3's read performance in **text mode** is deplorably bad
+  (my testing with python 3.2 shows it to be 33x slower). Using **binary mode**
+  is strongly suggested. If you do this then newline translation is
+  automatically disabled...
+
+  ::
+
+    my_descriptor_file = open(descriptor_path, 'rb')
+
   :param file descriptor_file: opened file with the descriptor contents
   :param str descriptor_type: `descriptor type <https://metrics.torproject.org/formats.html#descriptortypes>`_, this is guessed if not provided
   :param str path: absolute path to the file's location on disk
diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py
index ef35fc0..f00e0aa 100644
--- a/stem/descriptor/reader.py
+++ b/stem/descriptor/reader.py
@@ -513,12 +513,7 @@ class DescriptorReader(object):
     try:
       self._notify_read_listeners(target)
 
-      if stem.prereq.is_python_3():
-        target_file = open(target, newline = '')
-      else:
-        target_file = open(target)
-
-      with target_file as target_file:
+      with open(target, 'rb') as target_file:
         for desc in stem.descriptor.parse_file(target_file, validate = self._validate, path = target):
           if self._is_stopped.isSet():
             return
diff --git a/stem/descriptor/server_descriptor.py b/stem/descriptor/server_descriptor.py
index 2376a56..95a4ef0 100644
--- a/stem/descriptor/server_descriptor.py
+++ b/stem/descriptor/server_descriptor.py
@@ -124,7 +124,7 @@ def _parse_file(descriptor_file, is_bridge = False, validate = True):
 
     if descriptor_content:
       # strip newlines from annotations
-      annotations = map(str.strip, annotations)
+      annotations = map(unicode.strip, annotations)
 
       descriptor_text = "".join(descriptor_content)
 
diff --git a/stem/util/str_tools.py b/stem/util/str_tools.py
index 37756ed..2733875 100644
--- a/stem/util/str_tools.py
+++ b/stem/util/str_tools.py
@@ -72,6 +72,7 @@ else:
     else:
       return msg
 
+
 def to_bytes(msg):
   """
   Provides the ASCII bytes for the given string. This is purely to provide
diff --git a/test/integ/descriptor/__init__.py b/test/integ/descriptor/__init__.py
index 31421f1..58f9f81 100644
--- a/test/integ/descriptor/__init__.py
+++ b/test/integ/descriptor/__init__.py
@@ -12,8 +12,6 @@ __all__ = [
 
 import os
 
-import stem.prereq
-
 DESCRIPTOR_TEST_DATA = os.path.join(os.path.dirname(__file__), "data")
 
 
@@ -23,16 +21,3 @@ def get_resource(filename):
   """
 
   return os.path.join(DESCRIPTOR_TEST_DATA, filename)
-
-
-def open_desc(filename, absolute = False):
-  """
-  Provides the file for a given descriptor in our data directory.
-  """
-
-  path = filename if absolute else get_resource(filename)
-
-  if stem.prereq.is_python_3():
-    return open(path, newline = '')
-  else:
-    return open(path)
diff --git a/test/integ/descriptor/extrainfo_descriptor.py b/test/integ/descriptor/extrainfo_descriptor.py
index 0bcc23f..4cb9803 100644
--- a/test/integ/descriptor/extrainfo_descriptor.py
+++ b/test/integ/descriptor/extrainfo_descriptor.py
@@ -12,7 +12,7 @@ import stem.descriptor.extrainfo_descriptor
 import test.runner
 
 from stem.descriptor.extrainfo_descriptor import DirResponse
-from test.integ.descriptor import open_desc
+from test.integ.descriptor import get_resource
 
 
 class TestExtraInfoDescriptor(unittest.TestCase):
@@ -21,7 +21,7 @@ class TestExtraInfoDescriptor(unittest.TestCase):
     Parses and checks our results against an extrainfo descriptor from metrics.
     """
 
-    descriptor_file = open_desc("extrainfo_relay_descriptor")
+    descriptor_file = open(get_resource("extrainfo_relay_descriptor"), 'rb')
     descriptor_file.readline()  # strip header
     descriptor_contents = descriptor_file.read()
     descriptor_file.close()
@@ -70,7 +70,7 @@ k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
     metrics.
     """
 
-    descriptor_file = open_desc("extrainfo_bridge_descriptor")
+    descriptor_file = open(get_resource("extrainfo_bridge_descriptor"), 'rb')
     descriptor_file.readline()  # strip header
     descriptor_contents = descriptor_file.read()
     descriptor_file.close()
@@ -146,7 +146,7 @@ k0d2aofcVbHr4fPQOSST0LXDrhFl5Fqo5um296zpJGvRUeO6S44U/EfJAGShtqWw
       test.runner.skip(self, "(no cached descriptors)")
       return
 
-    with open_desc(descriptor_path, absolute = True) as descriptor_file:
+    with open(descriptor_path, 'rb') as descriptor_file:
       for desc in stem.descriptor.extrainfo_descriptor._parse_file(descriptor_file):
         unrecognized_lines = desc.get_unrecognized_lines()
 
diff --git a/test/integ/descriptor/networkstatus.py b/test/integ/descriptor/networkstatus.py
index 19c3d8f..df676b4 100644
--- a/test/integ/descriptor/networkstatus.py
+++ b/test/integ/descriptor/networkstatus.py
@@ -15,7 +15,7 @@ import stem.descriptor.networkstatus
 import stem.version
 import test.runner
 
-from test.integ.descriptor import get_resource, open_desc
+from test.integ.descriptor import get_resource
 
 
 class TestNetworkStatus(unittest.TestCase):
@@ -42,7 +42,7 @@ class TestNetworkStatus(unittest.TestCase):
       return
 
     count = 0
-    with open_desc(consensus_path, absolute = True) as descriptor_file:
+    with open(consensus_path, 'rb') as descriptor_file:
       document_type = stem.descriptor.networkstatus.NetworkStatusDocumentV3
 
       for router in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type):
@@ -89,7 +89,7 @@ class TestNetworkStatus(unittest.TestCase):
       return
 
     count = 0
-    with open_desc(consensus_path, absolute = True) as descriptor_file:
+    with open(consensus_path, 'rb') as descriptor_file:
       document_type = stem.descriptor.networkstatus.NetworkStatusDocumentV3
 
       for router in stem.descriptor.networkstatus._parse_file(descriptor_file, document_type, is_microdescriptor = True):
@@ -120,7 +120,7 @@ class TestNetworkStatus(unittest.TestCase):
     consensus_path = get_resource("metrics_consensus")
 
     for specify_type in (True, False):
-      with open_desc(consensus_path, absolute = True) as descriptor_file:
+      with open(consensus_path, 'rb') as descriptor_file:
         if specify_type:
           descriptors = stem.descriptor.parse_file(descriptor_file, "network-status-consensus-3 1.0", path = consensus_path)
         else:
@@ -142,7 +142,7 @@ class TestNetworkStatus(unittest.TestCase):
 
     consensus_path = get_resource("bridge_network_status")
 
-    with open_desc(consensus_path, absolute = True) as descriptor_file:
+    with open(consensus_path, 'rb') as descriptor_file:
       descriptors = stem.descriptor.parse_file(descriptor_file, path = consensus_path)
 
       router = next(descriptors)
@@ -245,7 +245,7 @@ mfWcW0b+jsrXcJoCxV5IrwCDF3u1aC3diwZY6yiG186pwWbOwE41188XI2DeYPwE
 I/TJmV928na7RLZe2mGHCAW3VQOvV+QkCfj05VZ8CsY=
 -----END SIGNATURE-----"""
 
-    with open_desc("cached-consensus") as descriptor_file:
+    with open(get_resource("cached-consensus"), 'rb') as descriptor_file:
       document = stem.descriptor.networkstatus.NetworkStatusDocumentV3(descriptor_file.read(), default_params = False)
 
       self.assertEquals(3, document.version)
@@ -316,7 +316,7 @@ nTA+fD8JQqpPtb8b0SnG9kwy75eS//sRu7TErie2PzGMxrf9LH0LAgMBAAE=
 TpQQk3nNQF8z6UIvdlvP+DnJV4izWVkQEZgUZgIVM0E=
 -----END SIGNATURE-----"""
 
-    with open_desc("cached-consensus-v2") as descriptor_file:
+    with open(get_resource("cached-consensus-v2"), 'rb') as descriptor_file:
       descriptor_file.readline()  # strip header
       document = stem.descriptor.networkstatus.NetworkStatusDocumentV2(descriptor_file.read())
 
@@ -381,7 +381,7 @@ TpQQk3nNQF8z6UIvdlvP+DnJV4izWVkQEZgUZgIVM0E=
 
     vote_path = get_resource("metrics_vote")
 
-    with open_desc(vote_path, absolute = True) as descriptor_file:
+    with open(vote_path, 'rb') as descriptor_file:
       descriptors = stem.descriptor.parse_file(descriptor_file, path = vote_path)
 
       router = next(descriptors)
@@ -443,7 +443,7 @@ JZ/1HL9sHyZfo6bwaC6YSM9PNiiY6L7rnGpS7UkHiFI+M96VCMorvjm5YPs3FioJ
 DnN5aFtYKiTc19qIC7Nmo+afPdDEf0MlJvEOP5EWl3w=
 -----END SIGNATURE-----"""
 
-    with open_desc("vote") as descriptor_file:
+    with open(get_resource("vote"), 'rb') as descriptor_file:
       document = stem.descriptor.networkstatus.NetworkStatusDocumentV3(descriptor_file.read(), default_params = False)
 
       self.assertEquals(3, document.version)
diff --git a/test/integ/descriptor/server_descriptor.py b/test/integ/descriptor/server_descriptor.py
index 549cea8..c1cd742 100644
--- a/test/integ/descriptor/server_descriptor.py
+++ b/test/integ/descriptor/server_descriptor.py
@@ -12,11 +12,10 @@ import stem.control
 import stem.descriptor
 import stem.descriptor.server_descriptor
 import stem.exit_policy
-import stem.util.str_tools
 import stem.version
 import test.runner
 
-from test.integ.descriptor import open_desc
+from test.integ.descriptor import get_resource
 
 
 class TestServerDescriptor(unittest.TestCase):
@@ -25,10 +24,7 @@ class TestServerDescriptor(unittest.TestCase):
     Parses and checks our results against a server descriptor from metrics.
     """
 
-    descriptor_file = open_desc("example_descriptor")
-    descriptor_file.readline()  # strip header
-    descriptor_contents = descriptor_file.read()
-    descriptor_file.close()
+    descriptor_file = open(get_resource("example_descriptor"), 'rb')
 
     expected_family = [
       "$0CE3CFB1E9CC47B63EA8869813BF6FAB7D4540C1",
@@ -59,7 +55,7 @@ dskLSPz8beUW7bzwDjR6EVNGpyoZde83Ejvau+5F2c6cGnlu91fiZN3suE88iE6e
 Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
 -----END SIGNATURE-----"""
 
-    desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_contents)
+    desc = next(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
     self.assertEquals("caerSidi", desc.nickname)
     self.assertEquals("A7569A83B5706AB1B1A9CB52EFF7D2D32E4553EB", desc.fingerprint)
     self.assertEquals("71.35.133.197", desc.address)
@@ -95,7 +91,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
     Parses and checks our results against a server descriptor from metrics.
     """
 
-    with open_desc("metrics_server_desc_multiple") as descriptor_file:
+    with open(get_resource("metrics_server_desc_multiple"), 'rb') as descriptor_file:
       descriptors = list(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
 
       self.assertEquals(2, len(descriptors))
@@ -111,12 +107,9 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
     Parses a relay server descriptor from 2005.
     """
 
-    descriptor_file = open_desc("old_descriptor")
-    descriptor_file.readline()  # strip header
-    descriptor_contents = descriptor_file.read()
-    descriptor_file.close()
+    descriptor_file = open(get_resource("old_descriptor"), 'rb')
 
-    desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_contents)
+    desc = next(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
     self.assertEquals("krypton", desc.nickname)
     self.assertEquals("3E2F63E2356F52318B536A12B6445373808A5D6C", desc.fingerprint)
     self.assertEquals("212.37.39.59", desc.address)
@@ -173,8 +166,8 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
       test.runner.skip(self, "(no cached descriptors)")
       return
 
-    with open_desc(descriptor_path, absolute = True) as descriptor_file:
-      for desc in stem.descriptor.server_descriptor._parse_file(descriptor_file):
+    with open(descriptor_path, 'rb') as descriptor_file:
+      for desc in stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"):
         # the following attributes should be deprecated, and not appear in the wild
         self.assertEquals(None, desc.read_history_end)
         self.assertEquals(None, desc.write_history_end)
@@ -197,14 +190,11 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
     Parses a descriptor with non-ascii content.
     """
 
-    descriptor_file = open_desc("non-ascii_descriptor")
-    descriptor_file.readline()  # strip header
-    descriptor_contents = stem.util.str_tools.to_unicode(descriptor_file.read())
-    descriptor_file.close()
+    descriptor_file = open(get_resource("non-ascii_descriptor"), 'rb')
 
     expected_contact = b"2048R/F171EC1F Johan Bl\xc3\xa5b\xc3\xa4ck \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf".decode("utf-8", "replace")
 
-    desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_contents)
+    desc = next(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
     self.assertEquals("torrelay389752132", desc.nickname)
     self.assertEquals("5D47E91A1F7421A4E3255F4D04E534E9A21407BB", desc.fingerprint)
     self.assertEquals("130.243.230.116", desc.address)
@@ -237,12 +227,8 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
     returns ('\r' entries).
     """
 
-    descriptor_file = open_desc("cr_in_contact_line")
-    descriptor_file.readline()  # strip header
-    descriptor_contents = descriptor_file.read()
-    descriptor_file.close()
-
-    desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_contents)
+    descriptor_file = open(get_resource("cr_in_contact_line"), 'rb')
+    desc = next(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
 
     self.assertEquals("pogonip", desc.nickname)
     self.assertEquals("6DABD62BC65D4E6FE620293157FC76968DAB9C9B", desc.fingerprint)
@@ -263,12 +249,8 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
     where we shouldn't be.
     """
 
-    descriptor_file = open_desc("negative_uptime")
-    descriptor_file.readline()  # strip header
-    descriptor_contents = descriptor_file.read()
-    descriptor_file.close()
-
-    desc = stem.descriptor.server_descriptor.RelayDescriptor(descriptor_contents)
+    descriptor_file = open(get_resource("negative_uptime"), 'rb')
+    desc = next(stem.descriptor.parse_file(descriptor_file, "server-descriptor 1.0"))
 
     self.assertEquals("TipTor", desc.nickname)
     self.assertEquals("137962D4931DBF08A24E843288B8A155D6D2AEDD", desc.fingerprint)
@@ -277,7 +259,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
     # modify the relay version so it's after when the negative uptime bug
     # should appear
 
-    descriptor_contents = descriptor_contents.replace("Tor 0.1.1.25", "Tor 0.1.2.7")
+    descriptor_contents = str(desc).replace("Tor 0.1.1.25", "Tor 0.1.2.7")
     self.assertRaises(ValueError, stem.descriptor.server_descriptor.RelayDescriptor, descriptor_contents)
 
   def test_bridge_descriptor(self):
@@ -285,10 +267,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
     Parses a bridge descriptor.
     """
 
-    descriptor_file = open_desc("bridge_descriptor")
-    descriptor_file.readline()  # strip header
-    descriptor_contents = descriptor_file.read()
-    descriptor_file.close()
+    descriptor_file = open(get_resource("bridge_descriptor"), 'rb')
 
     expected_family = [
       "$CE396C72A3D0880F74C064FEA79D68C15BD380B9",
@@ -296,7 +275,7 @@ Qlx9HNCqCY877ztFRC624ja2ql6A2hBcuoYMbkHjcQ4=
       "$8C8A470D7C23151665A7B84E75E89FCC205A3304",
     ]
 
-    desc = stem.descriptor.server_descriptor.BridgeDescriptor(descriptor_contents)
+    desc = next(stem.descriptor.parse_file(descriptor_file, "bridge-server-descriptor 1.0"))
     self.assertEquals("Unnamed", desc.nickname)
     self.assertEquals("AE54E28ED069CDF45F3009F963EE3B3D6FA26A2E", desc.fingerprint)
     self.assertEquals("10.45.227.253", desc.address)
diff --git a/test/unit/descriptor/server_descriptor.py b/test/unit/descriptor/server_descriptor.py
index 00fd96f..66ec460 100644
--- a/test/unit/descriptor/server_descriptor.py
+++ b/test/unit/descriptor/server_descriptor.py
@@ -9,6 +9,7 @@ import unittest
 import stem.descriptor.server_descriptor
 import stem.exit_policy
 import stem.prereq
+import stem.util.str_tools
 
 from stem.descriptor.server_descriptor import RelayDescriptor, BridgeDescriptor
 
@@ -209,7 +210,7 @@ class TestServerDescriptor(unittest.TestCase):
     desc_text += "\ntrailing text that should be ignored, ho hum"
 
     # running _parse_file should provide an iterator with a single descriptor
-    desc_iter = stem.descriptor.server_descriptor._parse_file(StringIO.StringIO(desc_text))
+    desc_iter = stem.descriptor.server_descriptor._parse_file(StringIO.StringIO(stem.util.str_tools.to_unicode(desc_text)))
     desc_entries = list(desc_iter)
     self.assertEquals(1, len(desc_entries))
     desc = desc_entries[0]





More information about the tor-commits mailing list