[tor-commits] [stem/master] Normalize ControlMessage content to be bytes

atagar at torproject.org atagar at torproject.org
Wed May 22 05:01:53 UTC 2013


commit 0362564cc8e1d48ebf7b61c75550fa9e1c4fd321
Author: Damian Johnson <atagar at torproject.org>
Date:   Sun May 19 17:29:43 2013 -0700

    Normalize ControlMessage content to be bytes
    
    The socket module's recv_message function converted the content it read to a
    str (bytes in python 2.x and unicode in python 3.x). In 99.9% of cases this
    is exactly what we want, but in a few edge cases we actually want byte
    content under python 3.x.
    
    For instance, in #8755 descriptors with non-unicode contact lines are being
    altered by the socket module, causing our validation of their signature to
    fail.
    
    Functionally this commit should have no effect besides adding the option to get
    byte content from the ControlMessage. The tests seem happy so fingers crossed!
---
 stem/response/__init__.py |   32 +++++++++++++++++++++++++++-----
 stem/socket.py            |   40 +++++++++++++++++++++-------------------
 stem/util/log.py          |    8 +++++++-
 3 files changed, 55 insertions(+), 25 deletions(-)

diff --git a/stem/response/__init__.py b/stem/response/__init__.py
index 92e4527..01af23e 100644
--- a/stem/response/__init__.py
+++ b/stem/response/__init__.py
@@ -167,7 +167,7 @@ class ControlMessage(object):
 
     return False
 
-  def content(self):
+  def content(self, get_bytes = False):
     """
     Provides the parsed message content. These are entries of the form...
 
@@ -189,19 +189,33 @@ class ControlMessage(object):
     For data entries the content is the full multi-line payload with newline
     linebreaks and leading periods unescaped.
 
+    The **status_code** and **divider** are both strings (**bytes** in python
+    2.x and **unicode** in python 3.x). The **content** however is **bytes** if
+    **get_bytes** is **True**.
+
+    :param bool get_bytes: provides **bytes** for the **content** rather than a **str**
+
     :returns: **list** of (str, str, str) tuples for the components of this message
     """
 
-    return list(self._parsed_content)
+    if stem.prereq.is_python_3() and not get_bytes:
+      return [(code, div, stem.util.str_tools._to_unicode(content)) for (code, div, content) in self._parsed_content]
+    else:
+      return list(self._parsed_content)
 
-  def raw_content(self):
+  def raw_content(self, get_bytes = False):
     """
     Provides the unparsed content read from the control socket.
 
+    :param bool get_bytes: if **True** then this provides **bytes** rather than a **str**
+
     :returns: **str** of the socket data used to generate this message
     """
 
-    return self._raw_content
+    if stem.prereq.is_python_3() and not get_bytes:
+      return stem.util.str_tools._to_unicode(self._raw_content)
+    else:
+      return self._raw_content
 
   def __str__(self):
     """
@@ -235,6 +249,9 @@ class ControlMessage(object):
     """
 
     for _, _, content in self._parsed_content:
+      if stem.prereq.is_python_3():
+        content = stem.util.str_tools._to_unicode(content)
+
       yield ControlLine(content)
 
   def __len__(self):
@@ -249,7 +266,12 @@ class ControlMessage(object):
     :returns: :class:`~stem.response.ControlLine` at the index
     """
 
-    return ControlLine(self._parsed_content[index][2])
+    content = self._parsed_content[index][2]
+
+    if stem.prereq.is_python_3():
+      content = stem.util.str_tools._to_unicode(content)
+
+    return ControlLine(content)
 
 
 class ControlLine(str):
diff --git a/stem/socket.py b/stem/socket.py
index bfe5839..379e5ec 100644
--- a/stem/socket.py
+++ b/stem/socket.py
@@ -464,15 +464,16 @@ def recv_message(control_file):
       a complete message
   """
 
-  parsed_content, raw_content = [], ""
+  parsed_content, raw_content = [], b""
   logging_prefix = "Error while receiving a control message (%s): "
 
   while True:
     try:
-      line = control_file.readline()
+      # With a real socket readline() always provides bytes, but during tests
+      # we might be given a StringIO, in which case it provides unicode under
+      # python 3.x.
 
-      if stem.prereq.is_python_3():
-        line = stem.util.str_tools._to_unicode(line)
+      line = stem.util.str_tools._to_bytes(control_file.readline())
     except AttributeError:
       # if the control_file has been closed then we will receive:
       # AttributeError: 'NoneType' object has no attribute 'recv'
@@ -509,17 +510,21 @@ def recv_message(control_file):
       prefix = logging_prefix % "ProtocolError"
       log.info(prefix + "line too short, \"%s\"" % log.escape(line))
       raise stem.ProtocolError("Badly formatted reply line: too short")
-    elif not re.match(r'^[a-zA-Z0-9]{3}[-+ ]', line):
+    elif not re.match(b'^[a-zA-Z0-9]{3}[-+ ]', line):
       prefix = logging_prefix % "ProtocolError"
       log.info(prefix + "malformed status code/divider, \"%s\"" % log.escape(line))
       raise stem.ProtocolError("Badly formatted reply line: beginning is malformed")
-    elif not line.endswith("\r\n"):
+    elif not line.endswith(b"\r\n"):
       prefix = logging_prefix % "ProtocolError"
       log.info(prefix + "no CRLF linebreak, \"%s\"" % log.escape(line))
       raise stem.ProtocolError("All lines should end with CRLF")
 
     line = line[:-2]  # strips off the CRLF
-    status_code, divider, content = line[:3], line[3], line[4:]
+    status_code, divider, content = line[:3], line[3:4], line[4:]
+
+    if stem.prereq.is_python_3():
+      status_code = stem.util.str_tools._to_unicode(status_code)
+      divider = stem.util.str_tools._to_unicode(divider)
 
     if divider == "-":
       # mid-reply line, keep pulling for more content
@@ -528,8 +533,8 @@ def recv_message(control_file):
       # end of the message, return the message
       parsed_content.append((status_code, divider, content))
 
-      log_message = raw_content.replace("\r\n", "\n").rstrip()
-      log.trace("Received from tor:\n" + log_message)
+      log_message = raw_content.replace(b"\r\n", b"\n").rstrip()
+      log.trace("Received from tor:\n" + stem.util.str_tools._to_unicode(log_message))
 
       return stem.response.ControlMessage(parsed_content, raw_content)
     elif divider == "+":
@@ -538,10 +543,7 @@ def recv_message(control_file):
 
       while True:
         try:
-          line = control_file.readline()
-
-          if stem.prereq.is_python_3():
-            line = stem.util.str_tools._to_unicode(line)
+          line = stem.util.str_tools._to_bytes(control_file.readline())
         except socket.error as exc:
           prefix = logging_prefix % "SocketClosed"
           log.info(prefix + "received an exception while mid-way through a data reply (exception: \"%s\", read content: \"%s\")" % (exc, log.escape(raw_content)))
@@ -549,11 +551,11 @@ def recv_message(control_file):
 
         raw_content += line
 
-        if not line.endswith("\r\n"):
+        if not line.endswith(b"\r\n"):
           prefix = logging_prefix % "ProtocolError"
           log.info(prefix + "CRLF linebreaks missing from a data reply, \"%s\"" % log.escape(raw_content))
           raise stem.ProtocolError("All lines should end with CRLF")
-        elif line == ".\r\n":
+        elif line == b".\r\n":
           break  # data block termination
 
         line = line[:-2]  # strips off the CRLF
@@ -561,22 +563,22 @@ def recv_message(control_file):
         # lines starting with a period are escaped by a second period (as per
         # section 2.4 of the control-spec)
 
-        if line.startswith(".."):
+        if line.startswith(b".."):
           line = line[1:]
 
         # appends to previous content, using a newline rather than CRLF
         # separator (more conventional for multi-line string content outside
         # the windows world)
 
-        content += "\n" + line
+        content += b"\n" + line
 
       parsed_content.append((status_code, divider, content))
     else:
       # this should never be reached due to the prefix regex, but might as well
       # be safe...
       prefix = logging_prefix % "ProtocolError"
-      log.warn(prefix + "\"%s\" isn't a recognized divider type" % line)
-      raise stem.ProtocolError("Unrecognized divider type '%s': %s" % (divider, line))
+      log.warn(prefix + "\"%s\" isn't a recognized divider type" % divider)
+      raise stem.ProtocolError("Unrecognized divider type '%s': %s" % (divider, stem.util.str_tools._to_unicode(line)))
 
 
 def send_formatting(message):
diff --git a/stem/util/log.py b/stem/util/log.py
index ca69e7f..6446c23 100644
--- a/stem/util/log.py
+++ b/stem/util/log.py
@@ -50,7 +50,9 @@ them at your own risk.**
 
 import logging
 
+import stem.prereq
 import stem.util.enum
+import stem.util.str_tools
 
 # Logging runlevels. These are *very* commonly used so including shorter
 # aliases (so they can be referenced as log.DEBUG, log.WARN, etc).
@@ -118,13 +120,17 @@ def logging_level(runlevel):
 
 def escape(message):
   """
-  Escapes specific sequences for logging (newlines, tabs, carriage returns).
+  Escapes specific sequences for logging (newlines, tabs, carriage returns). If
+  the input is **bytes** then this converts it to **unicode** under python 3.x.
 
   :param str message: string to be escaped
 
   :returns: str that is escaped
   """
 
+  if stem.prereq.is_python_3():
+    message = stem.util.str_tools._to_unicode(message)
+
   for pattern, replacement in (("\n", "\\n"), ("\r", "\\r"), ("\t", "\\t")):
     message = message.replace(pattern, replacement)
 





More information about the tor-commits mailing list