commit b4c4835d957463d6a453d9d7f7ad6007627abc96 Author: Damian Johnson atagar@torproject.org Date: Tue Jan 29 09:03:30 2013 -0800
Normalizing descriptor handling as unicode
Our python 3 descriptor integration tests were having trouble due to the ASCII/unicode switch. Adding a file interceptor so we always parse descriptors as unicode.
Yes, yes, I know. Ewwww. I definitely don't like the _UnicodeReader helper so suggestions welcome for an alternative. --- stem/descriptor/__init__.py | 65 ++++++++++++++++++++++++++++++++++++++ stem/descriptor/networkstatus.py | 4 +- stem/exit_policy.py | 6 ++-- stem/util/connection.py | 4 +- stem/util/tor_tools.py | 6 ++-- test/integ/descriptor/reader.py | 2 +- test/settings.cfg | 2 +- 7 files changed, 77 insertions(+), 12 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py index 0e4a3e1..fe6c03c 100644 --- a/stem/descriptor/__init__.py +++ b/stem/descriptor/__init__.py @@ -27,6 +27,8 @@ __all__ = [ import os import re
+import stem.prereq + try: # added in python 2.7 from collections import OrderedDict @@ -97,6 +99,10 @@ def parse_file(descriptor_file, descriptor_type = None, path = None, validate = import stem.descriptor.extrainfo_descriptor import stem.descriptor.networkstatus
+ # attempt to read content as unicode + + descriptor_file = _UnicodeReader(descriptor_file) + # The tor descriptor specifications do not provide a reliable method for # identifying a descriptor file's type and version so we need to guess # based on its filename. Metrics descriptors, however, can be identified @@ -236,6 +242,65 @@ class Descriptor(object): return self._raw_contents
+class _UnicodeReader(object): + """ + File-like object that wraps another file. This replaces read ASCII bytes with + unicode content. This only supports read operations. + """ + + def __init__(self, wrapped_file): + self.wrapped_file = wrapped_file + + def close(self): + return self.wrapped_file.close() + + def getvalue(self): + return self.wrapped_file.getvalue() + + def isatty(self): + return self.wrapped_file.isatty() + + def next(self): + return self.wrapped_file.next() + + def read(self, n = -1): + return self._to_unicode(self.wrapped_file.read(n)) + + def readline(self): + return self._to_unicode(self.wrapped_file.readline()) + + def readlines(self, sizehint = 0): + # being careful to do in-place conversion so we don't accidently double our + # memory usage + + results = self.wrapped_file.readlines(sizehint) + + for i in xrange(len(results)): + results[i] = self._to_unicode(results[i]) + + return results + + def seek(self, pos, mode = 0): + return self.wrapped_file.seek(pos, mode) + + def tell(self): + return self.wrapped_file.tell() + + def _to_unicode(self, msg): + if msg is None: + return msg + + if stem.prereq.is_python_3(): + is_unicode = isinstance(msg, str) + else: + is_unicode = isinstance(msg, unicode) + + if is_unicode: + return msg + else: + return msg.decode("utf-8", "replace") + + def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_first = False, skip = False, end_position = None, include_ending_keyword = False): """ Reads from the descriptor file until we get to one of the given keywords or reach the diff --git a/stem/descriptor/networkstatus.py b/stem/descriptor/networkstatus.py index b4bc188..69f1a44 100644 --- a/stem/descriptor/networkstatus.py +++ b/stem/descriptor/networkstatus.py @@ -1343,10 +1343,10 @@ class DocumentSignature(object):
if validate: if not stem.util.tor_tools.is_valid_fingerprint(identity): - raise ValueError("Malformed fingerprint (%s) in the document signature" % (identity)) + raise ValueError("Malformed fingerprint (%s) in the document signature" % identity)
if not stem.util.tor_tools.is_valid_fingerprint(key_digest): - raise ValueError("Malformed key digest (%s) in the document signature" % (key_digest)) + raise ValueError("Malformed key digest (%s) in the document signature" % key_digest)
self.method = method self.identity = identity diff --git a/stem/exit_policy.py b/stem/exit_policy.py index 0ad2c54..e60f121 100644 --- a/stem/exit_policy.py +++ b/stem/exit_policy.py @@ -104,7 +104,7 @@ def get_config_policy(rules): :raises: **ValueError** if input isn't a valid tor exit policy """
- if isinstance(rules, str): + if isinstance(rules, (str, unicode)): rules = rules.split(',')
result = [] @@ -143,7 +143,7 @@ class ExitPolicy(object): def __init__(self, *rules): # sanity check the types for rule in rules: - if not isinstance(rule, (str, ExitPolicyRule)): + if not isinstance(rule, (str, unicode, ExitPolicyRule)): raise TypeError("Exit policy rules can only contain strings or ExitPolicyRules, got a %s (%s)" % (type(rule), rules))
self._rules = None # lazily loaded series of ExitPolicyRule @@ -300,7 +300,7 @@ class ExitPolicy(object): is_all_accept, is_all_reject = True, True
for rule in self._input_rules: - if isinstance(rule, str): + if isinstance(rule, (str, unicode)): rule = ExitPolicyRule(rule.strip())
if rule.is_accept: diff --git a/stem/util/connection.py b/stem/util/connection.py index d7c0299..86ba49a 100644 --- a/stem/util/connection.py +++ b/stem/util/connection.py @@ -40,7 +40,7 @@ def is_valid_ip_address(address): :returns: **True** if input is a valid IPv4 address, **False** otherwise """
- if not isinstance(address, str): + if not isinstance(address, (str, unicode)): return False
# checks if theres four period separated values @@ -108,7 +108,7 @@ def is_valid_port(entry, allow_zero = False): return False
return True - elif isinstance(entry, str): + elif isinstance(entry, (str, unicode)): if not entry.isdigit(): return False elif entry[0] == "0" and len(entry) > 1: diff --git a/stem/util/tor_tools.py b/stem/util/tor_tools.py index 2e52cee..e61a96c 100644 --- a/stem/util/tor_tools.py +++ b/stem/util/tor_tools.py @@ -45,7 +45,7 @@ def is_valid_fingerprint(entry, check_prefix = False): :returns: **True** if the string could be a relay fingerprint, **False** otherwise """
- if not isinstance(entry, str): + if not isinstance(entry, (str, unicode)): return False elif check_prefix: if not entry or entry[0] != "$": @@ -65,7 +65,7 @@ def is_valid_nickname(entry): :returns: **True** if the string could be a nickname, **False** otherwise """
- if not isinstance(entry, str): + if not isinstance(entry, (str, unicode)): return False
return bool(NICKNAME_PATTERN.match(entry)) @@ -78,7 +78,7 @@ def is_valid_circuit_id(entry): :returns: **True** if the string could be a circuit id, **False** otherwise """
- if not isinstance(entry, str): + if not isinstance(entry, (str, unicode)): return False
return bool(CIRC_ID_PATTERN.match(entry)) diff --git a/test/integ/descriptor/reader.py b/test/integ/descriptor/reader.py index 97ea27c..936cf39 100644 --- a/test/integ/descriptor/reader.py +++ b/test/integ/descriptor/reader.py @@ -64,7 +64,7 @@ def _get_raw_tar_descriptors(): if tar_entry.isfile(): entry = tar_file.extractfile(tar_entry) entry.readline() # strip header - raw_descriptors.append(entry.read()) + raw_descriptors.append(entry.read().decode("utf-8", "replace")) entry.close() finally: if tar_file: diff --git a/test/settings.cfg b/test/settings.cfg index 4d99bfe..3de9d00 100644 --- a/test/settings.cfg +++ b/test/settings.cfg @@ -156,7 +156,7 @@ target.torrc RUN_PTRACE => PORT, PTRACE pyflakes.ignore stem/prereq.py => 'RSA' imported but unused pyflakes.ignore stem/prereq.py => 'asn1' imported but unused pyflakes.ignore stem/prereq.py => 'long_to_bytes' imported but unused -pyflakes.ignore stem/descriptor/__init__.py => redefinition of unused 'OrderedDict' from line 32 +pyflakes.ignore stem/descriptor/__init__.py => redefinition of unused 'OrderedDict' from line 34 pyflakes.ignore stem/util/str_tools.py => redefinition of function '_to_bytes' from line 53 pyflakes.ignore test/mocking.py => undefined name 'builtins' pyflakes.ignore test/unit/response/events.py => 'from stem import *' used; unable to detect undefined names
tor-commits@lists.torproject.org