commit 756625453f5ac0ff9415d4cb04bc373c70bf79ef Author: Damian Johnson atagar@torproject.org Date: Sat Aug 18 18:53:07 2012 -0700
Keyword parsing broken for tab dividers
Most descriptor types (server/extrainfo descriptors, V2 network status documents, etc.) have their divider whitespace defined as one or more spaces or tabs. God knows why they chose that - there's no good reason for outputting that and it makes parsing a PITA. An exception is V3 network status documents, which redefine SP as being a single space.
I've never seen this in the wild so this is mostly an academic concern, but I'm trying very, very hard to make stem spec conformant.
The _read_until_keywords() and _skip_until_keywords() functions only worked if the keyword was divided by a space, not a tab.
I sank two hours into fixing a bug with the KEYWORD_LINE regex which caused it to match against signature content. Long story short, we were accidentally fuzzing our parser by letting there be a 'keyword' and 'value' when there was no whitespace between the two. --- stem/descriptor/__init__.py | 20 +++++++++++++++----- 1 files changed, 15 insertions(+), 5 deletions(-)
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py index e6f3e4c..814fc49 100644 --- a/stem/descriptor/__init__.py +++ b/stem/descriptor/__init__.py @@ -28,7 +28,7 @@ import datetime
KEYWORD_CHAR = "a-zA-Z0-9-" WHITESPACE = " \t" -KEYWORD_LINE = re.compile("^([%s]+)[%s]*(.*)$" % (KEYWORD_CHAR, WHITESPACE)) +KEYWORD_LINE = re.compile("^([%s]+)(?:[%s]+(.*))?$" % (KEYWORD_CHAR, WHITESPACE)) PGP_BLOCK_START = re.compile("^-----BEGIN ([%s%s]+)-----$" % (KEYWORD_CHAR, WHITESPACE)) PGP_BLOCK_END = "-----END %s-----"
@@ -278,8 +278,13 @@ def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_fi line = descriptor_file.readline() if not line: break # EOF
- if " " in line: line_keyword = line.split(" ", 1)[0] - else: line_keyword = line.strip() + line_match = KEYWORD_LINE.match(line) + + if not line_match: + # no spaces or tabs in the line + line_keyword = line.strip() + else: + line_keyword = line_match.groups()[0]
if line_keyword in keywords: if inclusive: content.append(line) @@ -310,8 +315,13 @@ def _skip_until_keywords(keywords, descriptor_file, inclusive = False): line = descriptor_file.readline() if not line: break # EOF
- if " " in line: line_keyword = line.split(" ", 1)[0] - else: line_keyword = line.strip() + line_match = KEYWORD_LINE.match(line) + + if not line_match: + # no spaces or tabs in the line + line_keyword = line.strip() + else: + line_keyword = line_match.groups()[0]
if line_keyword in keywords: if not inclusive: descriptor_file.seek(last_position)
tor-commits@lists.torproject.org