[tor-commits] [stem/master] Keyword parsing broken for tab dividers

atagar at torproject.org atagar at torproject.org
Sat Oct 13 18:35:45 UTC 2012


commit 756625453f5ac0ff9415d4cb04bc373c70bf79ef
Author: Damian Johnson <atagar at torproject.org>
Date:   Sat Aug 18 18:53:07 2012 -0700

    Keyword parsing broken for tab dividers
    
    Most descriptor types (server/extrainfo descriptors, V2 network status
    documents, etc) have their divider whitespace defined as one or more spaces or
    tabs. God knows why they chose that - there's no good reason for outputting that
    and it makes parsing a pita. An exception is V3 network status documents, which
    redefine SP as being a single space.
    
    I've never seen this in the wild so this is mostly an academic concern, but I'm
    trying very, very hard to make stem spec conformant.
    
    The _read_until_keywords() and _skip_until_keywords() functions only worked if
    the keyword was divided by a space, not a tab.
    
    I sank two hours into fixing a bug with the KEYWORD_LINE regex which caused it
    to match against signature content. Long story short, we were accidentally
    fuzzing our parser by letting there be a 'keyword' and 'value' when there was
    no whitespace between the two.
---
 stem/descriptor/__init__.py |   20 +++++++++++++++-----
 1 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index e6f3e4c..814fc49 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -28,7 +28,7 @@ import datetime
 
 KEYWORD_CHAR    = "a-zA-Z0-9-"
 WHITESPACE      = " \t"
-KEYWORD_LINE    = re.compile("^([%s]+)[%s]*(.*)$" % (KEYWORD_CHAR, WHITESPACE))
+KEYWORD_LINE    = re.compile("^([%s]+)(?:[%s]+(.*))?$" % (KEYWORD_CHAR, WHITESPACE))
 PGP_BLOCK_START = re.compile("^-----BEGIN ([%s%s]+)-----$" % (KEYWORD_CHAR, WHITESPACE))
 PGP_BLOCK_END   = "-----END %s-----"
 
@@ -278,8 +278,13 @@ def _read_until_keywords(keywords, descriptor_file, inclusive = False, ignore_fi
     line = descriptor_file.readline()
     if not line: break # EOF
     
-    if " " in line: line_keyword = line.split(" ", 1)[0]
-    else: line_keyword = line.strip()
+    line_match = KEYWORD_LINE.match(line)
+    
+    if not line_match:
+      # no spaces or tabs in the line
+      line_keyword = line.strip()
+    else:
+      line_keyword = line_match.groups()[0]
     
     if line_keyword in keywords:
       if inclusive: content.append(line)
@@ -310,8 +315,13 @@ def _skip_until_keywords(keywords, descriptor_file, inclusive = False):
     line = descriptor_file.readline()
     if not line: break # EOF
     
-    if " " in line: line_keyword = line.split(" ", 1)[0]
-    else: line_keyword = line.strip()
+    line_match = KEYWORD_LINE.match(line)
+    
+    if not line_match:
+      # no spaces or tabs in the line
+      line_keyword = line.strip()
+    else:
+      line_keyword = line_match.groups()[0]
     
     if line_keyword in keywords:
       if not inclusive: descriptor_file.seek(last_position)





More information about the tor-commits mailing list