[tor-commits] [tor-browser/tor-browser-60.2.0esr-8.0-1] Bug 1489744 - Fix a bounds violation crash in the prefs parser. r=glandium, a=RyanVM

Thu Sep 20 08:18:49 UTC 2018

commit 982ef502b7c6d264afd3c403f20c9b6ef7840034
Author: Nicholas Nethercote <nnethercote at mozilla.com>
Date:   Tue Sep 11 09:36:07 2018 +1000

    Bug 1489744 - Fix a bounds violation crash in the prefs parser. r=glandium, a=RyanVM
    
    Currently, if a get_char() call returns EOF, the index moves beyond the
    buffer's bounds and get_char() cannot be called again without triggering a
    panic. As a result, everywhere that encounters an EOF and then does subsequent
    parsing ungets the EOF... except there was one place that failed to do that:
    the match case for CharKind::Slash in get_token(). This meant that a single '/'
    at the end of the input could trigger a bounds violation (but only if it is the
    start of a new token).
    
    This EOF-unget requirement is subtle and easy to get wrong, so this patch
    eliminates it. get_char() now can be called repeatedly after an EOF, and will
    return EOF on each subsequent call. This means that some of the existing
    unget_char() calls can be removed. Some others are still necessary to get line
    numbers correct in error messages, but the outcome of mishandled cases is now
    much less drastic -- an incorrect line number in an error message instead of a
    panic.
    
    The patch also clarifies a couple of related comments.
    
    --HG--
    extra : source : 5097db630d1fb51f7f3fcd5523a924fa823e77ce
    extra : intermediate-source : 2676e53112584b2c08b9a1bf02e15cd9bb0d0ee5
---
 modules/libpref/parser/src/lib.rs     | 70 +++++++++++++++++++----------------
 modules/libpref/test/gtest/Parser.cpp |  9 +++--
 2 files changed, 45 insertions(+), 34 deletions(-)

diff --git a/modules/libpref/parser/src/lib.rs b/modules/libpref/parser/src/lib.rs
index 8d8b9a6125e4..5ea7650252f5 100644
--- a/modules/libpref/parser/src/lib.rs
+++ b/modules/libpref/parser/src/lib.rs
@@ -54,16 +54,10 @@
 
 // This parser uses several important optimizations.
 //
-// - Because "'\0' means EOF" is part of the grammar (see above) we can match
-//   EOF as a normal char/token, which means we can avoid a typical "do we
-//   still have chars remaining?" test in get_char(), which gives a speedup
-//   because get_char() is a very hot function. (Actually, Rust would
-//   bounds-check this function anyway, so we have get_char_unchecked() which
-//   is used for the two hottest call sites.)
-//
-//   This also means EOF is representable by a u8. If EOF was represented by an
-//   out-of-band value such as -1 or 256, we'd have to return a larger type
-//   such as u16 or i16 from get_char().
+// - Because "'\0' means EOF" is part of the grammar (see above), EOF is
+//   representable by a u8. If EOF was represented by an out-of-band value such
+//   as -1 or 256, we'd have to return a larger type such as u16 or i16 from
+//   get_char().
 //
 // - When starting a new token, it uses a lookup table with the first char,
 //   which quickly identifies what kind of token it will be. Furthermore, if
@@ -192,7 +186,7 @@ enum Token {
 #[derive(Clone, Copy, PartialEq)]
 enum CharKind {
     // These are ordered by frequency. See the comment in GetToken().
-    SingleChar, // Unambiguous single-char tokens: [()+,-]
+    SingleChar, // Unambiguous single-char tokens: [()+,-] or EOF
     SpaceNL,    // [\t\v\f \n]
     Keyword,    // [A-Za-z_]
     Quote,      // ["']
@@ -535,13 +529,21 @@ impl<'t> Parser<'t> {
 
     #[inline(always)]
     fn get_char(&mut self) -> u8 {
-        let c = self.buf[self.i];
-        self.i += 1;
-        c
+        // We do the bounds check ourselves so we can return EOF on failure.
+        // (Although the buffer is guaranteed to end in an EOF char, we might
+        // go one char past that, whereupon we must return EOF again.)
+        if self.i < self.buf.len() {
+            let c = unsafe { *self.buf.get_unchecked(self.i) };
+            self.i += 1;
+            c
+        } else {
+            debug_assert!(self.i == self.buf.len());
+            EOF
+        }
     }
 
-    // This function skips the bounds check in non-optimized builds. Using it
-    // at the hottest two call sites gives a ~15% parsing speed boost.
+    // This function skips the bounds check in optimized builds. Using it at
+    // the hottest two call sites gives a ~15% parsing speed boost.
     #[inline(always)]
     unsafe fn get_char_unchecked(&mut self) -> u8 {
         debug_assert!(self.i < self.buf.len());
@@ -568,9 +570,11 @@ impl<'t> Parser<'t> {
     #[inline(always)]
     fn match_single_line_comment(&mut self) {
         loop {
-            // To reach here, the previous char must have been '/', and
-            // assertions elsewhere ensure that there must be at least one
-            // subsequent char (the '\0' for EOF).
+            // To reach here, the previous char must have been '/' (if this is
+            // the first loop iteration) or non-special (if this is the second
+            // or subsequent iteration), and assertions elsewhere ensure that
+            // there must be at least one subsequent char after those chars
+            // (the '\0' for EOF).
             let c = unsafe { self.get_char_unchecked() };
 
             // All the special chars have value <= b'\r'.
@@ -588,8 +592,6 @@ impl<'t> Parser<'t> {
                     break;
                 }
                 EOF => {
-                    // Unget EOF so subsequent calls to get_char() are safe.
-                    self.unget_char();
                     break;
                 }
                 _ => continue
@@ -615,8 +617,6 @@ impl<'t> Parser<'t> {
                     self.match_char(b'\n');
                 }
                 EOF => {
-                    // Unget EOF so subsequent calls to get_char() are safe.
-                    self.unget_char();
                     return false
                 }
                 _ => continue
@@ -630,11 +630,10 @@ impl<'t> Parser<'t> {
         for _ in 0..ndigits {
             value = value << 4;
             match self.get_char() {
-                c @ b'0'... b'9' => value += (c - b'0') as u16,
+                c @ b'0'...b'9' => value += (c - b'0') as u16,
                 c @ b'A'...b'F' => value += (c - b'A') as u16 + 10,
                 c @ b'a'...b'f' => value += (c - b'a') as u16 + 10,
                 _ => {
-                    // Unget in case the char was a closing quote or EOF.
                     self.unget_char();
                     return None;
                 }
@@ -716,7 +715,15 @@ impl<'t> Parser<'t> {
                                 return Token::Error("unterminated /* comment");
                             }
                         }
-                        _ => return Token::Error("expected '/' or '*' after '/'")
+                        c @ _ =>  {
+                            if c == b'\n' || c == b'\r' {
+                                // Unget the newline char; the outer loop will
+                                // reget it and adjust self.line_num
+                                // appropriately.
+                                self.unget_char();
+                            }
+                            return Token::Error("expected '/' or '*' after '/'");
+                        }
                     }
                     continue;
                 }
@@ -871,9 +878,12 @@ impl<'t> Parser<'t> {
                         }
                         continue; // We don't want to str_buf.push(c2) below.
                     }
-                    _ => {
-                        // Unget in case the char is an EOF.
-                        self.unget_char();
+                    c @ _ => {
+                        if c == b'\n' || c == b'\r' {
+                            // Unget the newline char; the outer loop will
+                            // reget it and adjust self.line_num appropriately.
+                            self.unget_char();
+                        }
                         self.string_error_token(
                             &mut token, "unexpected escape sequence character after '\\'");
                         continue;
@@ -894,8 +904,6 @@ impl<'t> Parser<'t> {
                 }
 
             } else if c == EOF {
-                // Unget EOF so subsequent calls to get_char() are safe.
-                self.unget_char();
                 self.string_error_token(&mut token, "unterminated string literal");
                 break;
 
diff --git a/modules/libpref/test/gtest/Parser.cpp b/modules/libpref/test/gtest/Parser.cpp
index d5469dd38007..fe9dea87ee60 100644
--- a/modules/libpref/test/gtest/Parser.cpp
+++ b/modules/libpref/test/gtest/Parser.cpp
@@ -293,11 +293,14 @@ pref("string.bad-keyword", TRUE);
     "test:3: prefs parse error: unterminated /* comment\n"
   );
 
-  // Malformed comment.
+  // Malformed comments (single slashes), followed by whitespace, newline, EOF.
   DEFAULT(R"(
-/ comment
-    )",
+/ comment;
+/
+; /)",
     "test:2: prefs parse error: expected '/' or '*' after '/'\n"
+    "test:3: prefs parse error: expected '/' or '*' after '/'\n"
+    "test:4: prefs parse error: expected '/' or '*' after '/'\n"
   );
 
   // C++-style comment ending in EOF (1).