commit e4f063eb30d140dcf0c04074dfe4ab82307fd7c3 Author: Nicholas Nethercote nnethercote@mozilla.com Date: Tue Sep 11 09:36:07 2018 +1000
Bug 1489744 - Fix a bounds violation crash in the prefs parser. r=glandium, a=RyanVM
Currently, if a get_char() call returns EOF, the index moves beyond the buffer's bounds and get_char() cannot be called again without triggering a panic. As a result, everywhere that encounters an EOF and then does subsequent parsing ungets the EOF... except there was one place that failed to do that: the match case for CharKind::Slash in get_token(). This meant that a single '/' at the end of the input could trigger a bounds violation (but only if it is the start of a new token).
This EOF-unget requirement is subtle and easy to get wrong, so this patch eliminates it. get_char() now can be called repeatedly after an EOF, and will return EOF on each subsequent call. This means that some of the existing unget_char() calls can be removed. Some others are still necessary to get line numbers correct in error messages, but the outcome of mishandled cases is now much less drastic -- an incorrect line number in an error message instead of a panic.
The patch also clarifies a couple of related comments.
--HG-- extra : source : 5097db630d1fb51f7f3fcd5523a924fa823e77ce extra : intermediate-source : 2676e53112584b2c08b9a1bf02e15cd9bb0d0ee5 --- modules/libpref/parser/src/lib.rs | 70 +++++++++++++++++++---------------- modules/libpref/test/gtest/Parser.cpp | 9 +++-- 2 files changed, 45 insertions(+), 34 deletions(-)
diff --git a/modules/libpref/parser/src/lib.rs b/modules/libpref/parser/src/lib.rs index 8d8b9a6125e4..5ea7650252f5 100644 --- a/modules/libpref/parser/src/lib.rs +++ b/modules/libpref/parser/src/lib.rs @@ -54,16 +54,10 @@
// This parser uses several important optimizations. // -// - Because "'\0' means EOF" is part of the grammar (see above) we can match -// EOF as a normal char/token, which means we can avoid a typical "do we -// still have chars remaining?" test in get_char(), which gives a speedup -// because get_char() is a very hot function. (Actually, Rust would -// bounds-check this function anyway, so we have get_char_unchecked() which -// is used for the two hottest call sites.) -// -// This also means EOF is representable by a u8. If EOF was represented by an -// out-of-band value such as -1 or 256, we'd have to return a larger type -// such as u16 or i16 from get_char(). +// - Because "'\0' means EOF" is part of the grammar (see above), EOF is +// representable by a u8. If EOF was represented by an out-of-band value such +// as -1 or 256, we'd have to return a larger type such as u16 or i16 from +// get_char(). // // - When starting a new token, it uses a lookup table with the first char, // which quickly identifies what kind of token it will be. Furthermore, if @@ -192,7 +186,7 @@ enum Token { #[derive(Clone, Copy, PartialEq)] enum CharKind { // These are ordered by frequency. See the comment in GetToken(). - SingleChar, // Unambiguous single-char tokens: [()+,-] + SingleChar, // Unambiguous single-char tokens: [()+,-] or EOF SpaceNL, // [\t\v\f \n] Keyword, // [A-Za-z_] Quote, // ["'] @@ -535,13 +529,21 @@ impl<'t> Parser<'t> {
#[inline(always)] fn get_char(&mut self) -> u8 { - let c = self.buf[self.i]; - self.i += 1; - c + // We do the bounds check ourselves so we can return EOF on failure. + // (Although the buffer is guaranteed to end in an EOF char, we might + // go one char past that, whereupon we must return EOF again.) + if self.i < self.buf.len() { + let c = unsafe { *self.buf.get_unchecked(self.i) }; + self.i += 1; + c + } else { + debug_assert!(self.i == self.buf.len()); + EOF + } }
- // This function skips the bounds check in non-optimized builds. Using it - // at the hottest two call sites gives a ~15% parsing speed boost. + // This function skips the bounds check in optimized builds. Using it at + // the hottest two call sites gives a ~15% parsing speed boost. #[inline(always)] unsafe fn get_char_unchecked(&mut self) -> u8 { debug_assert!(self.i < self.buf.len()); @@ -568,9 +570,11 @@ impl<'t> Parser<'t> { #[inline(always)] fn match_single_line_comment(&mut self) { loop { - // To reach here, the previous char must have been '/', and - // assertions elsewhere ensure that there must be at least one - // subsequent char (the '\0' for EOF). + // To reach here, the previous char must have been '/' (if this is + // the first loop iteration) or non-special (if this is the second + // or subsequent iteration), and assertions elsewhere ensure that + // there must be at least one subsequent char after those chars + // (the '\0' for EOF). let c = unsafe { self.get_char_unchecked() };
// All the special chars have value <= b'\r'. @@ -588,8 +592,6 @@ impl<'t> Parser<'t> { break; } EOF => { - // Unget EOF so subsequent calls to get_char() are safe. - self.unget_char(); break; } _ => continue @@ -615,8 +617,6 @@ impl<'t> Parser<'t> { self.match_char(b'\n'); } EOF => { - // Unget EOF so subsequent calls to get_char() are safe. - self.unget_char(); return false } _ => continue @@ -630,11 +630,10 @@ impl<'t> Parser<'t> { for _ in 0..ndigits { value = value << 4; match self.get_char() { - c @ b'0'... b'9' => value += (c - b'0') as u16, + c @ b'0'...b'9' => value += (c - b'0') as u16, c @ b'A'...b'F' => value += (c - b'A') as u16 + 10, c @ b'a'...b'f' => value += (c - b'a') as u16 + 10, _ => { - // Unget in case the char was a closing quote or EOF. self.unget_char(); return None; } @@ -716,7 +715,15 @@ impl<'t> Parser<'t> { return Token::Error("unterminated /* comment"); } } - _ => return Token::Error("expected '/' or '*' after '/'") + c @ _ => { + if c == b'\n' || c == b'\r' { + // Unget the newline char; the outer loop will + // reget it and adjust self.line_num + // appropriately. + self.unget_char(); + } + return Token::Error("expected '/' or '*' after '/'"); + } } continue; } @@ -871,9 +878,12 @@ impl<'t> Parser<'t> { } continue; // We don't want to str_buf.push(c2) below. } - _ => { - // Unget in case the char is an EOF. - self.unget_char(); + c @ _ => { + if c == b'\n' || c == b'\r' { + // Unget the newline char; the outer loop will + // reget it and adjust self.line_num appropriately. + self.unget_char(); + } self.string_error_token( &mut token, "unexpected escape sequence character after '\'"); continue; @@ -894,8 +904,6 @@ impl<'t> Parser<'t> { }
} else if c == EOF { - // Unget EOF so subsequent calls to get_char() are safe. - self.unget_char(); self.string_error_token(&mut token, "unterminated string literal"); break;
diff --git a/modules/libpref/test/gtest/Parser.cpp b/modules/libpref/test/gtest/Parser.cpp index d5469dd38007..fe9dea87ee60 100644 --- a/modules/libpref/test/gtest/Parser.cpp +++ b/modules/libpref/test/gtest/Parser.cpp @@ -293,11 +293,14 @@ pref("string.bad-keyword", TRUE); "test:3: prefs parse error: unterminated /* comment\n" );
- // Malformed comment. + // Malformed comments (single slashes), followed by whitespace, newline, EOF. DEFAULT(R"( -/ comment - )", +/ comment; +/ +; /)", "test:2: prefs parse error: expected '/' or '*' after '/'\n" + "test:3: prefs parse error: expected '/' or '*' after '/'\n" + "test:4: prefs parse error: expected '/' or '*' after '/'\n" );
// C++-style comment ending in EOF (1).