[tor-commits] [tor/master] string: Add extra UTF-8 test cases

nickm at torproject.org nickm at torproject.org
Tue Jan 7 15:16:24 UTC 2020


commit d62dbb676242dbdfdd121828c97e12e737aa596c
Author: teor <teor at torproject.org>
Date:   Tue Jan 7 17:05:48 2020 +1000

    string: Add extra UTF-8 test cases
    
    These test cases are validated differently by some
    programming languages, because those languages have
    incorrect UTF-8 implementations.
    
    We want to make sure that tor validates them correctly.
    
    Closes ticket 32845.
---
 src/test/test_util.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/src/test/test_util.c b/src/test/test_util.c
index 7f7e157c1..92623ea0c 100644
--- a/src/test/test_util.c
+++ b/src/test/test_util.c
@@ -4104,10 +4104,43 @@ test_util_string_is_utf8(void *ptr)
   tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3));
   tt_int_op(1, OP_EQ, string_is_utf8("\xee\x80\x80", 3));
 
-  // The maximum legal codepoint, 10FFFF.
+  // The minimum legal codepoint, 0x00.
+  tt_int_op(1, OP_EQ, string_is_utf8("\0", 1));
+
+  // The maximum legal codepoint, 0x10FFFF.
   tt_int_op(1, OP_EQ, string_is_utf8("\xf4\x8f\xbf\xbf", 4));
   tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\x80\x80", 4));
 
+  /* Test cases that vary between programming languages /
+   * UTF-8 implementations.
+   * Source: POC||GTFO 19, page 43
+   * https://www.alchemistowl.org/pocorgtfo/
+   */
+
+  // Invalid (in most implementations)
+  // surrogate
+  tt_int_op(0, OP_EQ, string_is_utf8("\xed\xa0\x81", 3));
+  // nullsurrog
+  tt_int_op(0, OP_EQ, string_is_utf8("\x30\x00\xed\xa0\x81", 5));
+  // threehigh
+  tt_int_op(0, OP_EQ, string_is_utf8("\xed\xbf\xbf", 3));
+  // fourhigh
+  tt_int_op(0, OP_EQ, string_is_utf8("\xf4\x90\xbf\xbf", 4));
+  // fivebyte
+  tt_int_op(0, OP_EQ, string_is_utf8("\xfb\x80\x80\x80\x80", 5));
+  // sixbyte
+  tt_int_op(0, OP_EQ, string_is_utf8("\xfd\x80\x80\x80\x80", 5));
+  // sixhigh
+  tt_int_op(0, OP_EQ, string_is_utf8("\xfd\xbf\xbf\xbf\xbf", 5));
+
+  // Valid (in most implementations)
+  // fourbyte
+  tt_int_op(1, OP_EQ, string_is_utf8("\xf0\x90\x8d\x88", 4));
+  // fourbyte2
+  tt_int_op(1, OP_EQ, string_is_utf8("\xf0\xbf\xbf\xbf", 4));
+  // nullbyte
+  tt_int_op(1, OP_EQ, string_is_utf8("\x30\x31\x32\x00\x33", 5));
+
  done:
   ;
 }





More information about the tor-commits mailing list