From c01b7cdf97e69255dd4a5dddda782ba29a32b3d1 Mon Sep 17 00:00:00 2001 From: Tom Tromey Date: Wed, 20 Jun 2001 16:21:24 +0000 Subject: [PATCH] re PR java/2319 (invalid UTF-8 sequences should be rejected) * lex.c (java_read_char): Disallow invalid and overlong sequences. Fixes PR java/2319. From-SVN: r43475 --- gcc/java/ChangeLog | 5 +++++ gcc/java/lex.c | 33 +++++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/gcc/java/ChangeLog b/gcc/java/ChangeLog index f55ad6cf168..96e1ba5743c 100644 --- a/gcc/java/ChangeLog +++ b/gcc/java/ChangeLog @@ -1,3 +1,8 @@ +2001-06-19 Tom Tromey + + * lex.c (java_read_char): Disallow invalid and overlong + sequences. Fixes PR java/2319. + 2001-06-05 Jeff Sturm * decl.c (create_primitive_vtable): Don't call make_decl_rtl. diff --git a/gcc/java/lex.c b/gcc/java/lex.c index 28a73e3874b..35cd31749ca 100644 --- a/gcc/java/lex.c +++ b/gcc/java/lex.c @@ -454,15 +454,21 @@ java_read_char (lex) if (c == EOF) return UEOF; if (c < 128) - return (unicode_t)c; + return (unicode_t) c; else { if ((c & 0xe0) == 0xc0) { c1 = getc (lex->finput); if ((c1 & 0xc0) == 0x80) - return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f)); - c = c1; + { + unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f)); + /* Check for valid 2-byte characters. We explicitly + allow \0 because this encoding is common in the + Java world. */ + if (r == 0 || (r >= 0x80 && r <= 0x7ff)) + return r; + } } else if ((c & 0xf0) == 0xe0) { @@ -471,16 +477,23 @@ java_read_char (lex) { c2 = getc (lex->finput); if ((c2 & 0xc0) == 0x80) - return (unicode_t)(((c & 0xf) << 12) + - (( c1 & 0x3f) << 6) + (c2 & 0x3f)); - else - c = c2; + { + unicode_t r = (unicode_t)(((c & 0xf) << 12) + + (( c1 & 0x3f) << 6) + + (c2 & 0x3f)); + /* Check for valid 3-byte characters. + Don't allow surrogate, \ufffe or \uffff. */ + if (r >= 0x800 && r <= 0xffff + && ! (r >= 0xd800 && r <= 0xdfff) + && r != 0xfffe && r != 0xffff) + return r; + } } - else - c = c1; } - /* We simply don't support invalid characters. */ + /* We simply don't support invalid characters. We also + don't support 4-, 5-, or 6-byte UTF-8 sequences, as these + cannot be valid Java characters. */ java_lex_error ("malformed UTF-8 character", 0); } } -- 2.30.2