Patchwork [gccgo] Don't permit Unicode surrogate pairs in escape sequences

login
register
mail settings
Submitter Ian Taylor
Date Aug. 31, 2010, 6:33 p.m.
Message ID <mcroccieioe.fsf@google.com>
Download mbox | patch
Permalink /patch/63312/
State New
Headers show

Comments

Ian Taylor - Aug. 31, 2010, 6:33 p.m.
This patch changes gccgo to not permit Unicode surrogate pairs in
Unicode escape sequences.  Surrogate pairs are only used in UTF-16, but
in Go, escape sequences always generate UTF-8.  Committed to the gccgo
branch.

Ian

Patch

diff -r 79786d3fc04a go/lex.cc
--- a/go/lex.cc	Tue Aug 31 11:08:59 2010 -0700
+++ b/go/lex.cc	Tue Aug 31 11:30:44 2010 -0700
@@ -1173,9 +1173,16 @@ 
 			+ (hex_value(p[2]) << 8)
 			+ (hex_value(p[3]) << 4)
 			+ hex_value(p[4]));
+	      if (*value >= 0xd800 && *value < 0xe000)
+		{
+		  error_at(this->location(),"invalid unicode code point 0x%x",
+			   *value);
+		  // Use the replacement character.
+		  *value = 0xfffd;
+		}
 	      return p + 5;
 	    }
-	  this->error("invalid little unicode character");
+	  this->error("invalid little unicode code point");
 	  return p + 1;
 
 	case 'U':
@@ -1192,9 +1199,17 @@ 
 			+ (hex_value(p[6]) << 8)
 			+ (hex_value(p[7]) << 4)
 			+ hex_value(p[8]));
+	      if (*value > 0x10ffff
+		  || (*value >= 0xd800 && *value < 0xe000))
+		{
+		  error_at(this->location(), "invalid unicode code point 0x%x",
+			   *value);
+		  // Use the replacement character.
+		  *value = 0xfffd;
+		}
 	      return p + 9;
 	    }
-	  this->error("invalid big unicode character");
+	  this->error("invalid big unicode code point");
 	  return p + 1;
 
 	default:
@@ -1231,7 +1246,7 @@ 
       if (v > 0x10ffff)
 	{
 	  warning_at(location, 0,
-		     "unicode character 0x%x out of range in string", v);
+		     "unicode code point 0x%x out of range in string", v);
 	  // Turn it into the "replacement character".
 	  v = 0xfffd;
 	}
Index: gcc/testsuite/go.test/test/char_lit.go
===================================================================
--- gcc/testsuite/go.test/test/char_lit.go	(revision 163682)
+++ gcc/testsuite/go.test/test/char_lit.go	(working copy)
@@ -30,15 +30,15 @@  func main() {
 		'\xFE' +
 		'\u0123' +
 		'\ubabe' +
-		'\U0123ABCD' +
-		'\Ucafebabe'
+		'\U0010FFFF' +
+		'\U000ebabe'
 		;
-	if '\Ucafebabe' != 0xcafebabe {
-		print("cafebabe wrong\n");
+	if '\U000ebabe' != 0x000ebabe {
+		print("ebabe wrong\n");
 		os.Exit(1)
 	}
-	if i != 0xcc238de1 {
-		print("number is ", i, " should be ", 0xcc238de1, "\n");
+	if i != 0x20e213 {
+		print("number is ", i, " should be ", 0x20e213, "\n");
 		os.Exit(1)
 		}
 }