diff mbox

C1X Unicode strings without raw strings

Message ID Pine.LNX.4.64.1108181514450.11806@digraph.polyomino.org.uk
State New
Headers show

Commit Message

Joseph Myers Aug. 18, 2011, 3:15 p.m. UTC
C1X has Unicode strings (u"", U"", u8"") but not raw strings.  I've
applied this patch to enable the appropriate features in C1X mode.

C1X also has predefined macros __STDC_UTF_16__ and __STDC_UTF_32__ to
indicate that char16_t and char32_t really are UTF-16 and UTF-32
(which they always are with GCC) rather than some other encoding.
Jason, I've made GCC define those macros for C only, and only if the
uliterals feature is enabled (since without that feature they aren't
particularly meaningful).  C++, at least as of N3291, follows C TR
19769 in having those macros defined by <cuchar> rather than
predefined by the compiler.  (I'm presuming that difference from C1X
is still there in the approved version of C++0X, although it's
doubtful that C++ *should* differ from C1X in this way.)

Bootstrapped with no regressions on x86_64-unknown-linux-gnu.  Applied
to mainline.

gcc/testsuite:
2011-08-18  Joseph Myers  <joseph@codesourcery.com>

	* gcc.dg/c1x-uni-string-1.c, gcc.dg/c1x-uni-string-2.c: New tests.

libcpp:
2011-08-18  Joseph Myers  <joseph@codesourcery.com>

	* include/cpplib.h (struct cpp_options): Add rliterals.
	* init.c (struct lang_flags, lang_defaults): Add rliterals.
	(cpp_set_lang): Set rliterals option.
	(cpp_init_builtins): Define __STDC_UTF_16__ and __STDC_UTF_32__.
	* lex.c (_cpp_lex_direct): Only accept raw strings if rliterals.

Comments

Michael Matz Aug. 18, 2011, 3:21 p.m. UTC | #1
Hi,

On Thu, 18 Aug 2011, Joseph S. Myers wrote:

> @@ -315,6 +315,10 @@ struct cpp_options
>    /* Nonzero means process u/U prefix literals (UTF-16/32).  */
>    unsigned char uliterals;
>  
> +  /* Nonzero means process r/R rax strings.  If this is set, uliterals
> +     must be set as well.  */
> +  unsigned char rliterals;
> +

s/rax/raw/


Ciao,
Michael.
Jason Merrill Aug. 18, 2011, 9:48 p.m. UTC | #2
On 08/18/2011 11:15 AM, Joseph S. Myers wrote:
> predefined by the compiler.  (I'm presuming that difference from C1X
> is still there in the approved version of C++0X, although it's
> doubtful that C++ *should* differ from C1X in this way.)

Thanks, I've raised the issue.

Jason
diff mbox

Patch

Index: gcc/testsuite/gcc.dg/c1x-uni-string-1.c
===================================================================
--- gcc/testsuite/gcc.dg/c1x-uni-string-1.c	(revision 0)
+++ gcc/testsuite/gcc.dg/c1x-uni-string-1.c	(revision 0)
@@ -0,0 +1,112 @@ 
+/* Test Unicode strings in C1X.  Test valid code.  */
+/* { dg-do run } */
+/* { dg-options "-std=c1x -pedantic-errors" } */
+
+/* More thorough tests are in c-c++-common/raw-string-*.c; this test
+   verifies the particular subset (Unicode but not raw strings) that
+   is in C1X.  */
+
+typedef __CHAR16_TYPE__ char16_t;
+typedef __CHAR32_TYPE__ char32_t;
+typedef __SIZE_TYPE__ size_t;
+
+extern void abort (void);
+extern void exit (int);
+extern int memcmp (const void *, const void *, size_t);
+
+#define R "(R)"
+#define u8R "(u8R)"
+#define uR "(uR)"
+#define UR "(UR)"
+#define LR "(LR)"
+#define u8 randomu8
+#define u randomu
+#define U randomU
+
+const char su8[] = u8"a\u010d";
+const char su8a[] = "a\xc4\x8d";
+
+const char16_t su16[] = u"\u0567";
+const char16_t su16a[] = { 0x0567, 0 };
+
+const char32_t su32[] = U"\u0123";
+const char32_t su32a[] = { 0x0123, 0 };
+
+const char tu[] = R"a";
+const char tua[] = "(R)a";
+
+const char tu8[] = u8R"b";
+const char tu8a[] = "(u8R)b";
+
+const char tu16[] = uR"c";
+const char tu16a[] = "(uR)c";
+
+const char tu32[] = UR"d";
+const char tu32a[] = "(UR)d";
+
+const char tl[] = LR"e";
+const char tla[] = "(LR)e";
+
+#define str(x) #x
+const char ts[] = str(u"a" U"b" u8"c");
+const char tsa[] = "u\"a\" U\"b\" u8\"c\"";
+
+/* GCC always uses UTF-16 and UTF-32 for char16_t and char32_t.  */
+#ifndef __STDC_UTF_16__
+#error "__STDC_UTF_16__ not defined"
+#endif
+#ifndef __STDC_UTF_32__
+#error "__STDC_UTF_32__ not defined"
+#endif
+#define xstr(x) str(x)
+const char tm16[] = xstr(__STDC_UTF_16__);
+const char tm16a[] = "1";
+const char tm32[] = xstr(__STDC_UTF_32__);
+const char tm32a[] = "1";
+
+int
+main (void)
+{
+  if (sizeof (su8) != sizeof (su8a)
+      || memcmp (su8, su8a, sizeof (su8)) != 0)
+    abort ();
+  if (sizeof (su16) != sizeof (su16a)
+      || memcmp (su16, su16a, sizeof (su16)) != 0)
+    abort ();
+  if (sizeof (su32) != sizeof (su32a)
+      || memcmp (su32, su32a, sizeof (su32)) != 0)
+    abort ();
+  if (sizeof (tu) != sizeof (tua)
+      || memcmp (tu, tua, sizeof (tu)) != 0)
+    abort ();
+  if (sizeof (tu8) != sizeof (tu8a)
+      || memcmp (tu8, tu8a, sizeof (tu8)) != 0)
+    abort ();
+  if (sizeof (tu16) != sizeof (tu16a)
+      || memcmp (tu16, tu16a, sizeof (tu16)) != 0)
+    abort ();
+  if (sizeof (tu32) != sizeof (tu32a)
+      || memcmp (tu32, tu32a, sizeof (tu32)) != 0)
+    abort ();
+  if (sizeof (tl) != sizeof (tla)
+      || memcmp (tl, tla, sizeof (tl)) != 0)
+    abort ();
+  if (sizeof (ts) != sizeof (tsa)
+      || memcmp (ts, tsa, sizeof (ts)) != 0)
+    abort ();
+  if (sizeof (tm16) != sizeof (tm16a)
+      || memcmp (tm16, tm16a, sizeof (tm16)) != 0)
+    abort ();
+  if (sizeof (tm32) != sizeof (tm32a)
+      || memcmp (tm32, tm32a, sizeof (tm32)) != 0)
+    abort ();
+  if (u'\u0123' != 0x0123)
+    abort ();
+  if (U'\u0456' != 0x0456)
+    abort ();
+#undef u8
+#define u8
+  if (u8'a' != 'a')
+    abort ();
+  exit (0);
+}
Index: gcc/testsuite/gcc.dg/c1x-uni-string-2.c
===================================================================
--- gcc/testsuite/gcc.dg/c1x-uni-string-2.c	(revision 0)
+++ gcc/testsuite/gcc.dg/c1x-uni-string-2.c	(revision 0)
@@ -0,0 +1,8 @@ 
+/* Test Unicode strings in C1X.  Test constraint.  */
+/* { dg-do compile } */
+/* { dg-options "-std=c1x -pedantic-errors" } */
+
+const void *p1 = L"a" u8"b"; /* { dg-error "concatenation" } */
+const void *p2 = L"a" "b" u8"c"; /* { dg-error "concatenation" } */
+const void *p3 = u8"a" L"b"; /* { dg-error "concatenation" } */
+const void *p4 = u8"a" "b" L"c"; /* { dg-error "concatenation" } */
Index: libcpp/include/cpplib.h
===================================================================
--- libcpp/include/cpplib.h	(revision 177847)
+++ libcpp/include/cpplib.h	(working copy)
@@ -1,6 +1,6 @@ 
 /* Definitions for CPP library.
    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
-   2004, 2005, 2007, 2008, 2009, 2010
+   2004, 2005, 2007, 2008, 2009, 2010, 2011
    Free Software Foundation, Inc.
    Written by Per Bothner, 1994-95.
 
@@ -315,6 +315,10 @@  struct cpp_options
   /* Nonzero means process u/U prefix literals (UTF-16/32).  */
   unsigned char uliterals;
 
+  /* Nonzero means process r/R raw strings.  If this is set, uliterals
+     must be set as well.  */
+  unsigned char rliterals;
+
   /* Nonzero means print names of header files (-H).  */
   unsigned char print_include_names;
 
Index: libcpp/init.c
===================================================================
--- libcpp/init.c	(revision 177847)
+++ libcpp/init.c	(working copy)
@@ -1,7 +1,7 @@ 
 /* CPP Library.
    Copyright (C) 1986, 1987, 1989, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
    1999, 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008,
-   2009, 2010 Free Software Foundation, Inc.
+   2009, 2010, 2011 Free Software Foundation, Inc.
    Contributed by Per Bothner, 1994-95.
    Based on CCCP program by Paul Rubin, June 1986
    Adapted to ANSI C, Richard Stallman, Jan 1987
@@ -79,22 +79,23 @@  struct lang_flags
   char cplusplus_comments;
   char digraphs;
   char uliterals;
+  char rliterals;
 };
 
 static const struct lang_flags lang_defaults[] =
-{ /*              c99 c++ xnum xid std  //   digr ulit */
-  /* GNUC89   */  { 0,  0,  1,   0,  0,   1,   1,   0 },
-  /* GNUC99   */  { 1,  0,  1,   0,  0,   1,   1,   1 },
-  /* GNUC1X   */  { 1,  0,  1,   0,  0,   1,   1,   1 },
-  /* STDC89   */  { 0,  0,  0,   0,  1,   0,   0,   0 },
-  /* STDC94   */  { 0,  0,  0,   0,  1,   0,   1,   0 },
-  /* STDC99   */  { 1,  0,  1,   0,  1,   1,   1,   0 },
-  /* STDC1X   */  { 1,  0,  1,   0,  1,   1,   1,   0 },
-  /* GNUCXX   */  { 0,  1,  1,   0,  0,   1,   1,   0 },
-  /* CXX98    */  { 0,  1,  1,   0,  1,   1,   1,   0 },
-  /* GNUCXX0X */  { 1,  1,  1,   0,  0,   1,   1,   1 },
-  /* CXX0X    */  { 1,  1,  1,   0,  1,   1,   1,   1 },
-  /* ASM      */  { 0,  0,  1,   0,  0,   1,   0,   0 }
+{ /*              c99 c++ xnum xid std  //   digr ulit rlit */
+  /* GNUC89   */  { 0,  0,  1,   0,  0,   1,   1,   0,   0 },
+  /* GNUC99   */  { 1,  0,  1,   0,  0,   1,   1,   1,   1 },
+  /* GNUC1X   */  { 1,  0,  1,   0,  0,   1,   1,   1,   1 },
+  /* STDC89   */  { 0,  0,  0,   0,  1,   0,   0,   0,   0 },
+  /* STDC94   */  { 0,  0,  0,   0,  1,   0,   1,   0,   0 },
+  /* STDC99   */  { 1,  0,  1,   0,  1,   1,   1,   0,   0 },
+  /* STDC1X   */  { 1,  0,  1,   0,  1,   1,   1,   1,   0 },
+  /* GNUCXX   */  { 0,  1,  1,   0,  0,   1,   1,   0,   0 },
+  /* CXX98    */  { 0,  1,  1,   0,  1,   1,   1,   0,   0 },
+  /* GNUCXX0X */  { 1,  1,  1,   0,  0,   1,   1,   1,   1 },
+  /* CXX0X    */  { 1,  1,  1,   0,  1,   1,   1,   1,   1 },
+  /* ASM      */  { 0,  0,  1,   0,  0,   1,   0,   0,   0 }
   /* xid should be 1 for GNUC99, STDC99, GNUCXX, CXX98, GNUCXX0X, and
      CXX0X when no longer experimental (when all uses of identifiers
      in the compiler have been audited for correct handling of
@@ -118,6 +119,7 @@  cpp_set_lang (cpp_reader *pfile, enum c_
   CPP_OPTION (pfile, cplusplus_comments)	 = l->cplusplus_comments;
   CPP_OPTION (pfile, digraphs)			 = l->digraphs;
   CPP_OPTION (pfile, uliterals)			 = l->uliterals;
+  CPP_OPTION (pfile, rliterals)			 = l->rliterals;
 }
 
 /* Initialize library global state.  */
@@ -464,6 +466,13 @@  cpp_init_builtins (cpp_reader *pfile, in
   else if (CPP_OPTION (pfile, c99))
     _cpp_define_builtin (pfile, "__STDC_VERSION__ 199901L");
 
+  if (CPP_OPTION (pfile, uliterals)
+      && !CPP_OPTION (pfile, cplusplus))
+    {
+      _cpp_define_builtin (pfile, "__STDC_UTF_16__ 1");
+      _cpp_define_builtin (pfile, "__STDC_UTF_32__ 1");
+    }
+
   if (hosted)
     _cpp_define_builtin (pfile, "__STDC_HOSTED__ 1");
   else
Index: libcpp/lex.c
===================================================================
--- libcpp/lex.c	(revision 177847)
+++ libcpp/lex.c	(working copy)
@@ -1,6 +1,6 @@ 
 /* CPP Library - lexical analysis.
-   Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010
-   Free Software Foundation, Inc.
+   Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010,
+   2011 Free Software Foundation, Inc.
    Contributed by Per Bothner, 1994-95.
    Based on CCCP program by Paul Rubin, June 1986
    Adapted to ANSI C, Richard Stallman, Jan 1987
@@ -2007,18 +2007,20 @@  _cpp_lex_direct (cpp_reader *pfile)
     case 'R':
       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
 	 wide strings or raw strings.  */
-      if (c == 'L' || CPP_OPTION (pfile, uliterals))
+      if (c == 'L' || CPP_OPTION (pfile, rliterals)
+	  || (c != 'R' && CPP_OPTION (pfile, uliterals)))
 	{
 	  if ((*buffer->cur == '\'' && c != 'R')
 	      || *buffer->cur == '"'
 	      || (*buffer->cur == 'R'
 		  && c != 'R'
 		  && buffer->cur[1] == '"'
-		  && CPP_OPTION (pfile, uliterals))
+		  && CPP_OPTION (pfile, rliterals))
 	      || (*buffer->cur == '8'
 		  && c == 'u'
 		  && (buffer->cur[1] == '"'
-		      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'))))
+		      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
+			  && CPP_OPTION (pfile, rliterals)))))
 	    {
 	      lex_string (pfile, result, buffer->cur - 1);
 	      break;