Support UTF-8 character constants for C2x
diff mbox series

Message ID alpine.DEB.2.21.1911142018500.10087@digraph.polyomino.org.uk
State New
Headers show
Series
  • Support UTF-8 character constants for C2x
Related show

Commit Message

Joseph Myers Nov. 14, 2019, 8:20 p.m. UTC
C2x adds u8'' character constants to C.  This patch adds the
corresponding GCC support.

Most of the support was already present for C++ and just needed
enabling for C2x.  However, in C2x these constants have type unsigned
char, which required corresponding adjustments in the compiler and the
preprocessor to give them that type for C.

For C, it seems clear to me that having type unsigned char means the
constants are unsigned in the preprocessor (and thus treated as having
type uintmax_t in #if conditionals), so this patch implements that.  I
included a conditional in the libcpp change to avoid affecting
signedness for C++, but I'm not sure if in fact these constants should
also be unsigned in the preprocessor for C++ in which case that
!CPP_OPTION (pfile, cplusplus) conditional would not be needed.

Bootstrapped with no regressions on x86_64-pc-linux-gnu.  Applied to 
mainline.

gcc/c:
2019-11-14  Joseph Myers  <joseph@codesourcery.com>

	* c-parser.c (c_parser_postfix_expression)
	(c_parser_check_literal_zero): Handle CPP_UTF8CHAR.
	* gimple-parser.c (c_parser_gimple_postfix_expression): Likewise.

gcc/c-family:
2019-11-14  Joseph Myers  <joseph@codesourcery.com>

	* c-lex.c (lex_charconst): Make CPP_UTF8CHAR constants unsigned
	char for C.

gcc/testsuite:
2019-11-14  Joseph Myers  <joseph@codesourcery.com>

	* gcc.dg/c11-utf8char-1.c, gcc.dg/c2x-utf8char-1.c,
	gcc.dg/c2x-utf8char-2.c, gcc.dg/c2x-utf8char-3.c,
	gcc.dg/gnu2x-utf8char-1.c: New tests.

libcpp:
2019-11-14  Joseph Myers  <joseph@codesourcery.com>

	* charset.c (narrow_str_to_charconst): Make CPP_UTF8CHAR constants
	unsigned for C.
	* init.c (lang_defaults): Set utf8_char_literals for GNUC2X and
	STDC2X.

Patch
diff mbox series

Index: gcc/c/c-parser.c
===================================================================
--- gcc/c/c-parser.c	(revision 278253)
+++ gcc/c/c-parser.c	(working copy)
@@ -8783,6 +8783,7 @@  c_parser_postfix_expression (c_parser *parser)
     case CPP_CHAR:
     case CPP_CHAR16:
     case CPP_CHAR32:
+    case CPP_UTF8CHAR:
     case CPP_WCHAR:
       expr.value = c_parser_peek_token (parser)->value;
       /* For the purpose of warning when a pointer is compared with
@@ -10459,6 +10460,7 @@  c_parser_check_literal_zero (c_parser *parser, uns
     case CPP_WCHAR:
     case CPP_CHAR16:
     case CPP_CHAR32:
+    case CPP_UTF8CHAR:
       /* If a parameter is literal zero alone, remember it
 	 for -Wmemset-transposed-args warning.  */
       if (integer_zerop (tok->value)
Index: gcc/c/gimple-parser.c
===================================================================
--- gcc/c/gimple-parser.c	(revision 278253)
+++ gcc/c/gimple-parser.c	(working copy)
@@ -1395,6 +1395,7 @@  c_parser_gimple_postfix_expression (gimple_parser
     case CPP_CHAR:
     case CPP_CHAR16:
     case CPP_CHAR32:
+    case CPP_UTF8CHAR:
     case CPP_WCHAR:
       expr.value = c_parser_peek_token (parser)->value;
       set_c_expr_source_range (&expr, tok_range);
Index: gcc/c-family/c-lex.c
===================================================================
--- gcc/c-family/c-lex.c	(revision 278253)
+++ gcc/c-family/c-lex.c	(working copy)
@@ -1376,7 +1376,9 @@  lex_charconst (const cpp_token *token)
     type = char16_type_node;
   else if (token->type == CPP_UTF8CHAR)
     {
-      if (flag_char8_t)
+      if (!c_dialect_cxx ())
+	type = unsigned_char_type_node;
+      else if (flag_char8_t)
         type = char8_type_node;
       else
         type = char_type_node;
Index: gcc/testsuite/gcc.dg/c11-utf8char-1.c
===================================================================
--- gcc/testsuite/gcc.dg/c11-utf8char-1.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/c11-utf8char-1.c	(working copy)
@@ -0,0 +1,7 @@ 
+/* Test C2x UTF-8 characters.  Test not accepted for C11.  */
+/* { dg-do compile } */
+/* { dg-options "-std=c11 -pedantic-errors" } */
+
+#define z(x) 0
+#define u8 z(
+unsigned char a = u8'a');
Index: gcc/testsuite/gcc.dg/c2x-utf8char-1.c
===================================================================
--- gcc/testsuite/gcc.dg/c2x-utf8char-1.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/c2x-utf8char-1.c	(working copy)
@@ -0,0 +1,29 @@ 
+/* Test C2x UTF-8 characters.  Test valid usages.  */
+/* { dg-do compile } */
+/* { dg-options "-std=c2x -pedantic-errors" } */
+
+unsigned char a = u8'a';
+_Static_assert (u8'a' == 97);
+
+unsigned char b = u8'\0';
+_Static_assert (u8'\0' == 0);
+
+unsigned char c = u8'\xff';
+_Static_assert (u8'\xff' == 255);
+
+unsigned char d = u8'\377';
+_Static_assert (u8'\377' == 255);
+
+_Static_assert (sizeof (u8'a') == 1);
+_Static_assert (sizeof (u8'\0') == 1);
+_Static_assert (sizeof (u8'\xff') == 1);
+_Static_assert (sizeof (u8'\377') == 1);
+
+_Static_assert (_Generic (u8'a', unsigned char: 1, default: 2) == 1);
+_Static_assert (_Generic (u8'\0', unsigned char: 1, default: 2) == 1);
+_Static_assert (_Generic (u8'\xff', unsigned char: 1, default: 2) == 1);
+_Static_assert (_Generic (u8'\377', unsigned char: 1, default: 2) == 1);
+
+#if u8'\0' - 1 < 0
+#error "UTF-8 constants not unsigned in preprocessor"
+#endif
Index: gcc/testsuite/gcc.dg/c2x-utf8char-2.c
===================================================================
--- gcc/testsuite/gcc.dg/c2x-utf8char-2.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/c2x-utf8char-2.c	(working copy)
@@ -0,0 +1,8 @@ 
+/* Test C2x UTF-8 characters.  Character values not affected by
+   different execution character set.  */
+/* { dg-do compile } */
+/* { dg-require-iconv "IBM1047" } */
+/* { dg-options "-std=c2x -pedantic-errors -fexec-charset=IBM1047" } */
+
+_Static_assert (u8'a' == 97);
+_Static_assert (u8'a' != (unsigned char) 'a');
Index: gcc/testsuite/gcc.dg/c2x-utf8char-3.c
===================================================================
--- gcc/testsuite/gcc.dg/c2x-utf8char-3.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/c2x-utf8char-3.c	(working copy)
@@ -0,0 +1,8 @@ 
+/* Test C2x UTF-8 characters.  Test errors for invalid code.  */
+/* { dg-do compile } */
+/* { dg-options "-std=c2x -pedantic-errors" } */
+
+unsigned char a = u8''; /* { dg-error "empty character constant" } */
+unsigned char b = u8'ab'; /* { dg-error "character constant too long for its type" } */
+unsigned char c = u8'\u00ff'; /* { dg-error "character constant too long for its type" } */
+unsigned char d = u8'\x100'; /* { dg-error "hex escape sequence out of range" } */
Index: gcc/testsuite/gcc.dg/gnu2x-utf8char-1.c
===================================================================
--- gcc/testsuite/gcc.dg/gnu2x-utf8char-1.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/gnu2x-utf8char-1.c	(working copy)
@@ -0,0 +1,5 @@ 
+/* Test C2x UTF-8 characters.  Test accepted with -std=gnu2x.  */
+/* { dg-do compile } */
+/* { dg-options "-std=gnu2x" } */
+
+#include "c2x-utf8char-1.c"
Index: libcpp/charset.c
===================================================================
--- libcpp/charset.c	(revision 278253)
+++ libcpp/charset.c	(working copy)
@@ -1928,6 +1928,8 @@  narrow_str_to_charconst (cpp_reader *pfile, cpp_st
   /* Multichar constants are of type int and therefore signed.  */
   if (i > 1)
     unsigned_p = 0;
+  else if (type == CPP_UTF8CHAR && !CPP_OPTION (pfile, cplusplus))
+    unsigned_p = 1;
   else
     unsigned_p = CPP_OPTION (pfile, unsigned_char);
 
Index: libcpp/init.c
===================================================================
--- libcpp/init.c	(revision 278253)
+++ libcpp/init.c	(working copy)
@@ -102,13 +102,13 @@  static const struct lang_flags lang_defaults[] =
   /* GNUC99   */  { 1,  0,  1,  1,  0,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0 },
   /* GNUC11   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0 },
   /* GNUC17   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0 },
-  /* GNUC2X   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     1 },
+  /* GNUC2X   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   1,      1,   1,     1 },
   /* STDC89   */  { 0,  0,  0,  0,  0,  1,  0,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0 },
   /* STDC94   */  { 0,  0,  0,  0,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0 },
   /* STDC99   */  { 1,  0,  1,  1,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0 },
   /* STDC11   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0 },
   /* STDC17   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0 },
-  /* STDC2X   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   1,     1 },
+  /* STDC2X   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   1,      0,   1,     1 },
   /* GNUCXX   */  { 0,  1,  1,  1,  0,  0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0 },
   /* CXX98    */  { 0,  1,  0,  1,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   1,     0 },
   /* GNUCXX11 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    0,     0,     0,   0,      1,   1,     0 },