diff mbox series

[committed] libstdc++: Add "ASCII" as an alias for std::text_encoding::id::ASCII

Message ID 20240131095245.1915153-1-jwakely@redhat.com
State New
Headers show
Series [committed] libstdc++: Add "ASCII" as an alias for std::text_encoding::id::ASCII | expand

Commit Message

Jonathan Wakely Jan. 31, 2024, 9:50 a.m. UTC
SG16 (Unicode and Text Study Group) and LWG are overwhelmingly in favour
of adding this alias, so let's not wait for the issue to get voted into
the working draft.

Tested aarch64-linux. Pushed to trunk.

-- >8 --

As noted in LWG 4043, "ASCII" is not an alias for any known registered
character encoding, so std::text_encoding("ASCII").mib() is currently
id::other rather than id::ASCII. Add "ASCII" to the implementation-defined
superset of aliases for US-ASCII, so that it is recognized as id::ASCII.

libstdc++-v3/ChangeLog:

	* include/bits/text_encoding-data.h: Regenerate.
	* scripts/gen_text_encoding_data.py: Add extra_aliases dict
	containing "ASCII".
	* testsuite/std/text_encoding/cons.cc: Check "ascii" is known.

Co-authored-by: Ewan Higgs <ewan.higgs@gmail.com>
Signed-off-by: Ewan Higgs <ewan.higgs@gmail.com>
---
 .../include/bits/text_encoding-data.h         |  3 ++-
 .../scripts/gen_text_encoding_data.py         | 24 ++++++++++++++++++-
 .../testsuite/std/text_encoding/cons.cc       |  5 ++++
 3 files changed, 30 insertions(+), 2 deletions(-)
diff mbox series

Patch

diff --git a/libstdc++-v3/include/bits/text_encoding-data.h b/libstdc++-v3/include/bits/text_encoding-data.h
index 7ac2e9dc3d9..5041e738d21 100644
--- a/libstdc++-v3/include/bits/text_encoding-data.h
+++ b/libstdc++-v3/include/bits/text_encoding-data.h
@@ -14,6 +14,7 @@ 
   {    3, "IBM367" },
   {    3, "cp367" },
   {    3, "csASCII" },
+  {    3, "ASCII" }, // libstdc++ extension
   {    4, "ISO_8859-1:1987" },
   {    4, "iso-ir-100" },
   {    4, "ISO_8859-1" },
@@ -417,7 +418,7 @@ 
   {  104, "csISO2022CN" },
   {  105, "ISO-2022-CN-EXT" },
   {  105, "csISO2022CNEXT" },
-#define _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET 413
+#define _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET 414
   {  106, "UTF-8" },
   {  106, "csUTF8" },
   {  109, "ISO-8859-13" },
diff --git a/libstdc++-v3/scripts/gen_text_encoding_data.py b/libstdc++-v3/scripts/gen_text_encoding_data.py
index 2d6f3e4077a..f0ebb42d8c2 100755
--- a/libstdc++-v3/scripts/gen_text_encoding_data.py
+++ b/libstdc++-v3/scripts/gen_text_encoding_data.py
@@ -36,6 +36,18 @@  print("#ifndef _GLIBCXX_GET_ENCODING_DATA")
 print('# error "This is not a public header, do not include it directly"')
 print("#endif\n")
 
+# We need to generate a list of initializers of the form { mib, alias }, e.g.,
+# { 3, "US-ASCII" },
+# { 3, "ISO646-US" },
+# { 3, "csASCII" },
+# { 4, "ISO_8859-1:1987" },
+# { 4, "latin1" },
+# The initializers must be sorted by the mib value. The first entry for
+# a given mib must be the primary name for the encoding. Any aliases for
+# the encoding come after the primary name.
+# We also define a macro _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET which is the
+# offset into the list of the mib=106, alias="UTF-8" entry. This is used
+# to optimize the common case, so we don't need to search for "UTF-8".
 
 charsets = {}
 with open(sys.argv[1], newline='') as f:
@@ -52,10 +64,15 @@  with open(sys.argv[1], newline='') as f:
             aliases.remove(name)
         charsets[mib] = [name] + aliases
 
-# Remove "NATS-DANO" and "NATS-DANO-ADD"
+# Remove "NATS-DANO" and "NATS-DANO-ADD" as specified by the C++ standard.
 charsets.pop(33, None)
 charsets.pop(34, None)
 
+# This is not an official IANA alias, but we include it in the
+# implementation-defined superset of aliases for US-ASCII.
+# See also LWG 4043.
+extra_aliases = {3: ["ASCII"]}
+
 count = 0
 for mib in sorted(charsets.keys()):
     names = charsets[mib]
@@ -64,6 +81,11 @@  for mib in sorted(charsets.keys()):
     for name in names:
         print('  {{ {:4}, "{}" }},'.format(mib, name))
     count += len(names)
+    if mib in extra_aliases:
+        names = extra_aliases[mib]
+        for name in names:
+            print('  {{ {:4}, "{}" }}, // libstdc++ extension'.format(mib, name))
+        count += len(names)
 
 # <text_encoding> gives an error if this macro is left defined.
 # Do this last, so that the generated output is not usable unless we reach here.
diff --git a/libstdc++-v3/testsuite/std/text_encoding/cons.cc b/libstdc++-v3/testsuite/std/text_encoding/cons.cc
index b9d93641de4..8fcc2ec8c3b 100644
--- a/libstdc++-v3/testsuite/std/text_encoding/cons.cc
+++ b/libstdc++-v3/testsuite/std/text_encoding/cons.cc
@@ -53,6 +53,11 @@  test_construct_by_name()
   VERIFY( e4.name() == s );
   VERIFY( ! e4.aliases().empty() );
   VERIFY( e4.aliases().front() == "US-ASCII"sv ); // primary name
+
+  s = "ascii";
+  std::text_encoding e5(s);
+  VERIFY( e5.mib() == std::text_encoding::ASCII );
+  VERIFY( e5.name() == s );
 }
 
 constexpr void