diff mbox series

[COMMITTED,02/25] gccrs: Implement quick-check for Unicode

Message ID 20240207114419.1100894-3-arthur.cohen@embecosm.com
State New
Headers show
Series [COMMITTED,01/25] gccrs: Parse normal functions with `self` parameter correctly | expand

Commit Message

Arthur Cohen Feb. 7, 2024, 11:43 a.m. UTC
From: Raiki Tamura <tamaron1203@gmail.com>

gcc/rust/ChangeLog:

	* rust-lang.cc (run_rust_tests): Add test.
	* rust-system.h: Add <algorithm>.
	* util/make-rust-unicode.py: Output NFC_Quick_Check table.
	* util/rust-codepoint.h (struct Codepoint): Add is_supplementary
	method.
	* util/rust-unicode-data.h: Generated.
	* util/rust-unicode.cc (binary_search_sorted_array): Removed.
	(lookup_cc): Remove namespace.
	(is_alphabetic): Use std::binary_search
	(nfc_quick_check): New function.
	(nfc_normalize): Use nfc_quick_check.
	(is_nfc_qc_maybe): New function.
	(is_nfc_qc_no): New function.
	(rust_nfc_qc_test): New test.
	* util/rust-unicode.h (is_nfc_qc_no): New function.
	(is_nfc_qc_maybe): New function.
	(enum class): New enum class.
	(nfc_quick_check): New function.
	(rust_nfc_qc_test): New test.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
---
 gcc/rust/rust-lang.cc              |   1 +
 gcc/rust/rust-system.h             |   1 +
 gcc/rust/util/make-rust-unicode.py |  39 +++++--
 gcc/rust/util/rust-codepoint.h     |   1 +
 gcc/rust/util/rust-unicode-data.h  | 158 ++++++++++++++++++++++++++++-
 gcc/rust/util/rust-unicode.cc      | 143 +++++++++++++-------------
 gcc/rust/util/rust-unicode.h       |  19 ++++
 7 files changed, 280 insertions(+), 82 deletions(-)
diff mbox series

Patch

diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc
index 8b76ba28ea2..4c2ef108bce 100644
--- a/gcc/rust/rust-lang.cc
+++ b/gcc/rust/rust-lang.cc
@@ -438,6 +438,7 @@  run_rust_tests ()
 {
   // Call tests for the rust frontend here
   rust_input_source_test ();
+  rust_nfc_qc_test ();
   rust_utf8_normalize_test ();
   rust_punycode_encode_test ();
   rust_cfg_parser_test ();
diff --git a/gcc/rust/rust-system.h b/gcc/rust/rust-system.h
index 88d6c1095e6..d8a42181700 100644
--- a/gcc/rust/rust-system.h
+++ b/gcc/rust/rust-system.h
@@ -44,6 +44,7 @@ 
 #include <utility>
 #include <fstream>
 #include <array>
+#include <algorithm>
 
 // Rust frontend requires C++11 minimum, so will have unordered_map and set
 #include <unordered_map>
diff --git a/gcc/rust/util/make-rust-unicode.py b/gcc/rust/util/make-rust-unicode.py
index 5303440fd25..f6f04ebdf5b 100644
--- a/gcc/rust/util/make-rust-unicode.py
+++ b/gcc/rust/util/make-rust-unicode.py
@@ -250,6 +250,30 @@  def write_numeric() -> None:
     print("}};")
 
 
+def write_nfc_qc():
+    print(
+        "const std::array<std::pair<uint32_t, uint32_t>, {}> NFC_QC_NO_RANGES = {{{{".format(
+            len(nfc_qc_no_ranges)
+        )
+    )
+    print("  // clang-format off")
+    for r in nfc_qc_no_ranges:
+        print("  {{{:#06x}, {:#06x}}},".format(r[0], r[1]))
+    print("  // clang-format on")
+    print("}};")
+
+    print(
+        "const std::array<std::pair<uint32_t, uint32_t>, {}> NFC_QC_MAYBE_RANGES = {{{{".format(
+            len(nfc_qc_maybe_ranges)
+        )
+    )
+    print("  // clang-format off")
+    for r in nfc_qc_maybe_ranges:
+        print("  {{{:#06x}, {:#06x}}},".format(r[0], r[1]))
+    print("  // clang-format on")
+    print("}};")
+
+
 def main() -> None:
     if len(sys.argv) != 4:
         print("too few arguments", file=sys.stderr)
@@ -265,13 +289,12 @@  def main() -> None:
     print(COPYRIGHT)
     print()
 
-    print('#include "rust-system.h"')
-    print()
-    print("namespace Rust {")
-    print()
+    print('#include "rust-system.h"\n')
+    print("namespace Rust {\n")
     print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
-    print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints)))
-    print()
+    print(
+        "const uint32_t NUM_NUMERIC_CODEPOINTS = {};\n".format(len(numeric_codepoints))
+    )
 
     write_decomposition()
     print()
@@ -283,8 +306,8 @@  def main() -> None:
     print()
     write_numeric()
     print()
-
-    # TODO: write NFC_QC table
+    write_nfc_qc()
+    print()
 
     print("} // namespace Rust")
 
diff --git a/gcc/rust/util/rust-codepoint.h b/gcc/rust/util/rust-codepoint.h
index a75e99e7c0c..2c5fece9d3e 100644
--- a/gcc/rust/util/rust-codepoint.h
+++ b/gcc/rust/util/rust-codepoint.h
@@ -40,6 +40,7 @@  struct Codepoint
   static Codepoint eof () { return Codepoint (UINT32_MAX); }
   bool is_eof () const { return value == UINT32_MAX; }
   bool is_ascii () const { return value <= MAX_ASCII_CODEPOINT; }
+  bool is_supplementary_character () const { return value > 0xFFFF; }
 
   // Returns a C++ string containing string value of codepoint.
   std::string as_string ();
diff --git a/gcc/rust/util/rust-unicode-data.h b/gcc/rust/util/rust-unicode-data.h
index c42460f4541..e07752a4f63 100644
--- a/gcc/rust/util/rust-unicode-data.h
+++ b/gcc/rust/util/rust-unicode-data.h
@@ -20,7 +20,7 @@ 
 
 namespace Rust {
 
-const uint32_t NUM_ALPHABETIC_RANGES = 1117;
+const uint32_t NUM_ALPHABETIC_RANGES = 1141;
 const uint32_t NUM_NUMERIC_CODEPOINTS = 1831;
 
 const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {
@@ -4167,6 +4167,7 @@  const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
   {0x0bd7, 0x0bd8},
   {0x0c00, 0x0c01},
   {0x0c01, 0x0c04},
+  {0x0c04, 0x0c05},
   {0x0c05, 0x0c0d},
   {0x0c0e, 0x0c11},
   {0x0c12, 0x0c29},
@@ -4202,6 +4203,7 @@  const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
   {0x0ce0, 0x0ce2},
   {0x0ce2, 0x0ce4},
   {0x0cf1, 0x0cf3},
+  {0x0cf3, 0x0cf4},
   {0x0d00, 0x0d02},
   {0x0d02, 0x0d04},
   {0x0d04, 0x0d0d},
@@ -4257,7 +4259,7 @@  const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
   {0x0f49, 0x0f6d},
   {0x0f71, 0x0f7f},
   {0x0f7f, 0x0f80},
-  {0x0f80, 0x0f82},
+  {0x0f80, 0x0f84},
   {0x0f88, 0x0f8d},
   {0x0f8d, 0x0f98},
   {0x0f99, 0x0fbd},
@@ -4758,6 +4760,7 @@  const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
   {0x11071, 0x11073},
   {0x11073, 0x11075},
   {0x11075, 0x11076},
+  {0x11080, 0x11082},
   {0x11082, 0x11083},
   {0x11083, 0x110b0},
   {0x110b0, 0x110b3},
@@ -4794,6 +4797,8 @@  const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
   {0x11234, 0x11235},
   {0x11237, 0x11238},
   {0x1123e, 0x1123f},
+  {0x1123f, 0x11241},
+  {0x11241, 0x11242},
   {0x11280, 0x11287},
   {0x11288, 0x11289},
   {0x1128a, 0x1128e},
@@ -4948,12 +4953,22 @@  const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
   {0x11ee0, 0x11ef3},
   {0x11ef3, 0x11ef5},
   {0x11ef5, 0x11ef7},
+  {0x11f00, 0x11f02},
+  {0x11f02, 0x11f03},
+  {0x11f03, 0x11f04},
+  {0x11f04, 0x11f11},
+  {0x11f12, 0x11f34},
+  {0x11f34, 0x11f36},
+  {0x11f36, 0x11f3b},
+  {0x11f3e, 0x11f40},
+  {0x11f40, 0x11f41},
   {0x11fb0, 0x11fb1},
   {0x12000, 0x1239a},
   {0x12400, 0x1246f},
   {0x12480, 0x12544},
   {0x12f90, 0x12ff1},
-  {0x13000, 0x1342f},
+  {0x13000, 0x13430},
+  {0x13441, 0x13447},
   {0x14400, 0x14647},
   {0x16800, 0x16a39},
   {0x16a40, 0x16a5f},
@@ -4980,7 +4995,9 @@  const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
   {0x1aff5, 0x1affc},
   {0x1affd, 0x1afff},
   {0x1b000, 0x1b123},
+  {0x1b132, 0x1b133},
   {0x1b150, 0x1b153},
+  {0x1b155, 0x1b156},
   {0x1b164, 0x1b168},
   {0x1b170, 0x1b2fc},
   {0x1bc00, 0x1bc6b},
@@ -5021,16 +5038,21 @@  const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
   {0x1df00, 0x1df0a},
   {0x1df0a, 0x1df0b},
   {0x1df0b, 0x1df1f},
+  {0x1df25, 0x1df2b},
   {0x1e000, 0x1e007},
   {0x1e008, 0x1e019},
   {0x1e01b, 0x1e022},
   {0x1e023, 0x1e025},
   {0x1e026, 0x1e02b},
+  {0x1e030, 0x1e06e},
+  {0x1e08f, 0x1e090},
   {0x1e100, 0x1e12d},
   {0x1e137, 0x1e13e},
   {0x1e14e, 0x1e14f},
   {0x1e290, 0x1e2ae},
   {0x1e2c0, 0x1e2ec},
+  {0x1e4d0, 0x1e4eb},
+  {0x1e4eb, 0x1e4ec},
   {0x1e7e0, 0x1e7e7},
   {0x1e7e8, 0x1e7ec},
   {0x1e7ed, 0x1e7ef},
@@ -5076,12 +5098,14 @@  const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
   {0x1f150, 0x1f16a},
   {0x1f170, 0x1f18a},
   {0x20000, 0x2a6e0},
-  {0x2a700, 0x2b739},
+  {0x2a700, 0x2b73a},
   {0x2b740, 0x2b81e},
   {0x2b820, 0x2cea2},
   {0x2ceb0, 0x2ebe1},
+  {0x2ebf0, 0x2ee5e},
   {0x2f800, 0x2fa1e},
   {0x30000, 0x3134b},
+  {0x31350, 0x323b0},
     // clang-format on
   }};
 
@@ -5205,4 +5229,130 @@  const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{
   // clang-format on
 }};
 
+const std::array<std::pair<uint32_t, uint32_t>, 74> NFC_QC_NO_RANGES = {{
+  // clang-format off
+  {0x0340, 0x0342},
+  {0x0343, 0x0345},
+  {0x0374, 0x0375},
+  {0x037e, 0x037f},
+  {0x0387, 0x0388},
+  {0x0958, 0x0960},
+  {0x09dc, 0x09de},
+  {0x09df, 0x09e0},
+  {0x0a33, 0x0a34},
+  {0x0a36, 0x0a37},
+  {0x0a59, 0x0a5c},
+  {0x0a5e, 0x0a5f},
+  {0x0b5c, 0x0b5e},
+  {0x0f43, 0x0f44},
+  {0x0f4d, 0x0f4e},
+  {0x0f52, 0x0f53},
+  {0x0f57, 0x0f58},
+  {0x0f5c, 0x0f5d},
+  {0x0f69, 0x0f6a},
+  {0x0f73, 0x0f74},
+  {0x0f75, 0x0f77},
+  {0x0f78, 0x0f79},
+  {0x0f81, 0x0f82},
+  {0x0f93, 0x0f94},
+  {0x0f9d, 0x0f9e},
+  {0x0fa2, 0x0fa3},
+  {0x0fa7, 0x0fa8},
+  {0x0fac, 0x0fad},
+  {0x0fb9, 0x0fba},
+  {0x1f71, 0x1f72},
+  {0x1f73, 0x1f74},
+  {0x1f75, 0x1f76},
+  {0x1f77, 0x1f78},
+  {0x1f79, 0x1f7a},
+  {0x1f7b, 0x1f7c},
+  {0x1f7d, 0x1f7e},
+  {0x1fbb, 0x1fbc},
+  {0x1fbe, 0x1fbf},
+  {0x1fc9, 0x1fca},
+  {0x1fcb, 0x1fcc},
+  {0x1fd3, 0x1fd4},
+  {0x1fdb, 0x1fdc},
+  {0x1fe3, 0x1fe4},
+  {0x1feb, 0x1fec},
+  {0x1fee, 0x1ff0},
+  {0x1ff9, 0x1ffa},
+  {0x1ffb, 0x1ffc},
+  {0x1ffd, 0x1ffe},
+  {0x2000, 0x2002},
+  {0x2126, 0x2127},
+  {0x212a, 0x212c},
+  {0x2329, 0x232a},
+  {0x232a, 0x232b},
+  {0x2adc, 0x2add},
+  {0xf900, 0xfa0e},
+  {0xfa10, 0xfa11},
+  {0xfa12, 0xfa13},
+  {0xfa15, 0xfa1f},
+  {0xfa20, 0xfa21},
+  {0xfa22, 0xfa23},
+  {0xfa25, 0xfa27},
+  {0xfa2a, 0xfa6e},
+  {0xfa70, 0xfada},
+  {0xfb1d, 0xfb1e},
+  {0xfb1f, 0xfb20},
+  {0xfb2a, 0xfb37},
+  {0xfb38, 0xfb3d},
+  {0xfb3e, 0xfb3f},
+  {0xfb40, 0xfb42},
+  {0xfb43, 0xfb45},
+  {0xfb46, 0xfb4f},
+  {0x1d15e, 0x1d165},
+  {0x1d1bb, 0x1d1c1},
+  {0x2f800, 0x2fa1e},
+  // clang-format on
+}};
+const std::array<std::pair<uint32_t, uint32_t>, 43> NFC_QC_MAYBE_RANGES = {{
+  // clang-format off
+  {0x0300, 0x0305},
+  {0x0306, 0x030d},
+  {0x030f, 0x0310},
+  {0x0311, 0x0312},
+  {0x0313, 0x0315},
+  {0x031b, 0x031c},
+  {0x0323, 0x0329},
+  {0x032d, 0x032f},
+  {0x0330, 0x0332},
+  {0x0338, 0x0339},
+  {0x0342, 0x0343},
+  {0x0345, 0x0346},
+  {0x0653, 0x0656},
+  {0x093c, 0x093d},
+  {0x09be, 0x09bf},
+  {0x09d7, 0x09d8},
+  {0x0b3e, 0x0b3f},
+  {0x0b56, 0x0b57},
+  {0x0b57, 0x0b58},
+  {0x0bbe, 0x0bbf},
+  {0x0bd7, 0x0bd8},
+  {0x0c56, 0x0c57},
+  {0x0cc2, 0x0cc3},
+  {0x0cd5, 0x0cd7},
+  {0x0d3e, 0x0d3f},
+  {0x0d57, 0x0d58},
+  {0x0dca, 0x0dcb},
+  {0x0dcf, 0x0dd0},
+  {0x0ddf, 0x0de0},
+  {0x102e, 0x102f},
+  {0x1161, 0x1176},
+  {0x11a8, 0x11c3},
+  {0x1b35, 0x1b36},
+  {0x3099, 0x309b},
+  {0x110ba, 0x110bb},
+  {0x11127, 0x11128},
+  {0x1133e, 0x1133f},
+  {0x11357, 0x11358},
+  {0x114b0, 0x114b1},
+  {0x114ba, 0x114bb},
+  {0x114bd, 0x114be},
+  {0x115af, 0x115b0},
+  {0x11930, 0x11931},
+  // clang-format on
+}};
+
 } // namespace Rust
diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc
index 999ecb042ca..6bd2db550a1 100644
--- a/gcc/rust/util/rust-unicode.cc
+++ b/gcc/rust/util/rust-unicode.cc
@@ -39,75 +39,27 @@  const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
 const uint32_t N_COUNT = V_COUNT * T_COUNT;
 const uint32_t S_COUNT = L_COUNT * N_COUNT;
 
+// Check if the codepoint is in any of the ranges (half-open intervals [a,b)).
 template <std::size_t SIZE>
-int64_t
+bool
 binary_search_ranges (
-  // FIXME: use binray search function from <algorithm>
   const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
   uint32_t target_cp)
 {
-  if (SIZE == 0)
-    return -1;
-
-  uint32_t low, high, mid;
-  uint32_t start, end;
-  low = 0;
-  high = SIZE - 1;
-  mid = (low + high) / 2;
-  while (high - low > 1)
-    {
-      start = ranges[mid].first;
-      end = ranges[mid].second;
-      if (start <= target_cp && target_cp < end)
-	{
-	  return mid;
-	}
-      else if (end <= target_cp)
-	low = mid + 1;
-      else
-	high = mid - 1;
-      mid = (low + high) / 2;
-    }
-
-  if (ranges[mid].first <= target_cp && target_cp < ranges[mid].second)
-    return mid;
-  else
-    return -1;
-}
-
-template <std::size_t SIZE>
-int64_t
-binary_search_sorted_array (const std::array<uint32_t, SIZE> &array,
-			    uint32_t target)
-{
-  // FIXME: use binray search function from <algorithm>
-  if (SIZE == 0)
-    return -1;
-
-  uint32_t low, high, mid;
-  low = 0;
-  high = SIZE;
-  mid = (low + high) / 2;
-  while (high - low > 1)
-    {
-      if (array[mid] <= target)
-	low = mid;
-      else
-	high = mid;
-      mid = (low + high) / 2;
-    }
-
-  if (array[mid] == target)
-    return mid;
+  auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp,
+			      [] (const std::pair<uint32_t, uint32_t> &a,
+				  uint32_t b) { return a.second <= b; });
+  if (it == ranges.end ())
+    return false;
   else
-    return -1;
+    return it->first <= target_cp && target_cp < it->second;
 }
 
 int
 lookup_cc (codepoint_t c)
 {
-  auto it = Rust::CCC_TABLE.find (c.value);
-  if (it != Rust::CCC_TABLE.end ())
+  auto it = CCC_TABLE.find (c.value);
+  if (it != CCC_TABLE.end ())
     return it->second;
   else
     // Starter. Returns zero.
@@ -289,10 +241,44 @@  recomp (string_t s)
   return buf;
 }
 
+// see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
+QuickCheckResult
+nfc_quick_check (const string_t &s)
+{
+  int last_canonical_class = 0;
+  QuickCheckResult res = QuickCheckResult::YES;
+
+  for (unsigned long i = 0; i < s.size (); i++)
+    {
+      codepoint_t c = s[i];
+
+      if (c.is_supplementary_character ())
+	i++;
+
+      int canonical_class = lookup_cc (c);
+      if (last_canonical_class > canonical_class && canonical_class != 0)
+	return QuickCheckResult::NO;
+
+      if (is_nfc_qc_no (c.value))
+	return QuickCheckResult::NO;
+
+      if (is_nfc_qc_maybe (c.value))
+	res = QuickCheckResult::MAYBE;
+
+      last_canonical_class = canonical_class;
+    }
+  return res;
+}
+
 string_t
-nfc_normalize (string_t s)
+nfc_normalize (const string_t &s)
 {
-  // TODO: Quick Check
+  if (nfc_quick_check (s) == QuickCheckResult::YES)
+    return s;
+
+  // TODO: optimize normalization.
+  // i.e. only normalize a limited area around MAYBE character, instead of
+  // performing complete normlization of the entire string
 
   // decompose
   string_t d = decomp_cano (s);
@@ -312,21 +298,26 @@  Utf8String::nfc_normalize () const
 bool
 is_alphabetic (uint32_t codepoint)
 {
-  int64_t res = binary_search_ranges (ALPHABETIC_RANGES, codepoint);
-  if (res < 0)
-    return false;
-  else
-    return true;
+  return binary_search_ranges (ALPHABETIC_RANGES, codepoint);
 }
 
 bool
 is_numeric (uint32_t codepoint)
 {
-  int64_t res = binary_search_sorted_array (NUMERIC_CODEPOINTS, codepoint);
-  if (res < 0)
-    return false;
-  else
-    return true;
+  return std::binary_search (NUMERIC_CODEPOINTS.begin (),
+			     NUMERIC_CODEPOINTS.end (), codepoint);
+}
+
+bool
+is_nfc_qc_maybe (uint32_t codepoint)
+{
+  return binary_search_ranges (NFC_QC_MAYBE_RANGES, codepoint);
+}
+
+bool
+is_nfc_qc_no (uint32_t codepoint)
+{
+  return binary_search_ranges (NFC_QC_NO_RANGES, codepoint);
 }
 
 bool
@@ -344,6 +335,18 @@  is_ascii_only (const std::string &str)
 
 namespace selftest {
 
+void
+rust_nfc_qc_test ()
+{
+  ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
+	     Rust::QuickCheckResult::YES);
+  ASSERT_EQ (Rust::nfc_quick_check (
+	       {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}),
+	     Rust::QuickCheckResult::MAYBE);
+  ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
+	     Rust::QuickCheckResult::NO);
+}
+
 void
 assert_normalize (const std::vector<Rust::Codepoint> origin,
 		  const std::vector<Rust::Codepoint> expected)
diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h
index 2538436797f..aa7bd8a5632 100644
--- a/gcc/rust/util/rust-unicode.h
+++ b/gcc/rust/util/rust-unicode.h
@@ -68,12 +68,31 @@  is_ascii_only (const std::string &str);
 bool
 is_numeric (uint32_t codepoint);
 
+bool
+is_nfc_qc_no (uint32_t codepoint);
+
+bool
+is_nfc_qc_maybe (uint32_t codepoint);
+
+enum class QuickCheckResult
+{
+  YES,
+  NO,
+  MAYBE
+};
+
+QuickCheckResult
+nfc_quick_check (const std::vector<Codepoint> &s);
+
 } // namespace Rust
 
 #if CHECKING_P
 
 namespace selftest {
 
+void
+rust_nfc_qc_test ();
+
 void
 rust_utf8_normalize_test ();