Simplify and generalize rust-demangle's unescaping logic.
diff mbox series

Message ID 98de4121-72c8-4a87-bf44-ab917f6c2055@www.fastmail.com
State New
Headers show
Series
  • Simplify and generalize rust-demangle's unescaping logic.
Related show

Commit Message

Eduard-Mihai Burtescu Aug. 14, 2019, 5:22 p.m. UTC
Previously, rust-demangle.c was special-casing a fixed number
of '$uXY$' escapes, but 'XY' can technically be any hex value,
representing some Unicode codepoint.

This patch adds more general support for '$u...$' escapes,
similar to https://github.com/alexcrichton/rustc-demangle/pull/29,
but only for the ASCII subset. More complete Unicode support
may come at a later time, but right now I want to keep it simple.

Escapes that decode to ASCII control codes are considered invalid,
as the Rust compiler should never emit them, and to avoid any
undesirable effects from accidentally outputting a control code.

Additionally, the switch statements, which had one case for each
alphanumeric character, were replaced with if-else chains.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

2019-08-14  Eduard-Mihai Burtescu  <eddyb@lyken.rs>
libiberty/ChangeLog:
	* rust-demangle.c (unescape): Remove.
	(parse_lower_hex_nibble): New function.
	(parse_legacy_escape): New function.
	(is_prefixed_hash): Use parse_lower_hex_nibble.
	(looks_like_rust): Use parse_legacy_escape.
	(rust_demangle_sym): Use parse_legacy_escape.
	* testsuite/rust-demangle-expected: Add 'llv$u6d$' test.

Patch
diff mbox series

diff --git a/libiberty/rust-demangle.c b/libiberty/rust-demangle.c
index 2302db45b6f..da591902db1 100644
--- a/libiberty/rust-demangle.c
+++ b/libiberty/rust-demangle.c
@@ -50,7 +50,7 @@  extern void *memset(void *s, int c, size_t n);
 #include "rust-demangle.h"
 
 
-/* Mangled Rust symbols look like this:
+/* Mangled (legacy) Rust symbols look like this:
      _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
 
    The original symbol is:
@@ -74,16 +74,7 @@  extern void *memset(void *s, int c, size_t n);
    ">"  =>  $GT$
    "("  =>  $LP$
    ")"  =>  $RP$
-   " "  =>  $u20$
-   "\"" =>  $u22$
-   "'"  =>  $u27$
-   "+"  =>  $u2b$
-   ";"  =>  $u3b$
-   "["  =>  $u5b$
-   "]"  =>  $u5d$
-   "{"  =>  $u7b$
-   "}"  =>  $u7d$
-   "~"  =>  $u7e$
+   "\u{XY}"  =>  $uXY$
 
    A double ".." means "::" and a single "." means "-".
 
@@ -95,7 +86,8 @@  static const size_t hash_len = 16;
 
 static int is_prefixed_hash (const char *start);
 static int looks_like_rust (const char *sym, size_t len);
-static int unescape (const char **in, char **out, const char *seq, char value);
+static int parse_lower_hex_nibble (char nibble);
+static char parse_legacy_escape (const char **in);
 
 /* INPUT: sym: symbol that has been through C++ (gnu v3) demangling
 
@@ -149,7 +141,7 @@  is_prefixed_hash (const char *str)
   const char *end;
   char seen[16];
   size_t i;
-  int count;
+  int count, nibble;
 
   if (strncmp (str, hash_prefix, hash_prefix_len))
     return 0;
@@ -157,12 +149,12 @@  is_prefixed_hash (const char *str)
 
   memset (seen, 0, sizeof(seen));
   for (end = str + hash_len; str < end; str++)
-    if (*str >= '0' && *str <= '9')
-      seen[*str - '0'] = 1;
-    else if (*str >= 'a' && *str <= 'f')
-      seen[*str - 'a' + 10] = 1;
-    else
-      return 0;
+    {
+      nibble = parse_lower_hex_nibble (*str);
+      if (nibble < 0)
+        return 0;
+      seen[nibble] = 1;
+    }
 
   /* Count how many distinct digits seen */
   count = 0;
@@ -179,57 +171,17 @@  looks_like_rust (const char *str, size_t len)
   const char *end = str + len;
 
   while (str < end)
-    switch (*str)
-      {
-      case '$':
-	if (!strncmp (str, "$C$", 3))
-	  str += 3;
-	else if (!strncmp (str, "$SP$", 4)
-		 || !strncmp (str, "$BP$", 4)
-		 || !strncmp (str, "$RF$", 4)
-		 || !strncmp (str, "$LT$", 4)
-		 || !strncmp (str, "$GT$", 4)
-		 || !strncmp (str, "$LP$", 4)
-		 || !strncmp (str, "$RP$", 4))
-	  str += 4;
-	else if (!strncmp (str, "$u20$", 5)
-		 || !strncmp (str, "$u22$", 5)
-		 || !strncmp (str, "$u27$", 5)
-		 || !strncmp (str, "$u2b$", 5)
-		 || !strncmp (str, "$u3b$", 5)
-		 || !strncmp (str, "$u5b$", 5)
-		 || !strncmp (str, "$u5d$", 5)
-		 || !strncmp (str, "$u7b$", 5)
-		 || !strncmp (str, "$u7d$", 5)
-		 || !strncmp (str, "$u7e$", 5))
-	  str += 5;
-	else
-	  return 0;
-	break;
-      case '.':
-	/* Do not allow three or more consecutive dots */
-	if (!strncmp (str, "...", 3))
-	  return 0;
-	/* Fall through */
-      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
-      case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
-      case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
-      case 's': case 't': case 'u': case 'v': case 'w': case 'x':
-      case 'y': case 'z':
-      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
-      case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
-      case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
-      case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
-      case 'Y': case 'Z':
-      case '0': case '1': case '2': case '3': case '4': case '5':
-      case '6': case '7': case '8': case '9':
-      case '_':
-      case ':':
-	str++;
-	break;
-      default:
-	return 0;
-      }
+    {
+      if (*str == '$')
+        {
+          if (!parse_legacy_escape (&str))
+            return 0;
+        }
+      else if (*str == '.' || *str == '_' || *str == ':' || ISALNUM (*str))
+        str++;
+      else
+        return 0;
+    }
 
   return 1;
 }
@@ -246,6 +198,7 @@  rust_demangle_sym (char *sym)
   const char *in;
   char *out;
   const char *end;
+  char unescaped;
 
   if (!sym)
     return;
@@ -255,75 +208,49 @@  rust_demangle_sym (char *sym)
   end = sym + strlen (sym) - (hash_prefix_len + hash_len);
 
   while (in < end)
-    switch (*in)
-      {
-      case '$':
-	if (!(unescape (&in, &out, "$C$", ',')
-	      || unescape (&in, &out, "$SP$", '@')
-	      || unescape (&in, &out, "$BP$", '*')
-	      || unescape (&in, &out, "$RF$", '&')
-	      || unescape (&in, &out, "$LT$", '<')
-	      || unescape (&in, &out, "$GT$", '>')
-	      || unescape (&in, &out, "$LP$", '(')
-	      || unescape (&in, &out, "$RP$", ')')
-	      || unescape (&in, &out, "$u20$", ' ')
-	      || unescape (&in, &out, "$u22$", '\"')
-	      || unescape (&in, &out, "$u27$", '\'')
-	      || unescape (&in, &out, "$u2b$", '+')
-	      || unescape (&in, &out, "$u3b$", ';')
-	      || unescape (&in, &out, "$u5b$", '[')
-	      || unescape (&in, &out, "$u5d$", ']')
-	      || unescape (&in, &out, "$u7b$", '{')
-	      || unescape (&in, &out, "$u7d$", '}')
-	      || unescape (&in, &out, "$u7e$", '~'))) {
-	  /* unexpected escape sequence, not looks_like_rust. */
-	  goto fail;
-	}
-	break;
-      case '_':
-	/* If this is the start of a path component and the next
-	   character is an escape sequence, ignore the underscore. The
-	   mangler inserts an underscore to make sure the path
-	   component begins with a XID_Start character. */
-	if ((in == sym || in[-1] == ':') && in[1] == '$')
-	  in++;
-	else
-	  *out++ = *in++;
-	break;
-      case '.':
-	if (in[1] == '.')
-	  {
-	    /* ".." becomes "::" */
-	    *out++ = ':';
-	    *out++ = ':';
-	    in += 2;
-	  }
-	else
-	  {
-	    /* "." becomes "-" */
-	    *out++ = '-';
-	    in++;
-	  }
-	break;
-      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
-      case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
-      case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
-      case 's': case 't': case 'u': case 'v': case 'w': case 'x':
-      case 'y': case 'z':
-      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
-      case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
-      case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
-      case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
-      case 'Y': case 'Z':
-      case '0': case '1': case '2': case '3': case '4': case '5':
-      case '6': case '7': case '8': case '9':
-      case ':':
-	*out++ = *in++;
-	break;
-      default:
-	/* unexpected character in symbol, not looks_like_rust.  */
-	goto fail;
-      }
+    {
+      if (*in == '$')
+        {
+          unescaped = parse_legacy_escape (&in);
+          if (unescaped)
+            *out++ = unescaped;
+          else
+            /* unexpected escape sequence, not looks_like_rust. */
+            goto fail;
+        }
+      else if (*in == '_')
+        {
+          /* If this is the start of a path component and the next
+             character is an escape sequence, ignore the underscore. The
+             mangler inserts an underscore to make sure the path
+             component begins with a XID_Start character. */
+          if ((in == sym || in[-1] == ':') && in[1] == '$')
+            in++;
+          else
+            *out++ = *in++;
+        }
+      else if (*in == '.')
+        {
+          if (in[1] == '.')
+            {
+              /* ".." becomes "::" */
+              *out++ = ':';
+              *out++ = ':';
+              in += 2;
+            }
+          else
+            {
+              /* "." becomes "-" */
+              *out++ = '-';
+              in++;
+            }
+        }
+      else if (*in == ':' || ISALNUM (*in))
+        *out++ = *in++;
+      else
+        /* unexpected character in symbol, not looks_like_rust.  */
+        goto fail;
+    }
   goto done;
 
 fail:
@@ -332,18 +259,78 @@  done:
   *out = '\0';
 }
 
+/* Return a 0x0-0xf value if the char is 0-9a-f, and -1 otherwise. */
 static int
-unescape (const char **in, char **out, const char *seq, char value)
+parse_lower_hex_nibble (char nibble)
 {
-  size_t len = strlen (seq);
+  if ('0' <= nibble && nibble <= '9')
+    return nibble - '0';
+  if ('a' <= nibble && nibble <= 'f')
+    return 0xa + (nibble - 'a');
+  return -1;
+}
 
-  if (strncmp (*in, seq, len))
-    return 0;
+/* Return the unescaped character for a "$...$" escape, or 0 if invalid. */
+static char
+parse_legacy_escape (const char **in)
+{
+  char c = 0;
+  const char *e;
+  size_t escape_len = 0;
+  int lo_nibble = -1, hi_nibble = -1;
 
-  **out = value;
+  if ((*in)[0] != '$')
+    return 0;
 
-  *in += len;
-  *out += 1;
+  e = *in + 1;
+
+  if (e[0] == 'C')
+    {
+      escape_len = 1;
+
+      c = ',';
+    }
+  else
+    {
+      escape_len = 2;
+
+      if (e[0] == 'S' && e[1] == 'P')
+        c = '@';
+      else if (e[0] == 'B' && e[1] == 'P')
+        c = '*';
+      else if (e[0] == 'R' && e[1] == 'F')
+        c = '&';
+      else if (e[0] == 'L' && e[1] == 'T')
+        c = '<';
+      else if (e[0] == 'G' && e[1] == 'T')
+        c = '>';
+      else if (e[0] == 'L' && e[1] == 'P')
+        c = '(';
+      else if (e[0] == 'R' && e[1] == 'P')
+        c = ')';
+      else if (e[0] == 'u')
+        {
+          escape_len = 3;
+
+          hi_nibble = parse_lower_hex_nibble (e[1]);
+          if (hi_nibble < 0)
+            return 0;
+          lo_nibble = parse_lower_hex_nibble (e[2]);
+          if (lo_nibble < 0)
+            return 0;
+
+          /* Only allow non-control ASCII characters. */
+          if (hi_nibble > 7)
+            return 0;
+          c = (hi_nibble << 4) | lo_nibble;
+          if (c < 0x20)
+            return 0;
+        }
+    }
+
+  if (!c || e[escape_len] != '$')
+    return 0;
 
-  return 1;
+  *in += 2 + escape_len;
+  return c;
 }
diff --git a/libiberty/testsuite/rust-demangle-expected b/libiberty/testsuite/rust-demangle-expected
index 0b4288fc37d..c3b03f9f02d 100644
--- a/libiberty/testsuite/rust-demangle-expected
+++ b/libiberty/testsuite/rust-demangle-expected
@@ -159,3 +159,7 @@  _ZN68_$LT$core..nonzero..NonZero$LT$T$GT$$u20$as$u20$core..ops..Deref$GT$5deref1
 --format=rust
 _ZN63_$LT$core..ptr..Unique$LT$T$GT$$u20$as$u20$core..ops..Deref$GT$5deref17h19f2ad4920655e85E
 <core::ptr::Unique<T> as core::ops::Deref>::deref
+#
+--format=rust
+_ZN11issue_609253foo37Foo$LT$issue_60925..llv$u6d$..Foo$GT$3foo17h059a991a004536adE
+issue_60925::foo::Foo<issue_60925::llvm::Foo>::foo