diff mbox series

[v2,28/60] json: Fix \uXXXX for surrogate pairs

Message ID 20180817150559.16243-29-armbru@redhat.com
State New
Headers show
Series json: Fixes, error reporting improvements, cleanups | expand

Commit Message

Markus Armbruster Aug. 17, 2018, 3:05 p.m. UTC
The JSON parser treats each half of a surrogate pair as unpaired
surrogate.  Fix it to recognize surrogate pairs.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
---
 qobject/json-parser.c | 60 ++++++++++++++++++++++++++++---------------
 tests/check-qjson.c   |  3 +--
 2 files changed, 40 insertions(+), 23 deletions(-)

Comments

Eric Blake Aug. 17, 2018, 4:36 p.m. UTC | #1
On 08/17/2018 10:05 AM, Markus Armbruster wrote:
> The JSON parser treats each half of a surrogate pair as unpaired
> surrogate.  Fix it to recognize surrogate pairs.
> 
> Signed-off-by: Markus Armbruster <armbru@redhat.com>
> Reviewed-by: Eric Blake <eblake@redhat.com>

I might have dropped the R-b, to ensure the changes since v1 get 
re-reviewed.

> ---
>   qobject/json-parser.c | 60 ++++++++++++++++++++++++++++---------------
>   tests/check-qjson.c   |  3 +--
>   2 files changed, 40 insertions(+), 23 deletions(-)
> 

> @@ -157,22 +169,28 @@ static QString *parse_string(JSONParserContext *ctxt, JSONToken *token)
>                   qstring_append_chr(str, '\t');
>                   break;
>               case 'u':
> -                cp = 0;
> -                for (i = 0; i < 4; i++) {
> -                    if (!qemu_isxdigit(*ptr)) {
> -                        parse_error(ctxt, token,
> -                                    "invalid hex escape sequence in string");
> -                        goto out;
> +                cp = cvt4hex(ptr);
> +                ptr += 4;
> +
> +                /* handle surrogate pairs */
> +                if (cp >= 0xD800 && cp <= 0xDBFF
> +                    && ptr[0] == '\\' && ptr[1] == 'u') {
> +                    /* leading surrogate followed by \u */
> +                    cp = 0x10000 + ((cp & 0x3FF) << 10);
> +                    trailing = cvt4hex(ptr + 2);
> +                    if (trailing >= 0xDC00 && trailing <= 0xDFFF) {
> +                        /* followed by trailing surrogate */
> +                        cp |= trailing & 0x3FF;
> +                        ptr += 6;
> +                    } else {
> +                        cp = -1; /* invalid */
>                       }
> -                    cp <<= 4;
> -                    cp |= hex2decimal(*ptr);
> -                    ptr++;
>                   }
>   
>                   if (mod_utf8_encode(utf8_buf, sizeof(utf8_buf), cp) < 0) {
>                       parse_error(ctxt, token,
> -                                "\\u%.4s is not a valid Unicode character",
> -                                ptr - 3);
> +                                "%.*s is not a valid Unicode character",
> +                                (int)(ptr - beg), beg);

The error reporting here has indeed been improved over v1.

Reviewed-by: Eric Blake <eblake@redhat.com>
Markus Armbruster Aug. 20, 2018, 8:40 a.m. UTC | #2
Eric Blake <eblake@redhat.com> writes:

> On 08/17/2018 10:05 AM, Markus Armbruster wrote:
>> The JSON parser treats each half of a surrogate pair as unpaired
>> surrogate.  Fix it to recognize surrogate pairs.
>>
>> Signed-off-by: Markus Armbruster <armbru@redhat.com>
>> Reviewed-by: Eric Blake <eblake@redhat.com>
>
> I might have dropped the R-b, to ensure the changes since v1 get
> re-reviewed.

I intended to, but screwed up.  My apologies.

>> ---
>>   qobject/json-parser.c | 60 ++++++++++++++++++++++++++++---------------
>>   tests/check-qjson.c   |  3 +--
>>   2 files changed, 40 insertions(+), 23 deletions(-)
>>
>
>> @@ -157,22 +169,28 @@ static QString *parse_string(JSONParserContext *ctxt, JSONToken *token)
>>                   qstring_append_chr(str, '\t');
>>                   break;
>>               case 'u':
>> -                cp = 0;
>> -                for (i = 0; i < 4; i++) {
>> -                    if (!qemu_isxdigit(*ptr)) {
>> -                        parse_error(ctxt, token,
>> -                                    "invalid hex escape sequence in string");
>> -                        goto out;
>> +                cp = cvt4hex(ptr);
>> +                ptr += 4;
>> +
>> +                /* handle surrogate pairs */
>> +                if (cp >= 0xD800 && cp <= 0xDBFF
>> +                    && ptr[0] == '\\' && ptr[1] == 'u') {
>> +                    /* leading surrogate followed by \u */
>> +                    cp = 0x10000 + ((cp & 0x3FF) << 10);
>> +                    trailing = cvt4hex(ptr + 2);
>> +                    if (trailing >= 0xDC00 && trailing <= 0xDFFF) {
>> +                        /* followed by trailing surrogate */
>> +                        cp |= trailing & 0x3FF;
>> +                        ptr += 6;
>> +                    } else {
>> +                        cp = -1; /* invalid */
>>                       }
>> -                    cp <<= 4;
>> -                    cp |= hex2decimal(*ptr);
>> -                    ptr++;
>>                   }
>>                     if (mod_utf8_encode(utf8_buf, sizeof(utf8_buf),
>> cp) < 0) {
>>                       parse_error(ctxt, token,
>> -                                "\\u%.4s is not a valid Unicode character",
>> -                                ptr - 3);
>> +                                "%.*s is not a valid Unicode character",
>> +                                (int)(ptr - beg), beg);
>
> The error reporting here has indeed been improved over v1.
>
> Reviewed-by: Eric Blake <eblake@redhat.com>

Thanks!
diff mbox series

Patch

diff --git a/qobject/json-parser.c b/qobject/json-parser.c
index 9985d9929b..35c201c53f 100644
--- a/qobject/json-parser.c
+++ b/qobject/json-parser.c
@@ -64,16 +64,27 @@  static void GCC_FMT_ATTR(3, 4) parse_error(JSONParserContext *ctxt,
     error_setg(&ctxt->err, "JSON parse error, %s", message);
 }
 
-static int hex2decimal(char ch)
+static int cvt4hex(const char *s)
 {
-    if (ch >= '0' && ch <= '9') {
-        return (ch - '0');
-    } else if (ch >= 'a' && ch <= 'f') {
-        return 10 + (ch - 'a');
-    } else if (ch >= 'A' && ch <= 'F') {
-        return 10 + (ch - 'A');
+    int cp, i;
+
+    cp = 0;
+    for (i = 0; i < 4; i++) {
+        if (!qemu_isxdigit(s[i])) {
+            return -1;
+        }
+        cp <<= 4;
+        if (s[i] >= '0' && s[i] <= '9') {
+            cp |= s[i] - '0';
+        } else if (s[i] >= 'a' && s[i] <= 'f') {
+            cp |= 10 + s[i] - 'a';
+        } else if (s[i] >= 'A' && s[i] <= 'F') {
+            cp |= 10 + s[i] - 'A';
+        } else {
+            return -1;
+        }
     }
-    abort();
+    return cp;
 }
 
 /**
@@ -115,7 +126,8 @@  static QString *parse_string(JSONParserContext *ctxt, JSONToken *token)
     const char *ptr = token->str;
     QString *str;
     char quote;
-    int cp, i;
+    const char *beg;
+    int cp, trailing;
     char *end;
     ssize_t len;
     char utf8_buf[5];
@@ -127,7 +139,7 @@  static QString *parse_string(JSONParserContext *ctxt, JSONToken *token)
     while (*ptr != quote) {
         assert(*ptr);
         if (*ptr == '\\') {
-            ptr++;
+            beg = ptr++;
             switch (*ptr++) {
             case '"':
                 qstring_append_chr(str, '"');
@@ -157,22 +169,28 @@  static QString *parse_string(JSONParserContext *ctxt, JSONToken *token)
                 qstring_append_chr(str, '\t');
                 break;
             case 'u':
-                cp = 0;
-                for (i = 0; i < 4; i++) {
-                    if (!qemu_isxdigit(*ptr)) {
-                        parse_error(ctxt, token,
-                                    "invalid hex escape sequence in string");
-                        goto out;
+                cp = cvt4hex(ptr);
+                ptr += 4;
+
+                /* handle surrogate pairs */
+                if (cp >= 0xD800 && cp <= 0xDBFF
+                    && ptr[0] == '\\' && ptr[1] == 'u') {
+                    /* leading surrogate followed by \u */
+                    cp = 0x10000 + ((cp & 0x3FF) << 10);
+                    trailing = cvt4hex(ptr + 2);
+                    if (trailing >= 0xDC00 && trailing <= 0xDFFF) {
+                        /* followed by trailing surrogate */
+                        cp |= trailing & 0x3FF;
+                        ptr += 6;
+                    } else {
+                        cp = -1; /* invalid */
                     }
-                    cp <<= 4;
-                    cp |= hex2decimal(*ptr);
-                    ptr++;
                 }
 
                 if (mod_utf8_encode(utf8_buf, sizeof(utf8_buf), cp) < 0) {
                     parse_error(ctxt, token,
-                                "\\u%.4s is not a valid Unicode character",
-                                ptr - 3);
+                                "%.*s is not a valid Unicode character",
+                                (int)(ptr - beg), beg);
                     goto out;
                 }
                 qstring_append(str, utf8_buf);
diff --git a/tests/check-qjson.c b/tests/check-qjson.c
index 5c94c80241..3be32f3fcb 100644
--- a/tests/check-qjson.c
+++ b/tests/check-qjson.c
@@ -63,8 +63,7 @@  static void escaped_string(void)
         { "double byte utf-8 \\u00A2", "double byte utf-8 \xc2\xa2" },
         { "triple byte utf-8 \\u20AC", "triple byte utf-8 \xe2\x82\xac" },
         { "quadruple byte utf-8 \\uD834\\uDD1E", /* U+1D11E */
-          /* bug: want \xF0\x9D\x84\x9E */
-          NULL },
+          "quadruple byte utf-8 \xF0\x9D\x84\x9E" },
         { "\\", NULL },
         { "\\z", NULL },
         { "\\ux", NULL },