diff mbox series

[3/6] lib/charset: utf8_get() should return error

Message ID 20210227130840.166193-4-xypron.glpk@gmx.de
State Accepted, archived
Commit ddbaff53da5b99563fa371db0b09544e139fdabb
Delegated to: Heinrich Schuchardt
Headers show
Series efi_loader: Unicode output in UEFI applications | expand

Commit Message

Heinrich Schuchardt Feb. 27, 2021, 1:08 p.m. UTC
utf8_get() should return an error if hitting an illegal UTF-8 sequence and
not silently convert the input to a question mark.

Correct utf8_get() and its unit test.

console_read_unicode() will now ignore illegal UTF-8 sequences.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 lib/charset.c     | 25 ++++++++++++++++---------
 test/unicode_ut.c |  7 +++++++
 2 files changed, 23 insertions(+), 9 deletions(-)

--
2.30.0
diff mbox series

Patch

diff --git a/lib/charset.c b/lib/charset.c
index 1345c8f9f0..946d5ee23e 100644
--- a/lib/charset.c
+++ b/lib/charset.c
@@ -32,7 +32,7 @@  static struct capitalization_table capitalization_table[] =
  *
  * @read_u8:	- stream reader
  * @src:	- string buffer passed to stream reader, optional
- * Return:	- Unicode code point
+ * Return:	- Unicode code point, or -1
  */
 static int get_code(u8 (*read_u8)(void *data), void *data)
 {
@@ -78,7 +78,7 @@  static int get_code(u8 (*read_u8)(void *data), void *data)
 	}
 	return ch;
 error:
-	return '?';
+	return -1;
 }

 /**
@@ -120,14 +120,21 @@  static u8 read_console(void *data)

 int console_read_unicode(s32 *code)
 {
-	if (!tstc()) {
-		/* No input available */
-		return 1;
-	}
+	for (;;) {
+		s32 c;

-	/* Read Unicode code */
-	*code = get_code(read_console, NULL);
-	return 0;
+		if (!tstc()) {
+			/* No input available */
+			return 1;
+		}
+
+		/* Read Unicode code */
+		c = get_code(read_console, NULL);
+		if (c > 0) {
+			*code = c;
+			return 0;
+		}
+	}
 }

 s32 utf8_get(const char **src)
diff --git a/test/unicode_ut.c b/test/unicode_ut.c
index 2cc6b5feff..154361aea7 100644
--- a/test/unicode_ut.c
+++ b/test/unicode_ut.c
@@ -52,6 +52,7 @@  static const char d4[] = {0xf0, 0x90, 0x92, 0x8d, 0xf0, 0x90, 0x92, 0x96,
 static const char j1[] = {0x6a, 0x31, 0xa1, 0x6c, 0x00};
 static const char j2[] = {0x6a, 0x32, 0xc3, 0xc3, 0x6c, 0x00};
 static const char j3[] = {0x6a, 0x33, 0xf0, 0x90, 0xf0, 0x00};
+static const char j4[] = {0xa1, 0x00};

 static int unicode_test_u16_strlen(struct unit_test_state *uts)
 {
@@ -165,6 +166,12 @@  static int unicode_test_utf8_get(struct unit_test_state *uts)
 	ut_asserteq(0x0001048d, code);
 	ut_asserteq_ptr(s, d4 + 4);

+	/* Check illegal character */
+	s = j4;
+	code = utf8_get((const char **)&s);
+	ut_asserteq(-1, code);
+	ut_asserteq_ptr(j4 + 1, s);
+
 	return 0;
 }
 UNICODE_TEST(unicode_test_utf8_get);