Message ID | a89cee054d28d43cf8f7e5f171e876326e4af96e.1652994079.git.fweimer@redhat.com |
---|---|
State | New |
Headers | show |
Series | Assume UTF-8 encoding for localedef input files | expand |
On 5/19/22 17:06, Florian Weimer via Libc-alpha wrote: > This will permit reusing the Unicode character processing for > different character encodings, not just the current <U...> encoding. LGTM. Straight forward refactor. Reviewed-by: Carlos O'Donell <carlos@redhat.com> Tested-by: Carlos O'Donell <carlos@redhat.com> > --- > locale/programs/linereader.c | 167 ++++++++++++++++++----------------- > 1 file changed, 85 insertions(+), 82 deletions(-) > > diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c > index d5367e0a1e..f7292f0102 100644 > --- a/locale/programs/linereader.c > +++ b/locale/programs/linereader.c > @@ -596,6 +596,83 @@ get_ident (struct linereader *lr) > return &lr->token; > } > > +/* Process a decoded Unicode codepoint WCH in a string, placing the > + multibyte sequence into LRB. Return false if the character is not > + found in CHARMAP/REPERTOIRE. */ > +static bool > +translate_unicode_codepoint (struct localedef_t *locale, > + const struct charmap_t *charmap, > + const struct repertoire_t *repertoire, > + uint32_t wch, struct lr_buffer *lrb) > +{ > + /* See whether the charmap contains the Uxxxxxxxx names. */ > + char utmp[10]; > + snprintf (utmp, sizeof (utmp), "U%08X", wch); > + struct charseq *seq = charmap_find_value (charmap, utmp, 9); > + > + if (seq == NULL) > + { > + /* No, this isn't the case. Now determine from > + the repertoire the name of the character and > + find it in the charmap. */ > + if (repertoire != NULL) > + { > + const char *symbol = repertoire_find_symbol (repertoire, wch); > + if (symbol != NULL) > + seq = charmap_find_value (charmap, symbol, strlen (symbol)); > + } > + > + if (seq == NULL) > + { > +#ifndef NO_TRANSLITERATION > + /* Transliterate if possible. */ > + if (locale != NULL) > + { > + if ((locale->avail & CTYPE_LOCALE) == 0) > + { > + /* Load the CTYPE data now. */ > + int old_needed = locale->needed; > + > + locale->needed = 0; > + locale = load_locale (LC_CTYPE, locale->name, > + locale->repertoire_name, > + charmap, locale); > + locale->needed = old_needed; > + } > + > + uint32_t *translit; > + if ((locale->avail & CTYPE_LOCALE) != 0 > + && ((translit = find_translit (locale, charmap, wch)) > + != NULL)) > + /* The CTYPE data contains a matching > + transliteration. */ > + { > + for (int i = 0; translit[i] != 0; ++i) > + { > + snprintf (utmp, sizeof (utmp), "U%08X", translit[i]); > + seq = charmap_find_value (charmap, utmp, 9); > + assert (seq != NULL); > + adds (lrb, seq->bytes, seq->nbytes); > + } > + return true; > + } > + } > +#endif /* NO_TRANSLITERATION */ > + > + /* Not a known name. */ > + return false; > + } > + } > + > + if (seq != NULL) > + { > + adds (lrb, seq->bytes, seq->nbytes); > + return true; > + } > + else > + return false; > +} > + > > static struct token * > get_string (struct linereader *lr, const struct charmap_t *charmap, > @@ -635,7 +712,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, > } > else > { > - int illegal_string = 0; > + bool illegal_string = false; > size_t buf2act = 0; > size_t buf2max = 56 * sizeof (uint32_t); > int ch; > @@ -695,7 +772,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, > { > /* <> is no correct name. Ignore it and also signal an > error. */ > - illegal_string = 1; > + illegal_string = true; > continue; > } > > @@ -709,8 +786,6 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, > > if (cp == &lrb.buf[lrb.act]) > { > - char utmp[10]; > - > /* Yes, it is. */ > addc (&lrb, '\0'); > wch = strtoul (lrb.buf + startidx + 1, NULL, 16); > @@ -721,81 +796,9 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, > if (return_widestr) > ADDWC (wch); > > - /* See whether the charmap contains the Uxxxxxxxx names. */ > - snprintf (utmp, sizeof (utmp), "U%08X", wch); > - seq = charmap_find_value (charmap, utmp, 9); > - > - if (seq == NULL) > - { > - /* No, this isn't the case. Now determine from > - the repertoire the name of the character and > - find it in the charmap. */ > - if (repertoire != NULL) > - { > - const char *symbol; > - > - symbol = repertoire_find_symbol (repertoire, wch); > - > - if (symbol != NULL) > - seq = charmap_find_value (charmap, symbol, > - strlen (symbol)); > - } > - > - if (seq == NULL) > - { > -#ifndef NO_TRANSLITERATION > - /* Transliterate if possible. */ > - if (locale != NULL) > - { > - uint32_t *translit; > - > - if ((locale->avail & CTYPE_LOCALE) == 0) > - { > - /* Load the CTYPE data now. */ > - int old_needed = locale->needed; > - > - locale->needed = 0; > - locale = load_locale (LC_CTYPE, > - locale->name, > - locale->repertoire_name, > - charmap, locale); > - locale->needed = old_needed; > - } > - > - if ((locale->avail & CTYPE_LOCALE) != 0 > - && ((translit = find_translit (locale, > - charmap, wch)) > - != NULL)) > - /* The CTYPE data contains a matching > - transliteration. */ > - { > - int i; > - > - for (i = 0; translit[i] != 0; ++i) > - { > - char utmp[10]; > - > - snprintf (utmp, sizeof (utmp), "U%08X", > - translit[i]); > - seq = charmap_find_value (charmap, utmp, > - 9); > - assert (seq != NULL); > - adds (&lrb, seq->bytes, seq->nbytes); > - } > - > - continue; > - } > - } > -#endif /* NO_TRANSLITERATION */ > - > - /* Not a known name. */ > - illegal_string = 1; > - } > - } > - > - if (seq != NULL) > - adds (&lrb, seq->bytes, seq->nbytes); > - > + if (!translate_unicode_codepoint (locale, charmap, > + repertoire, wch, &lrb)) > + illegal_string = true; OK. Refactor. > continue; > } > } > @@ -812,7 +815,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, > /* This name is not in the charmap. */ > lr_error (lr, _("symbol `%.*s' not in charmap"), > (int) (lrb.act - startidx), &lrb.buf[startidx]); > - illegal_string = 1; > + illegal_string = true; > } > > if (return_widestr) > @@ -833,7 +836,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, > /* This name is not in the repertoire map. */ > lr_error (lr, _("symbol `%.*s' not in repertoire map"), > (int) (lrb.act - startidx), &lrb.buf[startidx]); > - illegal_string = 1; > + illegal_string = true; > } > else > ADDWC (wch); > @@ -850,7 +853,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, > if (ch == '\n' || ch == EOF) > { > lr_error (lr, _("unterminated string")); > - illegal_string = 1; > + illegal_string = true; > } > > if (illegal_string) OK.
diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c index d5367e0a1e..f7292f0102 100644 --- a/locale/programs/linereader.c +++ b/locale/programs/linereader.c @@ -596,6 +596,83 @@ get_ident (struct linereader *lr) return &lr->token; } +/* Process a decoded Unicode codepoint WCH in a string, placing the + multibyte sequence into LRB. Return false if the character is not + found in CHARMAP/REPERTOIRE. */ +static bool +translate_unicode_codepoint (struct localedef_t *locale, + const struct charmap_t *charmap, + const struct repertoire_t *repertoire, + uint32_t wch, struct lr_buffer *lrb) +{ + /* See whether the charmap contains the Uxxxxxxxx names. */ + char utmp[10]; + snprintf (utmp, sizeof (utmp), "U%08X", wch); + struct charseq *seq = charmap_find_value (charmap, utmp, 9); + + if (seq == NULL) + { + /* No, this isn't the case. Now determine from + the repertoire the name of the character and + find it in the charmap. */ + if (repertoire != NULL) + { + const char *symbol = repertoire_find_symbol (repertoire, wch); + if (symbol != NULL) + seq = charmap_find_value (charmap, symbol, strlen (symbol)); + } + + if (seq == NULL) + { +#ifndef NO_TRANSLITERATION + /* Transliterate if possible. */ + if (locale != NULL) + { + if ((locale->avail & CTYPE_LOCALE) == 0) + { + /* Load the CTYPE data now. */ + int old_needed = locale->needed; + + locale->needed = 0; + locale = load_locale (LC_CTYPE, locale->name, + locale->repertoire_name, + charmap, locale); + locale->needed = old_needed; + } + + uint32_t *translit; + if ((locale->avail & CTYPE_LOCALE) != 0 + && ((translit = find_translit (locale, charmap, wch)) + != NULL)) + /* The CTYPE data contains a matching + transliteration. */ + { + for (int i = 0; translit[i] != 0; ++i) + { + snprintf (utmp, sizeof (utmp), "U%08X", translit[i]); + seq = charmap_find_value (charmap, utmp, 9); + assert (seq != NULL); + adds (lrb, seq->bytes, seq->nbytes); + } + return true; + } + } +#endif /* NO_TRANSLITERATION */ + + /* Not a known name. */ + return false; + } + } + + if (seq != NULL) + { + adds (lrb, seq->bytes, seq->nbytes); + return true; + } + else + return false; +} + static struct token * get_string (struct linereader *lr, const struct charmap_t *charmap, @@ -635,7 +712,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, } else { - int illegal_string = 0; + bool illegal_string = false; size_t buf2act = 0; size_t buf2max = 56 * sizeof (uint32_t); int ch; @@ -695,7 +772,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, { /* <> is no correct name. Ignore it and also signal an error. */ - illegal_string = 1; + illegal_string = true; continue; } @@ -709,8 +786,6 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, if (cp == &lrb.buf[lrb.act]) { - char utmp[10]; - /* Yes, it is. */ addc (&lrb, '\0'); wch = strtoul (lrb.buf + startidx + 1, NULL, 16); @@ -721,81 +796,9 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, if (return_widestr) ADDWC (wch); - /* See whether the charmap contains the Uxxxxxxxx names. */ - snprintf (utmp, sizeof (utmp), "U%08X", wch); - seq = charmap_find_value (charmap, utmp, 9); - - if (seq == NULL) - { - /* No, this isn't the case. Now determine from - the repertoire the name of the character and - find it in the charmap. */ - if (repertoire != NULL) - { - const char *symbol; - - symbol = repertoire_find_symbol (repertoire, wch); - - if (symbol != NULL) - seq = charmap_find_value (charmap, symbol, - strlen (symbol)); - } - - if (seq == NULL) - { -#ifndef NO_TRANSLITERATION - /* Transliterate if possible. */ - if (locale != NULL) - { - uint32_t *translit; - - if ((locale->avail & CTYPE_LOCALE) == 0) - { - /* Load the CTYPE data now. */ - int old_needed = locale->needed; - - locale->needed = 0; - locale = load_locale (LC_CTYPE, - locale->name, - locale->repertoire_name, - charmap, locale); - locale->needed = old_needed; - } - - if ((locale->avail & CTYPE_LOCALE) != 0 - && ((translit = find_translit (locale, - charmap, wch)) - != NULL)) - /* The CTYPE data contains a matching - transliteration. */ - { - int i; - - for (i = 0; translit[i] != 0; ++i) - { - char utmp[10]; - - snprintf (utmp, sizeof (utmp), "U%08X", - translit[i]); - seq = charmap_find_value (charmap, utmp, - 9); - assert (seq != NULL); - adds (&lrb, seq->bytes, seq->nbytes); - } - - continue; - } - } -#endif /* NO_TRANSLITERATION */ - - /* Not a known name. */ - illegal_string = 1; - } - } - - if (seq != NULL) - adds (&lrb, seq->bytes, seq->nbytes); - + if (!translate_unicode_codepoint (locale, charmap, + repertoire, wch, &lrb)) + illegal_string = true; continue; } } @@ -812,7 +815,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, /* This name is not in the charmap. */ lr_error (lr, _("symbol `%.*s' not in charmap"), (int) (lrb.act - startidx), &lrb.buf[startidx]); - illegal_string = 1; + illegal_string = true; } if (return_widestr) @@ -833,7 +836,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, /* This name is not in the repertoire map. */ lr_error (lr, _("symbol `%.*s' not in repertoire map"), (int) (lrb.act - startidx), &lrb.buf[startidx]); - illegal_string = 1; + illegal_string = true; } else ADDWC (wch); @@ -850,7 +853,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, if (ch == '\n' || ch == EOF) { lr_error (lr, _("unterminated string")); - illegal_string = 1; + illegal_string = true; } if (illegal_string)