[BZ #18441] fix sorting multibyte charsets with an improper locale

Message ID	558EA828.3080106@web.de
State	New
Headers	show Return-Path: <libc-alpha-return-60459-incoming=patchwork.ozlabs.org@sourceware.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:message-id:date:from:mime-version:to:subject :content-type:content-transfer-encoding; q=dns; s=default; b=RKD Orn851LjlhPXhJF+nXnmvD4K1d9kK6WubA2JB7O4S2MisWjjgLxapobK7pszWy5C cjal28XAjBFLRj3Yd0ymBkeOkFA2ExZEHDw98BKweihb3L3l+DL8V3dReGuEb7HA 1SVdrHXZEYSExgKg2wpSHb0z5hXvfusY87ArTO2k= Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk Sender: libc-alpha-owner@sourceware.org Message-ID: <558EA828.3080106@web.de> Date: Sat, 27 Jun 2015 15:42:00 +0200 From: Leonhard Holz <leonhard.holz@web.de> User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Thunderbird/31.7.0 MIME-Version: 1.0 To: libc-alpha@sourceware.org Subject: [PATCH][BZ #18441] fix sorting multibyte charsets with an improper locale Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit

diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c index a39a94f..93ced10 100644 --- a/locale/programs/ld-collate.c +++ b/locale/programs/ld-collate.c @@ -244,9 +244,9 @@ struct locale_collate_t Therefore we keep all relevant input in a list. */ struct locale_collate_t *next; - /* Arrays with heads of the list for each of the leading bytes in + /* Arrays with heads of the list for the leading bytes in the multibyte sequences. */ - struct element_t *mbheads[256]; + struct element_t *mbheads[256 * 256]; /* Arrays with heads of the list for each of the leading bytes in the multibyte sequences. */ @@ -1558,6 +1558,7 @@ collate_finish (struct localedef_t *locale, const struct charmap_t *charmap) struct section_list *sect; int ruleidx; int nr_wide_elems = 0; + bool is_utf8 = strcmp (charmap->code_set_name, "UTF-8") == 0; if (collate == NULL) { @@ -1664,7 +1665,50 @@ collate_finish (struct localedef_t *locale, const struct charmap_t *charmap) struct element_t *lastp = NULL; /* Find the point where to insert in the list. */ - eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]]; + uint16_t index = ((unsigned char *) runp->mbs)[0]; + + /* Special handling of UTF-8: Generate a 2-byte index to mbheads. + Also check the UTF-8 encoding. Keep locale/weight.h in sync. 
*/ + if (is_utf8) + { + if ((index & 0xC0) == 0x80) + { + utf8_error: + WITH_CUR_LOCALE (error_at_line (0, 0, runp->file, runp->line, + _("\ +malformed UTF-8 character in `%s'"), runp->name);); + goto dont_insert; + } + else if ((index & 0xE0) == 0xC0) + { + if (runp->nmbs < 2) + goto utf8_error; + uint16_t byte2 = ((unsigned char *) runp->mbs)[1]; + index = ((index & 0x1F) << 6) | (byte2 & 0x3F); + } + else if ((index & 0xF0) == 0xE0) + { + if (runp->nmbs < 3) + goto utf8_error; + uint16_t byte2 = ((unsigned char *) runp->mbs)[1]; + uint16_t byte3 = ((unsigned char *) runp->mbs)[2]; + index = ((index & 0xF) << 12) | ((byte2 & 0x3F) << 6) | + (byte3 & 0x3F); + } + else if ((index & 0xF8) == 0xF0) + { + if (runp->nmbs < 4) + goto utf8_error; + uint16_t byte3 = ((unsigned char *) runp->mbs)[2]; + uint16_t byte4 = ((unsigned char *) runp->mbs)[3]; + index = ((index & 0xF) << 12) | ((byte3 & 0x3F) << 6) | + (byte4 & 0x3F); + } + else if ((index & 0x80) != 0) + goto utf8_error; + } + + eptr = &collate->mbheads[index]; while (*eptr != NULL) { if ((*eptr)->nmbs < runp->nmbs) @@ -1735,7 +1779,7 @@ symbol `%s' has the same encoding as"), (*eptr)->name); /* Find out whether any of the `mbheads' entries is unset. In this case we use the UNDEFINED entry. 
*/ - for (i = 1; i < 256; ++i) + for (i = 1; i < 256 * 256; ++i) if (collate->mbheads[i] == NULL) { need_undefined = 1; @@ -2108,7 +2152,7 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE); struct locale_file file; size_t ch; - int32_t tablemb[256]; + int32_t tablemb[256 * 256]; struct obstack weightpool; struct obstack extrapool; struct obstack indirectpool; @@ -2186,7 +2230,7 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, if (collate->undefined.used_in_level != 0) output_weight (&weightpool, collate, &collate->undefined); - for (ch = 1; ch < 256; ++ch) + for (ch = 1; ch < 256 * 256; ++ch) if (collate->mbheads[ch]->mbnext == NULL && collate->mbheads[ch]->nmbs <= 1) { @@ -2211,7 +2255,6 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, and add only one index into the weight table. We can find the consecutive entries since they are also consecutive in the list. */ struct element_t *runp = collate->mbheads[ch]; - struct element_t *lastp; assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool))); @@ -2239,7 +2282,7 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, /* Compute how much space we will need. */ added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1 - + 2 * (runp->nmbs - 1)); + + 2 * runp->nmbs); assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool))); obstack_make_room (&extrapool, added); @@ -2262,9 +2305,9 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, /* Now walk backward from here to the beginning. 
*/ curp = runp; - assert (runp->nmbs <= 256); - obstack_1grow_fast (&extrapool, curp->nmbs - 1); - for (i = 1; i < curp->nmbs; ++i) + assert (runp->nmbs <= 255); + obstack_1grow_fast (&extrapool, curp->nmbs); + for (i = 0; i < curp->nmbs; ++i) obstack_1grow_fast (&extrapool, curp->mbs[i]); /* Now find the end of the consecutive sequence and @@ -2284,7 +2327,7 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, /* And add the end byte sequence. Without length this time. */ - for (i = 1; i < curp->nmbs; ++i) + for (i = 0; i < curp->nmbs; ++i) obstack_1grow_fast (&extrapool, curp->mbs[i]); } else @@ -2298,15 +2341,15 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, weightidx = output_weight (&weightpool, collate, runp); added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1 - + runp->nmbs - 1); + + runp->nmbs); assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool))); obstack_make_room (&extrapool, added); obstack_int32_grow_fast (&extrapool, weightidx); - assert (runp->nmbs <= 256); - obstack_1grow_fast (&extrapool, runp->nmbs - 1); + assert (runp->nmbs <= 255); + obstack_1grow_fast (&extrapool, runp->nmbs); - for (i = 1; i < runp->nmbs; ++i) + for (i = 0; i < runp->nmbs; ++i) obstack_1grow_fast (&extrapool, runp->mbs[i]); } @@ -2315,30 +2358,25 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, obstack_1grow_fast (&extrapool, '\0'); /* Next entry. */ - lastp = runp; runp = runp->mbnext; } while (runp != NULL); assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool))); - /* If the final entry in the list is not a single character we - add an UNDEFINED entry here. */ - if (lastp->nmbs != 1) - { - int added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1 + 1); - obstack_make_room (&extrapool, added); + /* Add an UNDEFINED entry at the end of the list. 
*/ + int added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1 + 1); + obstack_make_room (&extrapool, added); - obstack_int32_grow_fast (&extrapool, 0); - /* XXX What rule? We just pick the first. */ - obstack_1grow_fast (&extrapool, 0); - /* Length is zero. */ - obstack_1grow_fast (&extrapool, 0); + obstack_int32_grow_fast (&extrapool, 0); + /* XXX What rule? We just pick the first. */ + obstack_1grow_fast (&extrapool, 0); + /* Length is zero. */ + obstack_1grow_fast (&extrapool, 0); - /* Add alignment bytes if necessary. */ - while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool))) - obstack_1grow_fast (&extrapool, '\0'); - } + /* Add alignment bytes if necessary. */ + while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool))) + obstack_1grow_fast (&extrapool, '\0'); } /* Add padding to the tables if necessary. */ @@ -2346,7 +2384,7 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap, obstack_1grow (&weightpool, 0); /* Now add the four tables. */ - add_locale_uint32_array (&file, (const uint32_t *) tablemb, 256); + add_locale_uint32_array (&file, (const uint32_t *) tablemb, 256 * 256); add_locale_raw_obstack (&file, &weightpool); add_locale_raw_obstack (&file, &extrapool); add_locale_raw_obstack (&file, &indirectpool); diff --git a/locale/weight.h b/locale/weight.h index 721bf7d..86cc9b7 100644 --- a/locale/weight.h +++ b/locale/weight.h @@ -21,24 +21,66 @@ /* Find index of weight. */ static inline int32_t __attribute__ ((always_inline)) -findidx (const int32_t *table, +findidx (uint_fast32_t locale_encoding, + const int32_t *table, const int32_t *indirect, const unsigned char *extra, const unsigned char **cpp, size_t len) { - int_fast32_t i = table[*(*cpp)++]; const unsigned char *cp; const unsigned char *usrc; + uint16_t index = (*cpp)[0]; + /* Special handling of UTF-8: Generate a 2-byte index for table. + This has to be equal to the folding in locale/programs/ld-collate.c: + collate_finish(). 
*/ + if (locale_encoding == __cet_utf8 && (index & 0x80) != 0) + { + if ((index & 0xE0) == 0xC0) + { + if (len < 2) + goto utf8_error; + uint16_t byte2 = (*cpp)[1]; + index = ((index & 0x1F) << 6) | (byte2 & 0x3F); + } + else if ((index & 0xF0) == 0xE0) + { + if (len < 3) + goto utf8_error; + uint16_t byte2 = (*cpp)[1]; + uint16_t byte3 = (*cpp)[2]; + index = ((index & 0xF) << 12) | ((byte2 & 0x3F) << 6) | + (byte3 & 0x3F); + } + else if ((index & 0xF8) == 0xF0) + { + if (len < 4) + goto utf8_error; + uint16_t byte3 = (*cpp)[2]; + uint16_t byte4 = (*cpp)[3]; + index = ((index & 0xF) << 12) | ((byte3 & 0x3F) << 6) | + (byte4 & 0x3F); + } + else + { + utf8_error: + *cpp += 1; + return 0; + } + } + + int_fast32_t i = table[index]; if (i >= 0) - /* This is an index into the weight table. Cool. */ - return i; + { + /* This is an index into the weight table. Cool. */ + *cpp += 1; + return i; + } /* Oh well, more than one sequence starting with this byte. Search for the correct one. */ cp = &extra[-i]; usrc = *cpp; - --len; while (1) { size_t nhere; @@ -57,8 +99,7 @@ findidx (const int32_t *table, /* It is a single character. If it matches we found our index. Note that at the end of each list there is an entry of length zero which represents the single byte - sequence. The first (and here only) byte was tested - already. */ + sequence. */ size_t cnt; for (cnt = 0; cnt < nhere && cnt < len; ++cnt) @@ -68,7 +109,7 @@ findidx (const int32_t *table, if (cnt == nhere) { /* Found it. */ - *cpp += nhere; + *cpp += nhere > 0 ? nhere : 1; return i; } @@ -127,7 +168,7 @@ findidx (const int32_t *table, while (++cnt < nhere); } - *cpp += nhere; + *cpp += nhere > 0 ? nhere : 1; return indirect[-i + offset]; } } diff --git a/locale/weightwc.h b/locale/weightwc.h index 3cd7a69..3781d0d 100644 --- a/locale/weightwc.h +++ b/locale/weightwc.h @@ -21,7 +21,8 @@ /* Find index of weight. 
*/ static inline int32_t __attribute__ ((always_inline)) -findidx (const int32_t *table, +findidx (uint_fast32_t encoding, + const int32_t *table, const int32_t *indirect, const wint_t *extra, const wint_t **cpp, size_t len) diff --git a/posix/fnmatch_loop.c b/posix/fnmatch_loop.c index f46c9df..d25cfb0 100644 --- a/posix/fnmatch_loop.c +++ b/posix/fnmatch_loop.c @@ -389,6 +389,8 @@ FCT (pattern, string, string_end, no_leading_period, flags, ends, alloca_used) const int32_t *indirect; int32_t idx; const UCHAR *cp = (const UCHAR *) &str; + uint_fast32_t encoding = (uint32_t) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_ENCODING_TYPE); # if WIDE_CHAR_VERSION table = (const int32_t *) @@ -410,7 +412,7 @@ FCT (pattern, string, string_end, no_leading_period, flags, ends, alloca_used) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); # endif - idx = FINDIDX (table, indirect, extra, &cp, 1); + idx = FINDIDX (encoding, table, indirect, extra, &cp, 1); if (idx != 0) { /* We found a table entry. Now see whether the @@ -420,7 +422,7 @@ FCT (pattern, string, string_end, no_leading_period, flags, ends, alloca_used) int32_t idx2; const UCHAR *np = (const UCHAR *) n; - idx2 = FINDIDX (table, indirect, extra, + idx2 = FINDIDX (encoding, table, indirect, extra, &np, string_end - n); if (idx2 != 0 && (idx >> 24) == (idx2 >> 24) diff --git a/posix/regcomp.c b/posix/regcomp.c index bf8aa16..65d2d1c 100644 --- a/posix/regcomp.c +++ b/posix/regcomp.c @@ -3426,6 +3426,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); if (nrules != 0) { + uint_fast32_t encoding; const int32_t *table, *indirect; const unsigned char *weights, *extra, *cp; unsigned char char_buf[2]; @@ -3434,6 +3435,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) size_t len; /* Calculate the index for equivalence class. 
*/ cp = name; + encoding = (uint32_t) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_ENCODING_TYPE); table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB); @@ -3441,7 +3443,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); - idx1 = findidx (table, indirect, extra, &cp, -1); + idx1 = findidx (encoding, table, indirect, extra, &cp, -1); if (BE (idx1 == 0 || *cp != '\0', 0)) /* This isn't a valid character. */ return REG_ECOLLATE; @@ -3452,7 +3454,7 @@ build_equiv_class (bitset_t sbcset, const unsigned char *name) { char_buf[0] = ch; cp = char_buf; - idx2 = findidx (table, indirect, extra, &cp, 1); + idx2 = findidx (encoding, table, indirect, extra, &cp, 1); /* idx2 = table[ch]; */ diff --git a/posix/regex_internal.h b/posix/regex_internal.h index 154e969..bece810 100644 --- a/posix/regex_internal.h +++ b/posix/regex_internal.h @@ -743,17 +743,19 @@ re_string_elem_size_at (const re_string_t *pstr, int idx) # ifdef _LIBC const unsigned char *p, *extra; const int32_t *table, *indirect; + uint_fast32_t encoding; uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); if (nrules != 0) { + encoding = (uint32_t) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_ENCODING_TYPE); table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); p = pstr->mbs + idx; - findidx (table, indirect, extra, &p, pstr->len - idx); + findidx (encoding, table, indirect, extra, &p, pstr->len - idx); return p - pstr->mbs - idx; } else diff --git a/posix/regexec.c b/posix/regexec.c index 70cd606..ef7a5c6 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -3869,6 +3869,7 @@ check_node_accept_bytes (const re_dfa_t 
*dfa, int node_idx, if (nrules != 0) { unsigned int in_collseq = 0; + uint_fast32_t encoding; const int32_t *table, *indirect; const unsigned char *weights, *extra; const char *collseqwc; @@ -3919,6 +3920,8 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx, if (cset->nequiv_classes) { const unsigned char *cp = pin; + encoding = (uint32_t) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_ENCODING_TYPE); table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); weights = (const unsigned char *) @@ -3927,7 +3930,8 @@ check_node_accept_bytes (const re_dfa_t *dfa, int node_idx, _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB); indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB); - int32_t idx = findidx (table, indirect, extra, &cp, elem_len); + int32_t idx = findidx (encoding, table, indirect, extra, &cp, + elem_len); if (idx > 0) for (i = 0; i < cset->nequiv_classes; ++i) { diff --git a/string/strcoll_l.c b/string/strcoll_l.c index 8f1225f..668ea9d 100644 --- a/string/strcoll_l.c +++ b/string/strcoll_l.c @@ -78,9 +78,9 @@ typedef struct /* Get next sequence. Traverse the string as required. 
*/ static __always_inline void get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets, - const USTRING_TYPE *weights, const int32_t *table, - const USTRING_TYPE *extra, const int32_t *indirect, - int pass) + const USTRING_TYPE *weights, uint_fast32_t encoding, + const int32_t *table, const USTRING_TYPE *extra, + const int32_t *indirect, int pass) { size_t val = seq->val = 0; int len = seq->len; @@ -124,7 +124,7 @@ get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets, us = seq->back_us; while (i < backw) { - int32_t tmp = findidx (table, indirect, extra, &us, -1); + int32_t tmp = findidx (encoding, table, indirect, extra, &us, -1); idx = tmp & 0xffffff; i++; } @@ -139,7 +139,7 @@ get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets, while (*us != L('\0')) { - int32_t tmp = findidx (table, indirect, extra, &us, -1); + int32_t tmp = findidx (encoding, table, indirect, extra, &us, -1); unsigned char rule = tmp >> 24; prev_idx = idx; idx = tmp & 0xffffff; @@ -345,9 +345,9 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l) while (1) { - get_next_seq (&seq1, nrules, rulesets, weights, table, + get_next_seq (&seq1, nrules, rulesets, weights, encoding, table, extra, indirect, pass); - get_next_seq (&seq2, nrules, rulesets, weights, table, + get_next_seq (&seq2, nrules, rulesets, weights, encoding, table, extra, indirect, pass); /* See whether any or both strings are empty. 
*/ if (seq1.len == 0 || seq2.len == 0) diff --git a/string/strxfrm_l.c b/string/strxfrm_l.c index 8b61ea2..95abc4e 100644 --- a/string/strxfrm_l.c +++ b/string/strxfrm_l.c @@ -53,6 +53,7 @@ typedef struct uint_fast32_t nrules; unsigned char *rulesets; USTRING_TYPE *weights; + uint_fast32_t encoding; int32_t *table; USTRING_TYPE *extra; int32_t *indirect; @@ -100,8 +101,8 @@ static __always_inline size_t find_idx (const USTRING_TYPE **us, int32_t *weight_idx, unsigned char *rule_idx, const locale_data_t *l_data, const int pass) { - int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us, - -1); + int32_t tmp = findidx (l_data->encoding, l_data->table, l_data->indirect, + l_data->extra, us, -1); *rule_idx = tmp >> 24; int32_t idx = tmp & 0xffffff; size_t len = l_data->weights[idx++]; @@ -693,6 +694,8 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, __locale_t l) /* Get the locale data. */ l_data.rulesets = (unsigned char *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string; + l_data.encoding = + current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word; l_data.table = (int32_t *) current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;

[BZ #18441] fix sorting multibyte charsets with an improper locale

Commit Message

Comments

Patch