diff mbox

[BZ,#18441] fix sorting multibyte charsets with an improper locale

Message ID 558EA828.3080106@web.de
State New
Headers show

Commit Message

Leonhard Holz June 27, 2015, 1:42 p.m. UTC
In BZ #18441 sorting a thai text with the en_US.UTF-8 locale causes a performance
regression. The cause of the problem is that

a) en_US.UTF-8 has no informations for thai chars and so always reports a zero
sort weight which causes the comparison to check the whole string instead of
breaking up early and

b) the sequence-to-weight list is partitioned by the first byte of the first
character (TABLEMB); this generates long lists for multibyte UTF-8 characters as
they tend to have an equal starting byte (e.g. all thai chars start with E0).

The approach of the patch is to interprete TABLEMB as a hashtable and find a
better hash key. My first try was to somehow "fold" a multibyte character into one
byte but that worsened the overall performance a lot. Enhancing the table to 2
byte keys works much better while needing a reasonable amount of extra memory.

The patch vastly improves the performance of languages with multibyte chars (see
zh_CN and hi_IN below). A side effect is that languages with one-byte chars get a
bit slower because of the extra check for the first byte while finding the right
sequence in the sequence list . It cannot be avoided since the hash key is not
longer equal to the first byte of the sequence. Tests are ok.

filelist#C			2.52%
filelist#en_US.UTF-8		3.51%
lorem_ipsum#vi_VN.UTF-8		1.21%
lorem_ipsum#ar_SA.UTF-8		11.45%
lorem_ipsum#en_US.UTF-8		3.12%
lorem_ipsum#zh_CN.UTF-8		-85.55%
lorem_ipsum#cs_CZ.UTF-8		0.14%
lorem_ipsum#en_GB.UTF-8		1.83%
lorem_ipsum#da_DK.UTF-8		6.85%
lorem_ipsum#pl_PL.UTF-8		4.94%
lorem_ipsum#fr_FR.UTF-8		5.22%
lorem_ipsum#pt_PT.UTF-8		4.62%
lorem_ipsum#el_GR.UTF-8		-9.20%
lorem_ipsum#ru_RU.UTF-8		9.22%
lorem_ipsum#iw_IL.UTF-8		10.54%
lorem_ipsum#es_ES.UTF-8		8.55%
lorem_ipsum#hi_IN.UTF-8		-97.85%
lorem_ipsum#sv_SE.UTF-8		11.47%
lorem_ipsum#hu_HU.UTF-8		15.75%
lorem_ipsum#tr_TR.UTF-8		4.59%
lorem_ipsum#is_IS.UTF-8		11.70%
lorem_ipsum#it_IT.UTF-8		5.80%
lorem_ipsum#sr_RS.UTF-8		5.45%
lorem_ipsum#ja_JP.UTF-8		-14.34%
wikipedia-th#en_US.UTF-8	-99.39%


	* locale/programs/ld-collate.c (struct locale_collate_t):
	Expand mbheads array from 256 to 16384 entries.
	(collate_finish): Generate 2-byte key for mbheads if UTF-8 locale.
	(collate_output): Output larger table and sequences including first byte.
	* locale/weight.h (findidx): Use 2-byte key for table if UTF-8 locale.
	* locale/weightwc.h (findidx): Accept encoding parameter, not used.
	* posix/fnmatch_loop.c (FCT): Call findidx with encoding parameter.
	* posix/regcomp.c (build_equiv_class): Likewise.
	* posix/regex_internal.h (re_string_elem_size_at): Likewise.
	* posix/regexec.c (check_node_accept_bytes): Likewise.
	* string/strcoll_l.c (get_next_seq): Likewise.
	(STRCOLL): Call get_next_seq with encoding parameter.
	* string/strxfrm_l.c (find_idx): Call findidx with encoding parameter.
	(STRXFRM): Call find_idx with encoding parameter.


   l_data.weights = (USTRING_TYPE *)
@@ -721,8 +724,8 @@ STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n,
__locale_t l)

   do
     {
-      int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
-			     -1);
+      int32_t tmp = findidx (l_data.encoding, l_data.table, l_data.indirect,
+			     l_data.extra, &cur, -1);
       rulearr[idxmax] = tmp >> 24;
       idxarr[idxmax] = tmp & 0xffffff;

Comments

Ondřej Bílka June 27, 2015, 7:04 p.m. UTC | #1
On Sat, Jun 27, 2015 at 03:42:00PM +0200, Leonhard Holz wrote:
> In BZ #18441 sorting a thai text with the en_US.UTF-8 locale causes a performance
> regression. The cause of the problem is that
> 
> a) en_US.UTF-8 has no informations for thai chars and so always reports a zero
> sort weight which causes the comparison to check the whole string instead of
> breaking up early and
> 
> b) the sequence-to-weight list is partitioned by the first byte of the first
> character (TABLEMB); this generates long lists for multibyte UTF-8 characters as
> they tend to have an equal starting byte (e.g. all thai chars start with E0).
> 
> The approach of the patch is to interprete TABLEMB as a hashtable and find a
> better hash key. My first try was to somehow "fold" a multibyte character into one
> byte but that worsened the overall performance a lot. Enhancing the table to 2
> byte keys works much better while needing a reasonable amount of extra memory.
> 
> The patch vastly improves the performance of languages with multibyte chars (see
> zh_CN and hi_IN below). A side effect is that languages with one-byte chars get a
> bit slower because of the extra check for the first byte while finding the right
> sequence in the sequence list . It cannot be avoided since the hash key is not
> longer equal to the first byte of the sequence. Tests are ok.
>
First could you check if utf overhead is noticable or not? Compile that
again with passing locale replaced by hardcoded utf and see if there is
noticable difference. It makes sense to force gcc create utf8 variant of
strcoll and other and do only one check at start of function. Hack above
would tell you impact.

> +	      else if ((index & 0xE0) == 0xC0)
> +		{
> +		  if (runp->nmbs < 2)
> +		    goto utf8_error;
> +		  uint16_t byte2 = ((unsigned char *) runp->mbs)[1];
> +		  index = ((index & 0x1F) << 6) | (byte2 & 0x3F);
> +		}
Masking makes no sense for hash function as it doesn't matter if there
is 0 or zero. Here problably just use
*((uint16_t *) runp->mbs)

> +	      else if ((index & 0xF0) == 0xE0)
> +		{
> +		  if (runp->nmbs < 3)
> +		    goto utf8_error;
> +		  uint16_t byte2 = ((unsigned char *) runp->mbs)[1];
> +		  uint16_t byte3 = ((unsigned char *) runp->mbs)[2];
> +		  index = ((index & 0xF) << 12) | ((byte2 & 0x3F) << 6) |
> +			  (byte3 & 0x3F)
Again here ands are unnecessary, when you use xor instead or. 
(index << 12) ^ *((uint16_t *) (runp->mbs+1));
> +		}
> +	      else if ((index & 0xF8) == 0xF0)
> +		{
> +		  if (runp->nmbs < 4)
> +		    goto utf8_error;
> +		  uint16_t byte3 = ((unsigned char *) runp->mbs)[2];
> +		  uint16_t byte4 = ((unsigned char *) runp->mbs)[3];
> +		  index = ((index & 0xF) << 12) | ((byte3 & 0x3F) << 6) |
> +			  (byte4 & 0x3F)
As you ignore second byte value you could use 
(index << 12) ^ *((uint16_t *) (runp->mbs+2));
Andreas Schwab June 27, 2015, 8:07 p.m. UTC | #2
Ondřej Bílka <neleai@seznam.cz> writes:

> *((uint16_t *) runp->mbs)
> (index << 12) ^ *((uint16_t *) (runp->mbs+1));

Either of these will be unaligned.

Andreas.
Ondřej Bílka June 27, 2015, 8:56 p.m. UTC | #3
On Sat, Jun 27, 2015 at 10:07:17PM +0200, Andreas Schwab wrote:
> Ondřej Bílka <neleai@seznam.cz> writes:
> 
> > *((uint16_t *) runp->mbs)
> > (index << 12) ^ *((uint16_t *) (runp->mbs+1));
> 
> Either of these will be unaligned.
> 
Which as its used to index array of size 256 * 256 is what we want.

But now I see that I didn't wrote what I meant as second could be
larger, If you want to add index into hash function I would add it as
below as high bits are unlikely to change, like following

 (index << 4) ^ *((uint16_t *) (runp->mbs+1))
Andreas Schwab June 27, 2015, 9:50 p.m. UTC | #4
Ondřej Bílka <neleai@seznam.cz> writes:

> On Sat, Jun 27, 2015 at 10:07:17PM +0200, Andreas Schwab wrote:
>> Ondřej Bílka <neleai@seznam.cz> writes:
>> 
>> > *((uint16_t *) runp->mbs)
>> > (index << 12) ^ *((uint16_t *) (runp->mbs+1));
>> 
>> Either of these will be unaligned.
>> 
> Which as its used to index array of size 256 * 256 is what we want.

This doesn't make any sense.

Andreas.
Ondřej Bílka June 28, 2015, 6:18 a.m. UTC | #5
On Sat, Jun 27, 2015 at 11:50:10PM +0200, Andreas Schwab wrote:
> Ondřej Bílka <neleai@seznam.cz> writes:
> 
> > On Sat, Jun 27, 2015 at 10:07:17PM +0200, Andreas Schwab wrote:
> >> Ondřej Bílka <neleai@seznam.cz> writes:
> >> 
> >> > *((uint16_t *) runp->mbs)
> >> > (index << 12) ^ *((uint16_t *) (runp->mbs+1));
> >> 
> >> Either of these will be unaligned.
> >> 
> > Which as its used to index array of size 256 * 256 is what we want.
> 
> This doesn't make any sense.
> 
Yes, was writing that while bit drunk so I somewhat read that as unsigned.
These should be guarded with_STRING_ARCH_unaligned.
Ondřej Bílka June 28, 2015, 8:57 a.m. UTC | #6
On Sat, Jun 27, 2015 at 03:42:00PM +0200, Leonhard Holz wrote:
> In BZ #18441 sorting a thai text with the en_US.UTF-8 locale causes a performance
> regression. The cause of the problem is that
> 
> a) en_US.UTF-8 has no informations for thai chars and so always reports a zero
> sort weight which causes the comparison to check the whole string instead of
> breaking up early and
> 
> b) the sequence-to-weight list is partitioned by the first byte of the first
> character (TABLEMB); this generates long lists for multibyte UTF-8 characters as
> they tend to have an equal starting byte (e.g. all thai chars start with E0).
> 
> The approach of the patch is to interprete TABLEMB as a hashtable and find a
> better hash key. My first try was to somehow "fold" a multibyte character into one
> byte but that worsened the overall performance a lot. Enhancing the table to 2
> byte keys works much better while needing a reasonable amount of extra memory.
> 
> The patch vastly improves the performance of languages with multibyte chars (see
> zh_CN and hi_IN below). A side effect is that languages with one-byte chars get a
> bit slower because of the extra check for the first byte while finding the right
> sequence in the sequence list . It cannot be avoided since the hash key is not
> longer equal to the first byte of the sequence. Tests are ok.
> 
No, you can. With this patch you partially fixed a problem that
datastructures used in findidx are terrible.

Just use a trie. You would avoid slow lists by precomputing it so you
could do following:

struct trie
{
  int32_t index[256];
}

int32_t
findidx (unsigned char *p, struct trie *start)
{
  struct trie *t = start;
  while (1)
    {
      unsigned char c = *p++;
      if (t->index[c] >= 0)
        return t->index[c];
      else
        t = start - t->index[c];
    }
}


Also you don't need table access at all for utf8 unless somebody adds
locale with exotic characters. Just use utf8 codepoints. Here is how
calculate them for 1-3 byte sequences for first 65536 indices and for
4-byte use trie.


  if (p[0] < 0x80) {
    return p[0];
  } else if (p[0] < 0xE0) {
    /* 2-byte sequence */
    /* No need to check size due trailing 0.  */
    return (p[0] << 6) + p[1] - 0x3080;
  } else if (code_unit1 < 0xF0) {
    /* 3-byte sequence */
    if (size < 3) goto error;
    return (p[0] << 12) + (p[1] << 6) + p[2] - 0xE2080;
  }
Andreas Schwab June 28, 2015, 11:59 a.m. UTC | #7
Ondřej Bílka <neleai@seznam.cz> writes:

> On Sat, Jun 27, 2015 at 11:50:10PM +0200, Andreas Schwab wrote:
>> Ondřej Bílka <neleai@seznam.cz> writes:
>> 
>> > On Sat, Jun 27, 2015 at 10:07:17PM +0200, Andreas Schwab wrote:
>> >> Ondřej Bílka <neleai@seznam.cz> writes:
>> >> 
>> >> > *((uint16_t *) runp->mbs)
>> >> > (index << 12) ^ *((uint16_t *) (runp->mbs+1));
>> >> 
>> >> Either of these will be unaligned.
>> >> 
>> > Which as its used to index array of size 256 * 256 is what we want.
>> 
>> This doesn't make any sense.
>> 
> Yes, was writing that while bit drunk so I somewhat read that as unsigned.
> These should be guarded with_STRING_ARCH_unaligned.

localedef doesn't need micro-optimizations.

Andreas.
Ondřej Bílka June 28, 2015, 12:25 p.m. UTC | #8
On Sun, Jun 28, 2015 at 01:59:16PM +0200, Andreas Schwab wrote:
> Ondřej Bílka <neleai@seznam.cz> writes:
> 
> > On Sat, Jun 27, 2015 at 11:50:10PM +0200, Andreas Schwab wrote:
> >> Ondřej Bílka <neleai@seznam.cz> writes:
> >> 
> >> > On Sat, Jun 27, 2015 at 10:07:17PM +0200, Andreas Schwab wrote:
> >> >> Ondřej Bílka <neleai@seznam.cz> writes:
> >> >> 
> >> >> > *((uint16_t *) runp->mbs)
> >> >> > (index << 12) ^ *((uint16_t *) (runp->mbs+1));
> >> >> 
> >> >> Either of these will be unaligned.
> >> >> 
> >> > Which as its used to index array of size 256 * 256 is what we want.
> >> 
> >> This doesn't make any sense.
> >> 
> > Yes, was writing that while bit drunk so I somewhat read that as unsigned.
> > These should be guarded with_STRING_ARCH_unaligned.
> 
> localedef doesn't need micro-optimizations.
>
but strcoll does and code is same.
Rich Felker June 29, 2015, 3:58 p.m. UTC | #9
On Sun, Jun 28, 2015 at 10:57:53AM +0200, Ondřej Bílka wrote:
> Also you don't need table access at all for utf8 unless somebody adds
> locale with exotic characters. Just use utf8 codepoints. Here is how
> calculate them for 1-3 byte sequences for first 65536 indices and for
> 4-byte use trie.
> 
> 
>   if (p[0] < 0x80) {
>     return p[0];
>   } else if (p[0] < 0xE0) {
>     /* 2-byte sequence */
>     /* No need to check size due trailing 0.  */
>     return (p[0] << 6) + p[1] - 0x3080;
>   } else if (code_unit1 < 0xF0) {
>     /* 3-byte sequence */
>     if (size < 3) goto error;
>     return (p[0] << 12) + (p[1] << 6) + p[2] - 0xE2080;
>   } 

Maybe in the special case where you intend to use this it doesn't
matter, but in general this is unsafe because it assumes only legal
sequences appear.

Is there a reason you even need to get codepoints? One of the
properties of UTF-8 is that codepoints sort in the same order as code
units.

Rich
Ondřej Bílka June 29, 2015, 5:57 p.m. UTC | #10
On Mon, Jun 29, 2015 at 11:58:08AM -0400, Rich Felker wrote:
> On Sun, Jun 28, 2015 at 10:57:53AM +0200, Ondřej Bílka wrote:
> > Also you don't need table access at all for utf8 unless somebody adds
> > locale with exotic characters. Just use utf8 codepoints. Here is how
> > calculate them for 1-3 byte sequences for first 65536 indices and for
> > 4-byte use trie.
> > 
> > 
> >   if (p[0] < 0x80) {
> >     return p[0];
> >   } else if (p[0] < 0xE0) {
> >     /* 2-byte sequence */
> >     /* No need to check size due trailing 0.  */
> >     return (p[0] << 6) + p[1] - 0x3080;
> >   } else if (code_unit1 < 0xF0) {
> >     /* 3-byte sequence */
> >     if (size < 3) goto error;
> >     return (p[0] << 12) + (p[1] << 6) + p[2] - 0xE2080;
> >   } 
> 
> Maybe in the special case where you intend to use this it doesn't
> matter, but in general this is unsafe because it assumes only legal
> sequences appear.
>
And you couldn't spend five minutes to check that and cried wolf?
This is elementary as findidx doesn't return error, that makes result on
illegal sequence undefined so it doesn't matter. It couldn't crash, just
produces different invalid output.

Also callers fnmatch, regexec and strcoll dont return error on invalid
sequence. 
Anyway it couldn't be reliably detected without big slowdown in strcoll
as strdiff skips initial possibly invalid bytes.
 
> Is there a reason you even need to get codepoints? One of the
> properties of UTF-8 is that codepoints sort in the same order as code
> units.
> 
No special need for codepoint, but these are nice perfect hash function.
As you have 16777216 three byte sequences but just 65536 codepoints its
really noticable.
Leonhard Holz June 30, 2015, 5:52 a.m. UTC | #11
> Just use a trie. You would avoid slow lists by precomputing it so you
> could do following:
> 
> struct trie
> {
>   int32_t index[256];
> }
> 
> int32_t
> findidx (unsigned char *p, struct trie *start)
> {
>   struct trie *t = start;
>   while (1)
>     {
>       unsigned char c = *p++;
>       if (t->index[c] >= 0)
>         return t->index[c];
>       else
>         t = start - t->index[c];
>     }
> }
> 
> 

I agree, but that goes way beyond the scope of the patch. I suggest to use the
patch given to solve the regression problem and refactor the datastructure as a
follow-up.

> Also you don't need table access at all for utf8 unless somebody adds
> locale with exotic characters. Just use utf8 codepoints. Here is how
> calculate them for 1-3 byte sequences for first 65536 indices and for
> 4-byte use trie.
> 
> 
>   if (p[0] < 0x80) {
>     return p[0];
>   } else if (p[0] < 0xE0) {
>     /* 2-byte sequence */
>     /* No need to check size due trailing 0.  */
>     return (p[0] << 6) + p[1] - 0x3080;
>   } else if (code_unit1 < 0xF0) {
>     /* 3-byte sequence */
>     if (size < 3) goto error;
>     return (p[0] << 12) + (p[1] << 6) + p[2] - 0xE2080;
>   } 
> 
> 

Actually the code does convert the UTF-8 sequences to codepoints and uses them as
hashkey while ignoring the upper bytes in the four-bytes-UTF-8 case:

+      if ((index & 0xE0) == 0xC0)
+	{
+	  if (len < 2)
+	    goto utf8_error;
+	  uint16_t byte2 = (*cpp)[1];
+	  index = ((index & 0x1F) << 6) | (byte2 & 0x3F);
+	}
+      else if ((index & 0xF0) == 0xE0)
+	{
+	  if (len < 3)
+	    goto utf8_error;
+	  uint16_t byte2 = (*cpp)[1];
+	  uint16_t byte3 = (*cpp)[2];
+	  index = ((index & 0xF) << 12) | ((byte2 & 0x3F) << 6) |
+		  (byte3 & 0x3F);
+	}
+      else if ((index & 0xF8) == 0xF0)
+	{
+	  if (len < 4)
+	    goto utf8_error;
+	  uint16_t byte3 = (*cpp)[2];
+	  uint16_t byte4 = (*cpp)[3];
+	  index = ((index & 0xF) << 12) | ((byte3 & 0x3F) << 6) |
+		  (byte4 & 0x3F);
+	}
+      else
+	{
+	utf8_error:
+	  *cpp += 1;
+	  return 0;
+	}
+    }

Maybe my conversion method is a bit "academic" but I think it is a reliable one.
Ondřej Bílka June 30, 2015, 9:31 a.m. UTC | #12
On Tue, Jun 30, 2015 at 07:52:21AM +0200, Leonhard Holz wrote:
> > Just use a trie. You would avoid slow lists by precomputing it so you
> > could do following:
> > 
> > struct trie
> > {
> >   int32_t index[256];
> > }
> > 
> > int32_t
> > findidx (unsigned char *p, struct trie *start)
> > {
> >   struct trie *t = start;
> >   while (1)
> >     {
> >       unsigned char c = *p++;
> >       if (t->index[c] >= 0)
> >         return t->index[c];
> >       else
> >         t = start - t->index[c];
> >     }
> > }
> > 
> > 
> 
> I agree, but that goes way beyond the scope of the patch. I suggest to use the
> patch given to solve the regression problem and refactor the datastructure as a
> follow-up.
>
Thats certainly possible but you have problem that with new
datastructure this patch becomes reduntant as you delete all relevant
code and replace it with better. So it would be better to just apply
next step and skip this intermediate one.
 
> > Also you don't need table access at all for utf8 unless somebody adds
> > locale with exotic characters. Just use utf8 codepoints. Here is how
> > calculate them for 1-3 byte sequences for first 65536 indices and for
> > 4-byte use trie.
> > 
> > 
> >   if (p[0] < 0x80) {
> >     return p[0];
> >   } else if (p[0] < 0xE0) {
> >     /* 2-byte sequence */
> >     /* No need to check size due trailing 0.  */
> >     return (p[0] << 6) + p[1] - 0x3080;
> >   } else if (code_unit1 < 0xF0) {
> >     /* 3-byte sequence */
> >     if (size < 3) goto error;
> >     return (p[0] << 12) + (p[1] << 6) + p[2] - 0xE2080;
> >   } 
> > 
> > 
> 
> Actually the code does convert the UTF-8 sequences to codepoints and uses them as
> hashkey while ignoring the upper bytes in the four-bytes-UTF-8 case:
> 
> +      if ((index & 0xE0) == 0xC0)
> +	{
> +	  if (len < 2)
> +	    goto utf8_error;
> +	  uint16_t byte2 = (*cpp)[1];
> +	  index = ((index & 0x1F) << 6) | (byte2 & 0x3F);
> +	}
> +      else if ((index & 0xF0) == 0xE0)
> +	{
> +	  if (len < 3)
> +	    goto utf8_error;
> +	  uint16_t byte2 = (*cpp)[1];
> +	  uint16_t byte3 = (*cpp)[2];
> +	  index = ((index & 0xF) << 12) | ((byte2 & 0x3F) << 6) |
> +		  (byte3 & 0x3F);
> +	}
> +      else if ((index & 0xF8) == 0xF0)
> +	{
> +	  if (len < 4)
> +	    goto utf8_error;
> +	  uint16_t byte3 = (*cpp)[2];
> +	  uint16_t byte4 = (*cpp)[3];
> +	  index = ((index & 0xF) << 12) | ((byte3 & 0x3F) << 6) |
> +		  (byte4 & 0x3F);
> +	}
> +      else
> +	{
> +	utf8_error:
> +	  *cpp += 1;
> +	  return 0;
> +	}
> +    }
> 
> Maybe my conversion method is a bit "academic" but I think it is a reliable one.

I wrote about that in sibling thread that this could be botteneck so you
should optimize that, there are lot of completely unnecessary ands. 

Mine should be faster, its straigth from wikipedia utf8 page with
removed checks and replaced getchar() (There is still typo code_unit1
that I didn't replace with p[0])

Both will produce codepoints for valid inputs and random number from
0-65536 for invalid when you store that in uint16_t variable. One could
for performance use int to save truncation instruction in 1-2 byte case.
diff mbox

Patch

diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c
index a39a94f..93ced10 100644
--- a/locale/programs/ld-collate.c
+++ b/locale/programs/ld-collate.c
@@ -244,9 +244,9 @@  struct locale_collate_t
      Therefore we keep all relevant input in a list.  */
   struct locale_collate_t *next;

-  /* Arrays with heads of the list for each of the leading bytes in
+  /* Arrays with heads of the list for the leading bytes in
      the multibyte sequences.  */
-  struct element_t *mbheads[256];
+  struct element_t *mbheads[256 * 256];

   /* Arrays with heads of the list for each of the leading bytes in
      the multibyte sequences.  */
@@ -1558,6 +1558,7 @@  collate_finish (struct localedef_t *locale, const struct
charmap_t *charmap)
   struct section_list *sect;
   int ruleidx;
   int nr_wide_elems = 0;
+  bool is_utf8 = strcmp (charmap->code_set_name, "UTF-8") == 0;

   if (collate == NULL)
     {
@@ -1664,7 +1665,50 @@  collate_finish (struct localedef_t *locale, const struct
charmap_t *charmap)
 	  struct element_t *lastp = NULL;

 	  /* Find the point where to insert in the list.  */
-	  eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
+	  uint16_t index = ((unsigned char *) runp->mbs)[0];
+
+	  /* Special handling of UTF-8: Generate a 2-byte index to mbheads.
+	     Also check the UTF-8 encoding.  Keep locale/weight.h in sync.  */
+	  if (is_utf8)
+	    {
+	      if ((index & 0xC0) == 0x80)
+		{
+		utf8_error:
+		  WITH_CUR_LOCALE (error_at_line (0, 0, runp->file, runp->line,
+						  _("\
+malformed UTF-8 character in `%s'"), runp->name););
+		  goto dont_insert;
+		}
+	      else if ((index & 0xE0) == 0xC0)
+		{
+		  if (runp->nmbs < 2)
+		    goto utf8_error;
+		  uint16_t byte2 = ((unsigned char *) runp->mbs)[1];
+		  index = ((index & 0x1F) << 6) | (byte2 & 0x3F);
+		}
+	      else if ((index & 0xF0) == 0xE0)
+		{
+		  if (runp->nmbs < 3)
+		    goto utf8_error;
+		  uint16_t byte2 = ((unsigned char *) runp->mbs)[1];
+		  uint16_t byte3 = ((unsigned char *) runp->mbs)[2];
+		  index = ((index & 0xF) << 12) | ((byte2 & 0x3F) << 6) |
+			  (byte3 & 0x3F);
+		}
+	      else if ((index & 0xF8) == 0xF0)
+		{
+		  if (runp->nmbs < 4)
+		    goto utf8_error;
+		  uint16_t byte3 = ((unsigned char *) runp->mbs)[2];
+		  uint16_t byte4 = ((unsigned char *) runp->mbs)[3];
+		  index = ((index & 0xF) << 12) | ((byte3 & 0x3F) << 6) |
+			  (byte4 & 0x3F);
+		}
+	      else if ((index & 0x80) != 0)
+		goto utf8_error;
+	    }
+
+	  eptr = &collate->mbheads[index];
 	  while (*eptr != NULL)
 	    {
 	      if ((*eptr)->nmbs < runp->nmbs)
@@ -1735,7 +1779,7 @@  symbol `%s' has the same encoding as"), (*eptr)->name);

   /* Find out whether any of the `mbheads' entries is unset.  In this
      case we use the UNDEFINED entry.  */
-  for (i = 1; i < 256; ++i)
+  for (i = 1; i < 256 * 256; ++i)
     if (collate->mbheads[i] == NULL)
       {
 	need_undefined = 1;
@@ -2108,7 +2152,7 @@  collate_output (struct localedef_t *locale, const struct
charmap_t *charmap,
   const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
   struct locale_file file;
   size_t ch;
-  int32_t tablemb[256];
+  int32_t tablemb[256 * 256];
   struct obstack weightpool;
   struct obstack extrapool;
   struct obstack indirectpool;
@@ -2186,7 +2230,7 @@  collate_output (struct localedef_t *locale, const struct
charmap_t *charmap,
   if (collate->undefined.used_in_level != 0)
     output_weight (&weightpool, collate, &collate->undefined);

-  for (ch = 1; ch < 256; ++ch)
+  for (ch = 1; ch < 256 * 256; ++ch)
     if (collate->mbheads[ch]->mbnext == NULL
 	&& collate->mbheads[ch]->nmbs <= 1)
       {
@@ -2211,7 +2255,6 @@  collate_output (struct localedef_t *locale, const struct
charmap_t *charmap,
 	   and add only one index into the weight table.  We can find the
 	   consecutive entries since they are also consecutive in the list.  */
 	struct element_t *runp = collate->mbheads[ch];
-	struct element_t *lastp;

 	assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));

@@ -2239,7 +2282,7 @@  collate_output (struct localedef_t *locale, const struct
charmap_t *charmap,

 		/* Compute how much space we will need.  */
 		added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1
-					  + 2 * (runp->nmbs - 1));
+					  + 2 * runp->nmbs);
 		assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
 		obstack_make_room (&extrapool, added);

@@ -2262,9 +2305,9 @@  collate_output (struct localedef_t *locale, const struct
charmap_t *charmap,
 		/* Now walk backward from here to the beginning.  */
 		curp = runp;

-		assert (runp->nmbs <= 256);
-		obstack_1grow_fast (&extrapool, curp->nmbs - 1);
-		for (i = 1; i < curp->nmbs; ++i)
+		assert (runp->nmbs <= 255);
+		obstack_1grow_fast (&extrapool, curp->nmbs);
+		for (i = 0; i < curp->nmbs; ++i)
 		  obstack_1grow_fast (&extrapool, curp->mbs[i]);

 		/* Now find the end of the consecutive sequence and
@@ -2284,7 +2327,7 @@  collate_output (struct localedef_t *locale, const struct
charmap_t *charmap,

 		/* And add the end byte sequence.  Without length this
 		   time.  */
-		for (i = 1; i < curp->nmbs; ++i)
+		for (i = 0; i < curp->nmbs; ++i)
 		  obstack_1grow_fast (&extrapool, curp->mbs[i]);
 	      }
 	    else
@@ -2298,15 +2341,15 @@  collate_output (struct localedef_t *locale, const struct
charmap_t *charmap,
 		weightidx = output_weight (&weightpool, collate, runp);

 		added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1
-					  + runp->nmbs - 1);
+					  + runp->nmbs);
 		assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
 		obstack_make_room (&extrapool, added);

 		obstack_int32_grow_fast (&extrapool, weightidx);
-		assert (runp->nmbs <= 256);
-		obstack_1grow_fast (&extrapool, runp->nmbs - 1);
+		assert (runp->nmbs <= 255);
+		obstack_1grow_fast (&extrapool, runp->nmbs);

-		for (i = 1; i < runp->nmbs; ++i)
+		for (i = 0; i < runp->nmbs; ++i)
 		  obstack_1grow_fast (&extrapool, runp->mbs[i]);
 	      }

@@ -2315,30 +2358,25 @@  collate_output (struct localedef_t *locale, const struct
charmap_t *charmap,
 	      obstack_1grow_fast (&extrapool, '\0');

 	    /* Next entry.  */
-	    lastp = runp;
 	    runp = runp->mbnext;
 	  }
 	while (runp != NULL);

 	assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));

-	/* If the final entry in the list is not a single character we
-	   add an UNDEFINED entry here.  */
-	if (lastp->nmbs != 1)
-	  {
-	    int added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1 + 1);
-	    obstack_make_room (&extrapool, added);
+	/* Add an UNDEFINED entry at the end of the list.  */
+	int added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1 + 1);
+	obstack_make_room (&extrapool, added);

-	    obstack_int32_grow_fast (&extrapool, 0);
-	    /* XXX What rule? We just pick the first.  */
-	    obstack_1grow_fast (&extrapool, 0);
-	    /* Length is zero.  */
-	    obstack_1grow_fast (&extrapool, 0);
+	obstack_int32_grow_fast (&extrapool, 0);
+	/* XXX What rule? We just pick the first.  */
+	obstack_1grow_fast (&extrapool, 0);
+	/* Length is zero.  */
+	obstack_1grow_fast (&extrapool, 0);

-	    /* Add alignment bytes if necessary.  */
-	    while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)))
-	      obstack_1grow_fast (&extrapool, '\0');
-	  }
+	/* Add alignment bytes if necessary.  */
+	while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)))
+	  obstack_1grow_fast (&extrapool, '\0');
       }

   /* Add padding to the tables if necessary.  */
@@ -2346,7 +2384,7 @@  collate_output (struct localedef_t *locale, const struct
charmap_t *charmap,
     obstack_1grow (&weightpool, 0);

   /* Now add the four tables.  */
-  add_locale_uint32_array (&file, (const uint32_t *) tablemb, 256);
+  add_locale_uint32_array (&file, (const uint32_t *) tablemb, 256 * 256);
   add_locale_raw_obstack (&file, &weightpool);
   add_locale_raw_obstack (&file, &extrapool);
   add_locale_raw_obstack (&file, &indirectpool);
diff --git a/locale/weight.h b/locale/weight.h
index 721bf7d..86cc9b7 100644
--- a/locale/weight.h
+++ b/locale/weight.h
@@ -21,24 +21,66 @@ 

 /* Find index of weight.  */
 static inline int32_t __attribute__ ((always_inline))
-findidx (const int32_t *table,
+findidx (uint_fast32_t locale_encoding,
+	 const int32_t *table,
 	 const int32_t *indirect,
 	 const unsigned char *extra,
 	 const unsigned char **cpp, size_t len)
 {
-  int_fast32_t i = table[*(*cpp)++];
   const unsigned char *cp;
   const unsigned char *usrc;
+  uint16_t index = (*cpp)[0];

+  /* Special handling of UTF-8: Generate a 2-byte index for table.
+     This has to be equal to the folding in locale/programs/ld-collate.c:
+     collate_finish().  */
+  if (locale_encoding == __cet_utf8 && (index & 0x80) != 0)
+    {
+      if ((index & 0xE0) == 0xC0)
+	{
+	  if (len < 2)
+	    goto utf8_error;
+	  uint16_t byte2 = (*cpp)[1];
+	  index = ((index & 0x1F) << 6) | (byte2 & 0x3F);
+	}
+      else if ((index & 0xF0) == 0xE0)
+	{
+	  if (len < 3)
+	    goto utf8_error;
+	  uint16_t byte2 = (*cpp)[1];
+	  uint16_t byte3 = (*cpp)[2];
+	  index = ((index & 0xF) << 12) | ((byte2 & 0x3F) << 6) |
+		  (byte3 & 0x3F);
+	}
+      else if ((index & 0xF8) == 0xF0)
+	{
+	  if (len < 4)
+	    goto utf8_error;
+	  uint16_t byte3 = (*cpp)[2];
+	  uint16_t byte4 = (*cpp)[3];
+	  index = ((index & 0xF) << 12) | ((byte3 & 0x3F) << 6) |
+		  (byte4 & 0x3F);
+	}
+      else
+	{
+	utf8_error:
+	  *cpp += 1;
+	  return 0;
+	}
+    }
+
+  int_fast32_t i = table[index];
   if (i >= 0)
-    /* This is an index into the weight table.  Cool.  */
-    return i;
+    {
+      /* This is an index into the weight table.  Cool.  */
+      *cpp += 1;
+      return i;
+    }

   /* Oh well, more than one sequence starting with this byte.
      Search for the correct one.  */
   cp = &extra[-i];
   usrc = *cpp;
-  --len;
   while (1)
     {
       size_t nhere;
@@ -57,8 +99,7 @@  findidx (const int32_t *table,
 	  /* It is a single character.  If it matches we found our
 	     index.  Note that at the end of each list there is an
 	     entry of length zero which represents the single byte
-	     sequence.  The first (and here only) byte was tested
-	     already.  */
+	     sequence.  */
 	  size_t cnt;

 	  for (cnt = 0; cnt < nhere && cnt < len; ++cnt)
@@ -68,7 +109,7 @@  findidx (const int32_t *table,
 	  if (cnt == nhere)
 	    {
 	      /* Found it.  */
-	      *cpp += nhere;
+	      *cpp += nhere > 0 ? nhere : 1;
 	      return i;
 	    }

@@ -127,7 +168,7 @@  findidx (const int32_t *table,
 	      while (++cnt < nhere);
 	    }

-	  *cpp += nhere;
+	  *cpp += nhere > 0 ? nhere : 1;
 	  return indirect[-i + offset];
 	}
     }
diff --git a/locale/weightwc.h b/locale/weightwc.h
index 3cd7a69..3781d0d 100644
--- a/locale/weightwc.h
+++ b/locale/weightwc.h
@@ -21,7 +21,8 @@ 

 /* Find index of weight.  */
 static inline int32_t __attribute__ ((always_inline))
-findidx (const int32_t *table,
+findidx (uint_fast32_t encoding,
+	 const int32_t *table,
 	 const int32_t *indirect,
 	 const wint_t *extra,
 	 const wint_t **cpp, size_t len)
diff --git a/posix/fnmatch_loop.c b/posix/fnmatch_loop.c
index f46c9df..d25cfb0 100644
--- a/posix/fnmatch_loop.c
+++ b/posix/fnmatch_loop.c
@@ -389,6 +389,8 @@  FCT (pattern, string, string_end, no_leading_period, flags,
ends, alloca_used)
 			const int32_t *indirect;
 			int32_t idx;
 			const UCHAR *cp = (const UCHAR *) &str;
+			uint_fast32_t encoding = (uint32_t)
+			  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_ENCODING_TYPE);

 # if WIDE_CHAR_VERSION
 			table = (const int32_t *)
@@ -410,7 +412,7 @@  FCT (pattern, string, string_end, no_leading_period, flags,
ends, alloca_used)
 			  _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
 # endif

-			idx = FINDIDX (table, indirect, extra, &cp, 1);
+			idx = FINDIDX (encoding, table, indirect, extra, &cp, 1);
 			if (idx != 0)
 			  {
 			    /* We found a table entry.  Now see whether the
@@ -420,7 +422,7 @@  FCT (pattern, string, string_end, no_leading_period, flags,
ends, alloca_used)
 			    int32_t idx2;
 			    const UCHAR *np = (const UCHAR *) n;

-			    idx2 = FINDIDX (table, indirect, extra,
+			    idx2 = FINDIDX (encoding, table, indirect, extra,
 					    &np, string_end - n);
 			    if (idx2 != 0
 				&& (idx >> 24) == (idx2 >> 24)
diff --git a/posix/regcomp.c b/posix/regcomp.c
index bf8aa16..65d2d1c 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -3426,6 +3426,7 @@  build_equiv_class (bitset_t sbcset, const unsigned char *name)
   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
   if (nrules != 0)
     {
+      uint_fast32_t encoding;
       const int32_t *table, *indirect;
       const unsigned char *weights, *extra, *cp;
       unsigned char char_buf[2];
@@ -3434,6 +3435,7 @@  build_equiv_class (bitset_t sbcset, const unsigned char *name)
       size_t len;
       /* Calculate the index for equivalence class.  */
       cp = name;
+      encoding = (uint32_t) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_ENCODING_TYPE);
       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
 					       _NL_COLLATE_WEIGHTMB);
@@ -3441,7 +3443,7 @@  build_equiv_class (bitset_t sbcset, const unsigned char *name)
 						   _NL_COLLATE_EXTRAMB);
       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
 						_NL_COLLATE_INDIRECTMB);
-      idx1 = findidx (table, indirect, extra, &cp, -1);
+      idx1 = findidx (encoding, table, indirect, extra, &cp, -1);
       if (BE (idx1 == 0 || *cp != '\0', 0))
 	/* This isn't a valid character.  */
 	return REG_ECOLLATE;
@@ -3452,7 +3454,7 @@  build_equiv_class (bitset_t sbcset, const unsigned char *name)
 	{
 	  char_buf[0] = ch;
 	  cp = char_buf;
-	  idx2 = findidx (table, indirect, extra, &cp, 1);
+	  idx2 = findidx (encoding, table, indirect, extra, &cp, 1);
 /*
 	  idx2 = table[ch];
 */
diff --git a/posix/regex_internal.h b/posix/regex_internal.h
index 154e969..bece810 100644
--- a/posix/regex_internal.h
+++ b/posix/regex_internal.h
@@ -743,17 +743,19 @@  re_string_elem_size_at (const re_string_t *pstr, int idx)
 #  ifdef _LIBC
   const unsigned char *p, *extra;
   const int32_t *table, *indirect;
+  uint_fast32_t encoding;
   uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);

   if (nrules != 0)
     {
+      encoding = (uint32_t) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_ENCODING_TYPE);
       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
       extra = (const unsigned char *)
 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
 						_NL_COLLATE_INDIRECTMB);
       p = pstr->mbs + idx;
-      findidx (table, indirect, extra, &p, pstr->len - idx);
+      findidx (encoding, table, indirect, extra, &p, pstr->len - idx);
       return p - pstr->mbs - idx;
     }
   else
diff --git a/posix/regexec.c b/posix/regexec.c
index 70cd606..ef7a5c6 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -3869,6 +3869,7 @@  check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
       if (nrules != 0)
 	{
 	  unsigned int in_collseq = 0;
+	  uint_fast32_t encoding;
 	  const int32_t *table, *indirect;
 	  const unsigned char *weights, *extra;
 	  const char *collseqwc;
@@ -3919,6 +3920,8 @@  check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 	  if (cset->nequiv_classes)
 	    {
 	      const unsigned char *cp = pin;
+	      encoding = (uint32_t)
+		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_ENCODING_TYPE);
 	      table = (const int32_t *)
 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 	      weights = (const unsigned char *)
@@ -3927,7 +3930,8 @@  check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
 	      indirect = (const int32_t *)
 		_NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
-	      int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
+	      int32_t idx = findidx (encoding, table, indirect, extra, &cp,
+				     elem_len);
 	      if (idx > 0)
 		for (i = 0; i < cset->nequiv_classes; ++i)
 		  {
diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index 8f1225f..668ea9d 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -78,9 +78,9 @@  typedef struct
 /* Get next sequence.  Traverse the string as required.  */
 static __always_inline void
 get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
-	      const USTRING_TYPE *weights, const int32_t *table,
-	      const USTRING_TYPE *extra, const int32_t *indirect,
-	      int pass)
+	      const USTRING_TYPE *weights, uint_fast32_t encoding,
+	      const int32_t *table, const USTRING_TYPE *extra,
+	      const int32_t *indirect, int pass)
 {
   size_t val = seq->val = 0;
   int len = seq->len;
@@ -124,7 +124,7 @@  get_next_seq (coll_seq *seq, int nrules, const unsigned char
*rulesets,
 	      us = seq->back_us;
 	      while (i < backw)
 		{
-		  int32_t tmp = findidx (table, indirect, extra, &us, -1);
+		  int32_t tmp = findidx (encoding, table, indirect, extra, &us, -1);
 		  idx = tmp & 0xffffff;
 		  i++;
 		}
@@ -139,7 +139,7 @@  get_next_seq (coll_seq *seq, int nrules, const unsigned char
*rulesets,

 	  while (*us != L('\0'))
 	    {
-	      int32_t tmp = findidx (table, indirect, extra, &us, -1);
+	      int32_t tmp = findidx (encoding, table, indirect, extra, &us, -1);
 	      unsigned char rule = tmp >> 24;
 	      prev_idx = idx;
 	      idx = tmp & 0xffffff;
@@ -345,9 +345,9 @@  STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2,
__locale_t l)

       while (1)
 	{
-	  get_next_seq (&seq1, nrules, rulesets, weights, table,
+	  get_next_seq (&seq1, nrules, rulesets, weights, encoding, table,
 				    extra, indirect, pass);
-	  get_next_seq (&seq2, nrules, rulesets, weights, table,
+	  get_next_seq (&seq2, nrules, rulesets, weights, encoding, table,
 				    extra, indirect, pass);
 	  /* See whether any or both strings are empty.  */
 	  if (seq1.len == 0 || seq2.len == 0)
diff --git a/string/strxfrm_l.c b/string/strxfrm_l.c
index 8b61ea2..95abc4e 100644
--- a/string/strxfrm_l.c
+++ b/string/strxfrm_l.c
@@ -53,6 +53,7 @@  typedef struct
   uint_fast32_t nrules;
   unsigned char *rulesets;
   USTRING_TYPE *weights;
+  uint_fast32_t encoding;
   int32_t *table;
   USTRING_TYPE *extra;
   int32_t *indirect;
@@ -100,8 +101,8 @@  static __always_inline size_t
 find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
 	  unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
 {
-  int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
-			 -1);
+  int32_t tmp = findidx (l_data->encoding, l_data->table, l_data->indirect,
+			 l_data->extra, us, -1);
   *rule_idx = tmp >> 24;
   int32_t idx = tmp & 0xffffff;
   size_t len = l_data->weights[idx++];
@@ -693,6 +694,8 @@  STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n,
__locale_t l)
   /* Get the locale data.  */
   l_data.rulesets = (unsigned char *)
     current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
+  l_data.encoding =
+    current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
   l_data.table = (int32_t *)
     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;