diff mbox

[v2] speedup strcoll by using strdiff

Message ID 550284F9.6090503@web.de
State New
Headers show

Commit Message

Leonhard Holz March 13, 2015, 6:34 a.m. UTC
This patch uses strdiff to skip over equal prefixes in strcoll. This is 
implemented for 8-bit character sets and UTF-8.

The benchmark shows the following changes:

file listing	-52.82%
vi_VN.UTF-8	-10.63%
en_US.UTF-8	-12.33%
ar_SA.UTF-8	-23.02%
zh_CN.UTF-8	4.80%
cs_CZ.UTF-8	-8.54%
en_GB.UTF-8	-4.14%
da_DK.UTF-8	-13.84%
pl_PL.UTF-8	-8.77%
fr_FR.UTF-8	-7.12%
pt_PT.UTF-8	-7.80%
el_GR.UTF-8	-21.93%
ru_RU.UTF-8	-14.70%
iw_IL.UTF-8	-34.43%
es_ES.UTF-8	-8.97%
hi_IN.UTF-8	-59.70%
sv_SE.UTF-8	-15.63%
hu_HU.UTF-8	-17.53%
tr_TR.UTF-8	-9.62%
is_IS.UTF-8	-5.57%
it_IT.UTF-8	-7.22%
sr_RS.UTF-8	-20.78%
ja_JP.UTF-8	21.09%

	* string/strcoll_l.c (strsuffix): New function.
	* (get_encoding): Likewise.
	* (STRDIFF): Likewise.
	* (STRCOLL): Use STRDIFF to skip over equal prefix.
	* wcsmbs/wcscoll_l.c: Define STRDIFF.

Comments

Leonhard Holz March 24, 2015, 1 p.m. UTC | #1
Please review: https://sourceware.org/ml/libc-alpha/2015-03/msg00495.html
Ondřej Bílka April 3, 2015, 3:15 p.m. UTC | #2
On Fri, Mar 13, 2015 at 07:34:33AM +0100, Leonhard Holz wrote:
> This patch uses strdiff to skip over equal prefixes in strcoll. This
> is implemented for 8-bit character sets and UTF-8.
> 
Almost ok for me.

I would rather do detection in setlocale by setting new value so you
could use say:

current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;

Now it would be faster unless you have long running application that
uses lot of locales. Then linear browsing of list could be bottleneck.

Second request is moving strcoll to separate file due optimized
implementations.

Following should work for 64bit archs with little endian allowing
unaligned loads.

Also when strdiff is checked I will add patch that uses strdiff for 
strcase*cmp as only powerpc and x86/64 have optimized assembly.

size_t 
strdiff (char *x, char *y)
{
  uint64_t *xp = (uint64_t *) x,  *yp = (uint64_t *) y;
  size_t i;
  while (x[i] == y[i])
    i++;
  return 8 * i + (ffsl (x[i] ^ y[i]) - 1) / 8;
}
Leonhard Holz April 5, 2015, 12:51 p.m. UTC | #3
Am 03.04.2015 um 17:15 schrieb Ondřej Bílka:
> On Fri, Mar 13, 2015 at 07:34:33AM +0100, Leonhard Holz wrote:
>> This patch uses strdiff to skip over equal prefixes in strcoll. This
>> is implemented for 8-bit character sets and UTF-8.
>>
> Almost ok for me.
>
> I would rather do detection in setlocale by setting new value so you
> could use say:
>
> current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
>
> Now it would be faster unless you have long running application that
> uses lot of locales. Then linear browsing of list could be bottleneck.
>

That did work. The v3 patch follows.

> Second request is moving strcoll to separate file due optimized
> implementations.

Did you mean strdiff?
Ondřej Bílka April 6, 2015, 12:52 p.m. UTC | #4
On Sun, Apr 05, 2015 at 02:51:08PM +0200, Leonhard Holz wrote:
> Am 03.04.2015 um 17:15 schrieb Ondřej Bílka:
> >Second request is moving strcoll to separate file due optimized
> >implementations.
> 
> Did you mean strdiff?

Yes, it was typo.
Leonhard Holz April 6, 2015, 8:59 p.m. UTC | #5
Am 06.04.2015 um 14:52 schrieb Ondřej Bílka:
> On Sun, Apr 05, 2015 at 02:51:08PM +0200, Leonhard Holz wrote:
>> Am 03.04.2015 um 17:15 schrieb Ondřej Bílka:
>>> Second request is moving strcoll to separate file due optimized
>>> implementations.
>>
>> Did you mean strdiff?
>
> Yes, it was typo.
>

Ok. When moved to a different file, the signature must be declared somehow. Can I 
do this in string.h? And should it be inside an #ifdef or so?
Leonhard Holz April 12, 2015, 9:20 a.m. UTC | #6
Hey guys! I need an answer to continue.

Am 06.04.2015 um 22:59 schrieb Leonhard Holz:
> 
> Am 06.04.2015 um 14:52 schrieb Ondřej Bílka:
>> On Sun, Apr 05, 2015 at 02:51:08PM +0200, Leonhard Holz wrote:
>>> Am 03.04.2015 um 17:15 schrieb Ondřej Bílka:
>>>> Second request is moving strcoll to separate file due optimized
>>>> implementations.
>>>
>>> Did you mean strdiff?
>>
>> Yes, it was typo.
>>
> 
> Ok. When moved to a different file, the signature must be declared somehow. Can I
> do this in string.h? And should it be inside an #ifdef or so?
diff mbox

Patch

diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index 658d5b9..b82a341 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -23,12 +23,15 @@ 
  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>
+#include <stdio.h>
  #include <sys/param.h>
+#include <bits/libc-lock.h>

  #ifndef STRING_TYPE
  # define STRING_TYPE char
  # define USTRING_TYPE unsigned char
  # define STRCOLL __strcoll_l
+# define STRDIFF __strdiff
  # define STRCMP strcmp
  # define WEIGHT_H "../locale/weight.h"
  # define SUFFIX	MB
@@ -41,6 +44,87 @@ 
  #include "../locale/localeinfo.h"
  #include WEIGHT_H

+/* Mapping of locale to encoding type.  */
+typedef struct encoding
+{
+  int type;
+  const char *locale;
+  struct encoding *next;
+} *encoding_t;
+
+#define ENCODING_OTHER 0
+#define ENCODING_8BIT  1
+#define ENCODING_UTF8  2
+
+__libc_lock_define (typedef, lock_t)
+static encoding_t encodings = NULL;
+static lock_t encodings_lock = LLL_LOCK_INITIALIZER;
+
+static int
+strsuffix(const char *str, const char *suffix)
+{
+  if (!str || !suffix)
+    return 0;
+  size_t lenstr = strlen (str);
+  size_t lensuffix = strlen (suffix);
+  if (lensuffix >  lenstr)
+    return 0;
+  return strncmp (str + lenstr - lensuffix, suffix, lensuffix) == 0;
+}
+
+static int
+get_encoding (const char *locale, struct __locale_data * ctype)
+{
+  encoding_t encoding = encodings;
+  while (encoding != NULL)
+    {
+      if (encoding->locale == locale)
+	return encoding->type;
+      encoding = encoding->next;
+    }
+
+  /* Add new encoding type.  Lock encoding map and check if another thread
+     has already added the new type while waiting for the lock.  */
+  __libc_lock_lock (encodings_lock);
+
+  encoding = encodings;
+  while (encoding != NULL)
+    {
+      if (encoding->locale == locale)
+	{
+	  __libc_lock_unlock (encodings_lock);
+	  return encoding->type;
+	}
+      encoding = encoding->next;
+    }
+
+  encoding_t new_encoding = malloc (sizeof (encoding_t*));
+  size_t mb_cur_max = ctype->values[_NL_ITEM_INDEX (_NL_CTYPE_MB_CUR_MAX)].word;
+  if (mb_cur_max == 1)
+    new_encoding->type = ENCODING_8BIT;
+  else if (strsuffix (locale, "UTF-8"))
+    new_encoding->type = ENCODING_UTF8;
+  else
+    new_encoding->type = ENCODING_OTHER;
+  new_encoding->locale = locale;
+  new_encoding->next = encodings;
+  encodings = new_encoding;
+
+  __libc_lock_unlock (encodings_lock);
+  return new_encoding->type;
+}
+
+size_t
+STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
+{
+  size_t n;
+
+  for (n = 0; *s != '\0' && *s++ == *t++; ++n)
+    continue;
+
+  return n;
+}
+
  /* Track status while looking for sequences in a string.  */
  typedef struct
  {
@@ -242,6 +326,9 @@  out:
    return result;
  }

+#define MASK_UTF8_7BIT  (1 << 7)
+#define MASK_UTF8_START (3 << 6)
+
  int
  STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
  {
@@ -255,9 +342,28 @@  STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, 
__locale_t l)
    const USTRING_TYPE *extra;
    const int32_t *indirect;

+  /* In case there is no locale specific sort order (C locale).  */
    if (nrules == 0)
      return STRCMP (s1, s2);

+  /* Fast forward to the position of the first difference.  Needs to be
+     encoding aware as the byte-by-byte comparison can stop in the midle
+     of a char sequence for non fixed width encodings like UTF-8.  */
+  int encoding = get_encoding (l->__names[LC_CTYPE], l->__locales[LC_CTYPE]);
+  if (encoding != ENCODING_OTHER)
+    {
+      size_t diff = STRDIFF (s1, s2);
+      if (diff > 0)
+	{
+	  if (encoding == ENCODING_UTF8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
+	    do
+	      diff--;
+	    while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
+	  s1 += diff;
+	  s2 += diff;
+	}
+    }
+
    /* Catch empty strings.  */
    if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
      return (*s1 != '\0') - (*s2 != '\0');
@@ -321,7 +427,8 @@  STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, 
__locale_t l)
  		     byte-level comparison to ensure that we don't waste time
  		     going through multiple passes for totally equal strings
  		     before proceeding to subsequent passes.  */
-		  if (pass == 0 && STRCMP (s1, s2) == 0)
+		  if (pass == 0 && encoding == ENCODING_OTHER &&
+		      STRCMP (s1, s2) == 0)
  		    return result;
  		  else
  		    break;
diff --git a/wcsmbs/wcscoll_l.c b/wcsmbs/wcscoll_l.c
index 106ec93..9f60cee 100644
--- a/wcsmbs/wcscoll_l.c
+++ b/wcsmbs/wcscoll_l.c
@@ -23,6 +23,7 @@ 
  #define STRING_TYPE wchar_t
  #define USTRING_TYPE wint_t
  #define STRCOLL __wcscoll_l
+#define STRDIFF __wcsdiff
  #define STRCMP wcscmp
  #define WEIGHT_H "../locale/weightwc.h"
  #define SUFFIX	WC