diff mbox

[1/3] Refactor strdiff.

Message ID 20150513085810.GA31782@domone
State New
Headers show

Commit Message

Ondřej Bílka May 13, 2015, 8:58 a.m. UTC
Hi, as I want to improve strcasecmp with strdiff, the first step is to move
it to a separate file. I also factored out the UTF-8 handling. In addition
I added a micro-optimization for finding the start of a character, as you
can do an a < x < b check with a single subtraction and comparison, and I
unrolled the loop, as it can iterate at most three times.

Then there is the wide-character handling. I pass an explicit encoding
there, as the wide-character version can be used directly.

OK with this?

	* string/strdiff.h: New file.
	* string/strcoll_l.c: Move out STRDIFF implementation.
diff mbox

Patch

diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index 0fa005f..297ec9c 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -30,6 +30,7 @@ 
 # define USTRING_TYPE unsigned char
 # define STRCOLL __strcoll_l
 # define STRDIFF __strdiff
+# define STRDIFF_L __strdiff_l
 # define STRCMP strcmp
 # define WEIGHT_H "../locale/weight.h"
 # define SUFFIX	MB
@@ -42,19 +43,7 @@ 
 #include "../locale/localeinfo.h"
 #include WEIGHT_H
 
-#define MASK_UTF8_7BIT  (1 << 7)
-#define MASK_UTF8_START (3 << 6)
-
-size_t
-STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
-{
-  size_t n;
-
-  for (n = 0; *s != '\0' && *s++ == *t++; ++n)
-    continue;
-
-  return n;
-}
+#include "string/strdiff.h"
 
 /* Track status while looking for sequences in a string.  */
 typedef struct
@@ -274,24 +263,14 @@  STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   if (nrules == 0)
     return STRCMP (s1, s2);
 
-  /* Fast forward to the position of the first difference.  Needs to be
-     encoding aware as the byte-by-byte comparison can stop in the middle
-     of a char sequence for multibyte encodings like UTF-8.  */
+  /* Fast forward to the position of the first difference.  */
   uint_fast32_t encoding =
     current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
-  if (encoding != __cet_other)
-    {
-      size_t diff = STRDIFF (s1, s2);
-      if (diff > 0)
-	{
-	  if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
-	    do
-	      diff--;
-	    while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
-	  s1 += diff;
-	  s2 += diff;
-	}
-    }
+
+  if (sizeof (STRING_TYPE) > 1)
+    STRDIFF_L (&s1, &s2, __cet_8bit);
+  else if (encoding != __cet_other)
+    STRDIFF_L (&s1, &s2, encoding);
 
   /* Catch empty strings.  */
   if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
diff --git a/string/strdiff.h b/string/strdiff.h
new file mode 100644
index 0000000..224d899
--- /dev/null
+++ b/string/strdiff.h
@@ -0,0 +1,44 @@ 
+/* Return the number of leading elements that S and T have in common.
+   Scanning stops at the first mismatch or at the terminator of S.  */
+static size_t
+STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
+{
+  size_t n;
+
+  for (n = 0; *s != '\0' && *s++ == *t++; ++n)
+    continue;
+
+  return n;
+}
+
+/* UTF-8 continuation bytes have the form 10xxxxxx, i.e. 0x80 .. 0xBF.
+   Anything from 0xC0 upwards is a lead byte and must not be skipped.  */
+#define UTF8_CONT_START 0x80
+#define UTF8_CONT_END 0xBF
+
+/* Advance *S1 and *S2 past their common prefix.  When ENCODING is
+   __cet_utf8 the position is moved back so that it does not rest on a
+   continuation byte, i.e. in the middle of a multibyte character.  */
+static void
+STRDIFF_L (const STRING_TYPE **s1, const STRING_TYPE **s2, uint_fast32_t encoding)
+{
+  size_t diff = STRDIFF (*s1, *s2);
+
+  if (encoding == __cet_utf8)
+    {
+      /* A lead byte is followed by at most three continuation bytes, so
+         three steps back are always enough to reach it.  */
+      int steps;
+
+      for (steps = 0; steps < 3 && diff > 0; ++steps)
+        {
+          USTRING_TYPE c = *(*s1 + diff);
+
+          if (c < UTF8_CONT_START || c > UTF8_CONT_END)
+            break;
+          diff--;
+        }
+    }
+  *s1 += diff;
+  *s2 += diff;
+}