speedup strcoll by using strdiff

Message ID	54FF4F73.7070902@web.de
State	New
Headers	show Return-Path: <libc-alpha-return-57819-incoming=patchwork.ozlabs.org@sourceware.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:message-id:date:from:mime-version:to:subject :content-type:content-transfer-encoding; q=dns; s=default; b=n5W ndf5U9bAUW5nhBH1Mov4Qosj1G7ZEqwzL5NjZfoIlTMcFWlUs/2Ro0GIJ3L5fyMC FtVa6Ap/5RLa2ZDnILFq0rwHoMzOYAY11/vTcrplibNViui81iKXcFC/q/vz+x11 0Q0YXS0us7Vn+XHCXsniqeIAYoNgr1jNGCW7JsYI= Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk Sender: libc-alpha-owner@sourceware.org Message-ID: <54FF4F73.7070902@web.de> Date: Tue, 10 Mar 2015 21:09:23 +0100 From: Leonhard Holz <leonhard.holz@web.de> User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Thunderbird/31.5.0 MIME-Version: 1.0 To: libc-alpha@sourceware.org Subject: [PATCH] speedup strcoll by using strdiff Content-Type: text/plain; charset=utf-8; format=flowed Content-Transfer-Encoding: 7bit

diff --git a/localedata/charmaps-8bit.c b/localedata/charmaps-8bit.c index e69de29..339032d 100644 --- a/localedata/charmaps-8bit.c +++ b/localedata/charmaps-8bit.c @@ -0,0 +1,205 @@ +"ANSI_X3.4-1968", +"ARMSCII-8", +"ASMO_449", +"BRF", +"BS_4730", +"BS_VIEWDATA", +"CP10007", +"CP1125", +"CP1250", +"CP1251", +"CP1252", +"CP1253", +"CP1254", +"CP1255", +"CP1256", +"CP1257", +"CP1258", +"CP737", +"CP770", +"CP771", +"CP772", +"CP773", +"CP774", +"CP775", +"CP949", +"CSA_Z243.4-1985-1", +"CSA_Z243.4-1985-2", +"CSA_Z243.4-1985-GR", +"CSN_369103", +"CWI", +"DEC-MCS", +"DIN_66003", +"DS_2089", +"EBCDIC-AT-DE", +"EBCDIC-AT-DE-A", +"EBCDIC-CA-FR", +"EBCDIC-DK-NO", +"EBCDIC-DK-NO-A", +"EBCDIC-ES", +"EBCDIC-ES-A", +"EBCDIC-ES-S", +"EBCDIC-FI-SE", +"EBCDIC-FI-SE-A", +"EBCDIC-FR", +"EBCDIC-IS-FRISS", +"EBCDIC-IT", +"EBCDIC-PT", +"EBCDIC-UK", +"EBCDIC-US", +"ECMA-CYRILLIC", +"ES", +"ES2", +"GEORGIAN-ACADEMY", +"GEORGIAN-PS", +"GOST_19768-74", +"GREEK7", +"GREEK7-OLD", +"GREEK-CCITT", +"HP-GREEK8", +"HP-ROMAN8", +"HP-ROMAN9", +"HP-THAI8", +"HP-TURKISH8", +"IBM037", +"IBM038", +"IBM1004", +"IBM1026", +"IBM1047", +"IBM1124", +"IBM1129", +"IBM1132", +"IBM1133", +"IBM1160", +"IBM1161", +"IBM1162", +"IBM1163", +"IBM1164", +"IBM256", +"IBM273", +"IBM274", +"IBM275", +"IBM277", +"IBM278", +"IBM280", +"IBM281", +"IBM284", +"IBM285", +"IBM290", +"IBM297", +"IBM420", +"IBM423", +"IBM424", +"IBM437", +"IBM500", +"IBM850", +"IBM851", +"IBM852", +"IBM855", +"IBM856", +"IBM857", +"IBM860", +"IBM861", +"IBM862", +"IBM863", +"IBM864", +"IBM865", +"IBM866", +"IBM866NAV", +"IBM868", +"IBM869", +"IBM870", +"IBM871", +"IBM874", +"IBM875", +"IBM880", +"IBM891", +"IBM903", +"IBM904", +"IBM905", +"IBM918", +"IBM922", +"IEC_P27-1", +"INIS", +"INIS-8", +"INIS-CYRILLIC", +"INVARIANT", +"ISIRI-3342", +"ISO_10367-BOX", +"ISO_11548-1", +"ISO_2033-1983", +"ISO_5427", +"ISO_5427-EXT", +"ISO_5428", +"ISO_646.BASIC", +"ISO_646.IRV", +"ISO_6937-2-25", +"ISO-8859-1", +"ISO-8859-10", +"ISO-8859-11", +"ISO-8859-13", +"ISO-8859-14", +"ISO-8859-15", +"ISO-8859-16", +"ISO_8859-1,GL", +"ISO-8859-2", +"ISO-8859-3", +"ISO-8859-4", +"ISO-8859-5", +"ISO-8859-6", +"ISO-8859-7", +"ISO-8859-8", +"ISO-8859-9", +"ISO-8859-9E", +"ISO_8859-SUPP", +"ISO-IR-197", +"ISO-IR-209", +"IT", +"JIS_C6220-1969-JP", +"JIS_C6220-1969-RO", +"JIS_C6229-1984-A", +"JIS_C6229-1984-B", +"JIS_C6229-1984-B-ADD", +"JIS_C6229-1984-HAND", +"JIS_C6229-1984-HAND-ADD", +"JIS_C6229-1984-KANA", +"JIS_X0201", +"JUS_I.B1.002", +"JUS_I.B1.003-MAC", +"JUS_I.B1.003-SERB", +"KOI-8", +"KOI8-R", +"KOI8-RU", +"KOI8-T", +"KOI8-U", +"KSC5636", +"LATIN-GREEK", +"LATIN-GREEK-1", +"MAC-CENTRALEUROPE", +"MAC-CYRILLIC", +"MACINTOSH", +"MAC-IS", +"MAC-SAMI", +"MAC-UK", +"MIK", +"MSZ_7795.3", +"NATS-DANO", +"NATS-DANO-ADD", +"NATS-SEFI", +"NATS-SEFI-ADD", +"NC_NC00-10", +"NEXTSTEP", +"NF_Z_62-010", +"NF_Z_62-010_1973", +"NS_4551-1", +"NS_4551-2", +"PT", +"PT154", +"PT2", +"RK1048", +"SAMI", +"SAMI-WS2", +"SEN_850200_B", +"SEN_850200_C", +"T.61-7BIT", +"VISCII" diff --git a/string/strcoll_l.c b/string/strcoll_l.c index 658d5b9..2df3361 100644 --- a/string/strcoll_l.c +++ b/string/strcoll_l.c @@ -24,11 +24,13 @@ #include <stdint.h> #include <string.h> #include <sys/param.h> +#include <bits/libc-lock.h> #ifndef STRING_TYPE # define STRING_TYPE char # define USTRING_TYPE unsigned char # define STRCOLL __strcoll_l +# define STRDIFF __strdiff # define STRCMP strcmp # define WEIGHT_H "../locale/weight.h" # define SUFFIX MB @@ -41,6 +43,96 @@ #include "../locale/localeinfo.h" #include WEIGHT_H +/* Mapping of locale to encoding type. */ +typedef struct encoding +{ + int type; + const char *locale; + struct encoding *next; +} *encoding_t; + +#define ENCODING_OTHER 0 +#define ENCODING_8BIT 1 +#define ENCODING_UTF8 2 + +__libc_lock_define (typedef, lock_t) +static encoding_t encodings = NULL; +static const char *const encodings_8bit[] = +{ +#include "../localedata/charmaps-8bit.c" +}; +static lock_t encodings_lock = LLL_LOCK_INITIALIZER; + +static int +strsuffix(const char *str, const char *suffix) +{ + if (!str || !suffix) + return 0; + size_t lenstr = strlen (str); + size_t lensuffix = strlen (suffix); + if (lensuffix > lenstr) + return 0; + return strncmp (str + lenstr - lensuffix, suffix, lensuffix) == 0; +} + +static int +get_encoding (const char *locale) +{ + encoding_t encoding = encodings; + while (encoding != NULL) + { + if (encoding->locale == locale) + return encoding->type; + encoding = encoding->next; + } + + /* Add new encoding type. Lock encoding map and check if another thread + has already added the new type while waiting for the lock. */ + __libc_lock_lock (encodings_lock); + + encoding = encodings; + while (encoding != NULL) + { + if (encoding->locale == locale) + { + __libc_lock_unlock (encodings_lock); + return encoding->type; + } + encoding = encoding->next; + } + + encoding_t new_encoding = malloc (sizeof (encoding_t*)); + if (strsuffix (locale, "UTF-8")) + new_encoding->type = ENCODING_UTF8; + else + { + new_encoding->type = ENCODING_OTHER; + for (size_t i = 0; i < sizeof (encodings_8bit) / sizeof (char *); i++) + if (strsuffix (locale, encodings_8bit[i])) + { + new_encoding->type = ENCODING_8BIT; + break; + } + } + new_encoding->locale = locale; + new_encoding->next = encodings; + encodings = new_encoding; + + __libc_lock_unlock (encodings_lock); + return new_encoding->type; +} + +size_t +STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t) +{ + size_t n; + + for (n = 0; *s != '\0' && *s++ == *t++; ++n) + continue; + + return n; +} + /* Track status while looking for sequences in a string. */ typedef struct { @@ -242,6 +334,9 @@ out: return result; } +#define MASK_UTF8_7BIT (1 << 7) +#define MASK_UTF8_START (3 << 6) + int STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l) { @@ -255,9 +350,28 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l) const USTRING_TYPE *extra; const int32_t *indirect; + /* In case there is no locale specific sort order (C locale). */ if (nrules == 0) return STRCMP (s1, s2); + /* Fast forward to the position of the first difference. Needs to be + encoding aware as the byte-by-byte comparison can stop in the midle + of a char sequence for non fixed width encodings like UTF-8. */ + int encoding = get_encoding (l->__names[LC_CTYPE]); + if (encoding != ENCODING_OTHER) + { + size_t diff = STRDIFF (s1, s2); + if (diff > 0) + { + if (encoding == ENCODING_UTF8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0) + do + diff--; + while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START); + s1 += diff; + s2 += diff; + } + } + /* Catch empty strings. */ if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0')) return (*s1 != '\0') - (*s2 != '\0'); @@ -321,7 +435,8 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l) byte-level comparison to ensure that we don't waste time going through multiple passes for totally equal strings before proceeding to subsequent passes. */ - if (pass == 0 && STRCMP (s1, s2) == 0) + if (pass == 0 && encoding == ENCODING_OTHER && + STRCMP (s1, s2) == 0) return result; else break; diff --git a/wcsmbs/wcscoll_l.c b/wcsmbs/wcscoll_l.c index 106ec93..9f60cee 100644 --- a/wcsmbs/wcscoll_l.c +++ b/wcsmbs/wcscoll_l.c @@ -23,6 +23,7 @@ #define STRING_TYPE wchar_t #define USTRING_TYPE wint_t #define STRCOLL __wcscoll_l +#define STRDIFF __wcsdiff #define STRCMP wcscmp #define WEIGHT_H "../locale/weightwc.h" #define SUFFIX WC

speedup strcoll by using strdiff

Commit Message

Comments

Patch