Message ID | 1429254584-124997-1-git-send-email-raji@linux.vnet.ibm.com |
---|---|
State | New |
Headers | show |
On Fri, Apr 17, 2015 at 02:09:44AM -0500, Rajalakshmi Srinivasaraghavan wrote: > This patch optimizes strstr function for power >= 7 systems. Performance > gain is obtained using doubleword aligned memory access and usage of cmpb > instruction for quicker comparison. The average improvement of this > optimization is ~40%. As I did same with x64 I will add improvements, I didn't look at patch in detail. First problem looks there is quadratic worst case. That is solved by buy-or-rent technique. You count number of comparisons and if that exceeds C*N where N is number of haystack bytes read so far and C suitable constant then you switch to slower generic algorithm. Main improvement in x64 case was checking for pair of characters. A pair of characters is less likely than single character even vectorized branch would be rarely triggered saving misprediction cost that dominates performance. That improved performance about 25 times. For checking pair only with arithmetic you look on zero bytes of following expression (LOAD (haystack + n) ^ BROADCAST(needle[0])) | (LOAD (haystack + n + 1) ^ BROADCAST(needle[1])) Then you need to do technical improvements. One is unrolling to test 64 bytes at once, you could keep that to 32/16 if that improves performance. Hot portion is header, loop is rarely used so you should check first 64 bytes without going to main loop. Next is that you need to load vector twice, once for first character and then for second character. You need to have loads corresponding to second character aligned. For first character you could use unaligned loads if they are fast or get these by bit fiddling from second character loads. This way you don't have to worry about crossing page boundaries. Here is template that I used to generate x64 version you could use that with modifications. Main problem is mapping suitable instrincts or replacing these with corresponding powerpc instructions. 
#include <emmintrin.h> #include <stdint.h> #include <stdlib.h> #include "../header.h" #ifdef AS_SSSE3 # define PREVCHAR(s, va, vb) CONCAT (va, vb, 15) #else # define PREVCHAR(s, va, vb) LOADU (s) #endif static inline tp_mask forget_first_bit (tp_mask x) { return x & (x - 1); } static inline int strcmp2 (char *x, char *y, long *buy) { int i = 0; while (1) { if (!y[i]) return 0; if (x[i] != y[i]) { buy += i; return 1; } i++; } } char *strchr (char *s, int n); char *strstr_string (char *a, char *b); unsigned char * strstr_new (unsigned char *s, unsigned char *n) { long i; unsigned char *s_or, *s_al; if (__builtin_expect (!n[0], 0)) return s; if (!n[1]) return strchr (s, n[0]); tp_vector v0 = BROADCAST (n[0]), v1 = BROADCAST (n[1]), vz = BROADCAST (0); tp_vector vs0, vs1, vs2, vs3, vsb; long buy; long unused; if ((((size_t) s) & 4095) <= 4096 - 65) { vs0 = LOADU (s); /*or test three chars. */ vs0 = OR (MIN (EQ (vs0, v0), EQ (v1, LOADU (s + 1))), EQ (vs0, vz)); tp_mask m = get_mask (vs0); //| (get_mask(vs1) << 16) | (get_mask(vs2) <<32) | (get_mask(vs3)<<48); while (m) { unsigned char *r = s + first_bit (m); if (!*r) return NULL; if (!strcmp2 (r + 2, n + 2, &unused)) return r; m = forget_first_bit (m); } vs1 = LOADU (s + 16); vs2 = LOADU (s + 32); vs3 = LOADU (s + 48); /*or test three chars. */ vs1 = OR (MIN (EQ (vs1, v0), EQ (v1, LOADU (s + 16 + 1))), EQ (vs1, vz)); vs2 = OR (MIN (EQ (vs2, v0), EQ (v1, LOADU (s + 32 + 1))), EQ (vs2, vz)); vs3 = OR (MIN (EQ (vs3, v0), EQ (v1, LOADU (s + 48 + 1))), EQ (vs3, vz)); m = (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48); while (m) { unsigned char *r = s + first_bit (m); if (!*r) return NULL; if (!strcmp2 (r + 2, n + 2, &unused)) return r; m = forget_first_bit (m); } } else { s_al = ALIGN_DOWN (s, 64); vs0 = LOAD (s_al); vs1 = LOAD (s_al + 16); vs2 = LOAD (s_al + 32); vs3 = LOAD (s_al + 48); /*or test three chars. 
*/ vs0 = OR (MIN (EQ (vs0, v1), EQ (v0, LOADU (s_al - 1))), EQ (vs0, vz)); vs1 = OR (MIN (EQ (vs1, v1), EQ (v0, LOADU (s_al + 16 - 1))), EQ (vs1, vz)); vs2 = OR (MIN (EQ (vs2, v1), EQ (v0, LOADU (s_al + 32 - 1))), EQ (vs2, vz)); vs3 = OR (MIN (EQ (vs3, v1), EQ (v0, LOADU (s_al + 48 - 1))), EQ (vs3, vz)); tp_mask m = get_mask (vs0) | (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48); m = m >> (s - s_al); while (m) { unsigned char *r = s + first_bit (m); if (!*r) return NULL; if (r != s && !strcmp2 (r + 2, n + 2, &unused)) return r; m = forget_first_bit (m); } } buy = -512; s_or = s; s = ALIGN_DOWN (s + 64, 64); #ifdef AS_SSSE3 vs3 = LOAD (s - 16); #endif while (1) { vs0 = LOAD (s); vs1 = LOAD (s + 16); vs2 = LOAD (s + 32); vs3 = LOAD (s + 48); /*or test three chars. */ tp_vector ret = vs0; ret = MIN (ret, vs1); ret = MIN (ret, vs2); ret = MIN (ret, vs3); tp_vector ret2 = OR (XOR (PREVCHAR (s - 1, vs3, vs0), v0), XOR (vs0, v1)); ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 16, vs0, vs1), v0), XOR (vs1, v1))); ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 32, vs1, vs2), v0), XOR (vs2, v1))); ret2 = MIN (ret2, OR (XOR (PREVCHAR (s - 1 + 48, vs2, vs3), v0), XOR (vs3, v1))); ret = MIN (ret, ret2); if (get_mask (EQ (ret, vz))) { s_al = ALIGN_DOWN (s, 64); vs0 = LOAD (s_al); vs1 = LOAD (s_al + 16); vs2 = LOAD (s_al + 32); vs3 = LOAD (s_al + 48); /*or test three chars. 
*/ vs0 = OR (MIN (EQ (vs0, v1), EQ (v0, LOADU (s_al - 1))), EQ (vs0, vz)); /* todo last three nondestructive */ vs1 = OR (MIN (EQ (vs1, v1), EQ (v0, LOADU (s_al + 16 - 1))), EQ (vs1, vz)); vs2 = OR (MIN (EQ (vs2, v1), EQ (v0, LOADU (s_al + 32 - 1))), EQ (vs2, vz)); vs3 = OR (MIN (EQ (vs3, v1), EQ (v0, LOADU (s_al + 48 - 1))), EQ (vs3, vz)); tp_mask m = get_mask (vs0) | (get_mask (vs1) << 16) | (get_mask (vs2) << 32) | (get_mask (vs3) << 48); while (m) { unsigned char *r = s + first_bit (m); if (!*r) return NULL; if (!strcmp2 (r + 1, n + 2, &buy)) return r - 1; if (buy > s - s_or) return strstr_string (s, n); m = forget_first_bit (m); } } s += 64; } }
On Fri, Apr 17, 2015 at 12:09 AM, Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> wrote: > diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S > new file mode 100644 > index 0000000..68b93b0 > --- /dev/null > +++ b/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S > @@ -0,0 +1,44 @@ > +/* Optimized strstr implementation for POWER7. > + Copyright (C) 2015-2016 Free Software Foundation, Inc. Help me understand the 2016 copyright?
On 05/04/2015 05:21 AM, Matt Turner wrote: > On Fri, Apr 17, 2015 at 12:09 AM, Rajalakshmi Srinivasaraghavan > <raji@linux.vnet.ibm.com> wrote: >> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S >> new file mode 100644 >> index 0000000..68b93b0 >> --- /dev/null >> +++ b/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S >> @@ -0,0 +1,44 @@ >> +/* Optimized strstr implementation for POWER7. >> + Copyright (C) 2015-2016 Free Software Foundation, Inc. > Help me understand the 2016 copyright? This must be just 2015. I will correct it. >
On 05/01/2015 05:35 PM, Ondřej Bílka wrote: > On Fri, Apr 17, 2015 at 02:09:44AM -0500, Rajalakshmi Srinivasaraghavan wrote: >> This patch optimizes strstr function for power >= 7 systems. Performance >> gain is obtained using doubleword aligned memory access and usage of cmpb >> instruction for quicker comparison. The average improvement of this >> optimization is ~40%. > As I did same with x64 I will add improvements, I didn't look at patch > in detail. > > First problem looks there is quadratic worst case. That is solved by > buy-or-rent technique. You count number of comparisons and if that > exceeds C*N where N is number of haystack bytes read so far and C > suitable constant then you switch to slower generic algorithm. > > Main improvement in x64 case was checking for pair of characters. A pair > of characters is less likely than single character even vectorized > branch would be rarely triggered saving misprediction cost that > dominates performance. > Though I did not understand your program completely, I agree with checking pair of characters can improve the performance. In my optimization there are totally three branches. case 1# if needle len >= 8 (common case) case 2# if needle < 8 (common case) case 3# For page boundary cross haystack. (rare case) For case 1, I always compare 8 characters at a time using cmpb instruction. -> perform basic strlen, strnlen, checks. Move haystack based on strchr result. -> Load first 8 bytes(taking care of aligned load)from haystack and needle and compare them. If they are same proceed to next 8 bytes of haystack and needle. Else load next doubleword from haystack.Form a doubleword with last seven characters from first load and first character from second load and compare again with needle. Proceed in a similar way. This gives good improvement. For case2, always compare 'n' characters at a time, 'n' is needle length. -> Algorithm is same as case 1, however before doing cmpb,create input mask based on needle len. 
For case 1 and case 2, input is always loaded as double word. For case 3, its byte by byte comparison.
On Mon, May 04, 2015 at 11:03:21AM +0530, Rajalakshmi Srinivasaraghavan wrote: > > > On 05/01/2015 05:35 PM, Ondřej Bílka wrote: > >On Fri, Apr 17, 2015 at 02:09:44AM -0500, Rajalakshmi Srinivasaraghavan wrote: > >>This patch optimizes strstr function for power >= 7 systems. Performance > >>gain is obtained using doubleword aligned memory access and usage of cmpb > >>instruction for quicker comparison. The average improvement of this > >>optimization is ~40%. > >As I did same with x64 I will add improvements, I didn't look at patch > >in detail. > > > >First problem looks there is quadratic worst case. That is solved by > >buy-or-rent technique. You count number of comparisons and if that > >exceeds C*N where N is number of haystack bytes read so far and C > >suitable constant then you switch to slower generic algorithm. > > > >Main improvement in x64 case was checking for pair of characters. A pair > >of characters is less likely than single character even vectorized > >branch would be rarely triggered saving misprediction cost that > >dominates performance. > > > Though I did not understand your program completely, I agree with > checking pair of characters > can improve the performance. > It does. AFAIR for practical strings on x64 its only around 3 times slower than theoretical lower bound. A lower bound is strlen call as you must find terminating null when needle isn't present. I hope that main idea was clear and you will use it. Code I send is complex as it contains ten different speedups that improve performance. > In my optimization there are totally three branches. > case 1# if needle len >= 8 (common case) > case 2# if needle < 8 (common case) > case 3# For page boundary cross haystack. (rare case) > Checking if its case 1 or 2 is relatively costy as in practice haystacks tend to be short. Most apps that use strstr have 90% call haystack less than 64 bytes long. You don't need to know needle size if you know that it doesn't contain first pair. 
Also its rare that it contains first pair of characters twice. > For case 1, I always compare 8 characters at a time using cmpb instruction. > -> perform basic strlen, strnlen, checks. Move haystack based on > strchr result. > -> Load first 8 bytes(taking care of aligned load)from haystack and > needle and compare them. > If they are same proceed to next 8 bytes of haystack and needle. > Else load next doubleword from haystack.Form a doubleword with last > seven characters from first load and first character from > second load and compare again with needle. Proceed in a similar way. > This gives good improvement. > Thats looks worse than naive algorithm on practical strings. Most instructions you do are useless as you have mismatch at first character. Following function should beat it. char *strstr (char *s, char *n) { if (n[0] == 0) return s; while (s = strchr (s, n[0])) { long i; for (i=1; n[i] && n[i] == s[i]; i++); if (n[i] == 0) return s; s++; } } Or use musl strstr idea. While implementation is terrible as it in practice unnecessary spends 90% time on preprocessing their loop is relatively good for portable implementation. > For case2, always compare 'n' characters at a time, 'n' is needle length. > -> Algorithm is same as case 1, however before doing cmpb,create > input mask based on needle len. > > For case 1 and case 2, input is always loaded as double word. > > For case 3, its byte by byte comparison. >
On 05/06/2015 02:44 AM, Ondřej Bílka wrote: > > It does. AFAIR for practical strings on x64 its only around 3 times slower than > theoretical lower bound. > > A lower bound is strlen call as you must find terminating null when > needle isn't present. > > I hope that main idea was clear and you will > use it. Code I send is complex as it contains ten different speedups > that improve performance. > > >> In my optimization there are totally three branches. >> case 1# if needle len >= 8 (common case) >> case 2# if needle < 8 (common case) >> case 3# For page boundary cross haystack. (rare case) >> > Checking if its case 1 or 2 is relatively costy as in practice haystacks > tend to be short. Most apps that use strstr have 90% call haystack less than 64 bytes long. > > You don't need to know needle size if you know that it doesn't contain > first pair. Also its rare that it contains first pair of characters twice. strlen of needle is needed in all the cases so as to make sure haystack is longer than needle as part of initial check before proceeding to search. Once this check is passed, strchr is called to find the position of first character. > >> For case 1, I always compare 8 characters at a time using cmpb instruction. >> -> perform basic strlen, strnlen, checks. Move haystack based on >> strchr result. >> -> Load first 8 bytes(taking care of aligned load)from haystack and >> needle and compare them. >> If they are same proceed to next 8 bytes of haystack and needle. >> Else load next doubleword from haystack.Form a doubleword with last >> seven characters from first load and first character from >> second load and compare again with needle. Proceed in a similar way. >> This gives good improvement. >> > Thats looks worse than naive algorithm on practical strings. Most > instructions you do are useless as you have mismatch at first character. > > Following function should beat it. 
> > char *strstr (char *s, char *n) > { > if (n[0] == 0) > return s; > while (s = strchr (s, n[0])) > { > long i; > for (i=1; n[i] && n[i] == s[i]; i++); > if (n[i] == 0) > return s; > s++; > } > } Calling strchr in a loop is not showing improvement when compared to the proposed patch. > > Or use musl strstr idea. While implementation is terrible as it in > practice unnecessary spends 90% time on preprocessing their loop is > relatively good for portable implementation. > >
On Wed, 2015-05-06 at 18:58 +0530, Rajalakshmi Srinivasaraghavan wrote: > > On 05/06/2015 02:44 AM, Ondřej Bílka wrote: > > > > It does. AFAIR for practical strings on x64 its only around 3 times slower than > > theoretical lower bound. > > > > A lower bound is strlen call as you must find terminating null when > > needle isn't present. > > > > I hope that main idea was clear and you will > > use it. Code I send is complex as it contains ten different speedups > > that improve performance. > > > > > >> In my optimization there are totally three branches. > >> case 1# if needle len >= 8 (common case) > >> case 2# if needle < 8 (common case) > >> case 3# For page boundary cross haystack. (rare case) > >> > > Checking if its case 1 or 2 is relatively costy as in practice haystacks > > tend to be short. Most apps that use strstr have 90% call haystack less than 64 bytes long. > > > > You don't need to know needle size if you know that it doesn't contain > > first pair. Also its rare that it contains first pair of characters twice. > strlen of needle is needed in all the cases so as to make sure haystack > is longer than needle > as part of initial check before proceeding to search. Once this check is > passed, strchr is > called to find the position of first character. > I think we should move on with (commit) Raji's patch as is. While we appreciate you suggestions Ondrej, not all optimization apply equally to all platforms. > > > >> For case 1, I always compare 8 characters at a time using cmpb instruction. > >> -> perform basic strlen, strnlen, checks. Move haystack based on > >> strchr result. > >> -> Load first 8 bytes(taking care of aligned load)from haystack and > >> needle and compare them. > >> If they are same proceed to next 8 bytes of haystack and needle. > >> Else load next doubleword from haystack.Form a doubleword with last > >> seven characters from first load and first character from > >> second load and compare again with needle. 
Proceed in a similar way. > >> This gives good improvement. > >> > > Thats looks worse than naive algorithm on practical strings. Most > > instructions you do are useless as you have mismatch at first character. > > > > Following function should beat it. > > > > char *strstr (char *s, char *n) > > { > > if (n[0] == 0) > > return s; > > while (s = strchr (s, n[0])) > > { > > long i; > > for (i=1; n[i] && n[i] == s[i]; i++); > > if (n[i] == 0) > > return s; > > s++; > > } > > } > Calling strchr in a loop is not showing improvement when compared to > the proposed patch. > > > > Or use musl strstr idea. While implementation is terrible as it in > > practice unnecessary spends 90% time on preprocessing their loop is > > relatively good for portable implementation. > > > > >
On Mon, 11 May 2015, Steven Munroe wrote:
> I think we should move on with (commit) Raji's patch as is.
Does the patch avoid quadratic worst case? Quadratic performance in
strstr is an unambiguous regression, allowing for denial-of-service, that
should not be introduced on any platform (cf bug 12100). Indeed, gnulib
has a configure test for quadratic strstr, so if you introduce quadratic
worst case then the effect will be that all applications using gnulib
automatically configure themselves to use gnulib's strstr and not glibc's
at all.
On Mon, 2015-05-11 at 20:19 +0000, Joseph Myers wrote: > On Mon, 11 May 2015, Steven Munroe wrote: > > > I think we should move on with (commit) Raji's patch as is. > > Does the patch avoid quadratic worst case? Quadratic performance in > strstr is an unambiguous regression, allowing for denial-of-service, that > should not be introduced on any platform (cf bug 12100). Indeed, gnulib > has a configure test for quadratic strstr, so if you introduce quadratic > worst case then the effect will be that all applications using gnulib > automatically configure themselves to use gnulib's strstr and not glibc's > at all. > There was an assertion that there "might be" a quadratic worst case. No proof was offered that there was such. Raji, do you have the data for this?
On Wed, May 06, 2015 at 06:58:35PM +0530, Rajalakshmi Srinivasaraghavan wrote: > > > On 05/06/2015 02:44 AM, Ondřej Bílka wrote: > > > >It does. AFAIR for practical strings on x64 its only around 3 times slower than > >theoretical lower bound. > > > >A lower bound is strlen call as you must find terminating null when > >needle isn't present. > > > >I hope that main idea was clear and you will > >use it. Code I send is complex as it contains ten different speedups > >that improve performance. > > > > > >>In my optimization there are totally three branches. > >>case 1# if needle len >= 8 (common case) > >>case 2# if needle < 8 (common case) > >>case 3# For page boundary cross haystack. (rare case) > >> > >Checking if its case 1 or 2 is relatively costy as in practice haystacks > >tend to be short. Most apps that use strstr have 90% call haystack less than 64 bytes long. > > > >You don't need to know needle size if you know that it doesn't contain > >first pair. Also its rare that it contains first pair of characters twice. > strlen of needle is needed in all the cases so as to make sure > haystack is longer than needle > as part of initial check before proceeding to search. Once this > check is passed, strchr is > called to find the position of first character. > No, strlen is entirely unnecessary. When haystack doesn't contain first triple of characters (and if it contains match is relatively likely) then question if needle or haystack are larger is irrelevant. > > > >>For case 1, I always compare 8 characters at a time using cmpb instruction. > >>-> perform basic strlen, strnlen, checks. Move haystack based on > >>strchr result. > >>-> Load first 8 bytes(taking care of aligned load)from haystack and > >>needle and compare them. > >>If they are same proceed to next 8 bytes of haystack and needle. 
> >>Else load next doubleword from haystack.Form a doubleword with last > >>seven characters from first load and first character from > >>second load and compare again with needle. Proceed in a similar way. > >>This gives good improvement. > >> > >Thats looks worse than naive algorithm on practical strings. Most > >instructions you do are useless as you have mismatch at first character. > > > >Following function should beat it. > > > >char *strstr (char *s, char *n) > >{ > > if (n[0] == 0) > > return s; > > while (s = strchr (s, n[0])) > > { > > long i; > > for (i=1; n[i] && n[i] == s[i]; i++); > > if (n[i] == 0) > > return s; > > s++; > > } > >} > Calling strchr in a loop is not showing improvement when compared > to the proposed patch. > > Evidence?
On Mon, May 11, 2015 at 01:10:48PM -0500, Steven Munroe wrote: > On Wed, 2015-05-06 at 18:58 +0530, Rajalakshmi Srinivasaraghavan wrote: > > > > On 05/06/2015 02:44 AM, Ondřej Bílka wrote: > > > > > > It does. AFAIR for practical strings on x64 its only around 3 times slower than > > > theoretical lower bound. > > > > > > A lower bound is strlen call as you must find terminating null when > > > needle isn't present. > > > > > > I hope that main idea was clear and you will > > > use it. Code I send is complex as it contains ten different speedups > > > that improve performance. > > > > > > > > >> In my optimization there are totally three branches. > > >> case 1# if needle len >= 8 (common case) > > >> case 2# if needle < 8 (common case) > > >> case 3# For page boundary cross haystack. (rare case) > > >> > > > Checking if its case 1 or 2 is relatively costy as in practice haystacks > > > tend to be short. Most apps that use strstr have 90% call haystack less than 64 bytes long. > > > > > > You don't need to know needle size if you know that it doesn't contain > > > first pair. Also its rare that it contains first pair of characters twice. > > strlen of needle is needed in all the cases so as to make sure haystack > > is longer than needle > > as part of initial check before proceeding to search. Once this check is > > passed, strchr is > > called to find the position of first character. > > > I think we should move on with (commit) Raji's patch as is. > > While we appreciate you suggestions Ondrej, not all optimization apply > equally to all platforms. No as you failed a basic item of contributor checklist. All performance related patches need a benchmark to show its performance improvement and not regression. A algorithm overview mentioned there is garbage. It would likely cause performance regression in average case but I don't have a powerpc machine to run a test and present numbers.
On 05/11/2015 04:19 PM, Joseph Myers wrote: > On Mon, 11 May 2015, Steven Munroe wrote: > >> I think we should move on with (commit) Raji's patch as is. > > Does the patch avoid quadratic worst case? Quadratic performance in > strstr is an unambiguous regression, allowing for denial-of-service, that > should not be introduced on any platform (cf bug 12100). Indeed, gnulib > has a configure test for quadratic strstr, so if you introduce quadratic > worst case then the effect will be that all applications using gnulib > automatically configure themselves to use gnulib's strstr and not glibc's > at all. > We should add gnulib's test to the microbenchmarks. c.
On 05/12/2015 04:49 AM, Ondřej Bílka wrote: >>> Following function should beat it. >>> >>> char *strstr (char *s, char *n) >>> { >>> if (n[0] == 0) >>> return s; >>> while (s = strchr (s, n[0])) >>> { >>> long i; >>> for (i=1; n[i] && n[i] == s[i]; i++); >>> if (n[i] == 0) >>> return s; >>> s++; >>> } >>> } >> Calling strchr in a loop is not showing improvement when compared >> to the proposed patch. >>> > Evidence? > I agree with Ondrej here. What tests are you doing to show there is no performance improvement? Can we see those tests? Can you contribute them to the microbenchmark so other Power vendors can use them against their particular hardware to validate performance? Cheers, Carlos.
On Thu, 2015-05-21 at 11:08 -0400, Carlos O'Donell wrote: > On 05/12/2015 04:49 AM, Ondřej Bílka wrote: > >>> Following function should beat it. > >>> > >>> char *strstr (char *s, char *n) > >>> { > >>> if (n[0] == 0) > >>> return s; > >>> while (s = strchr (s, n[0])) > >>> { > >>> long i; > >>> for (i=1; n[i] && n[i] == s[i]; i++); > >>> if (n[i] == 0) > >>> return s; > >>> s++; > >>> } > >>> } > >> Calling strchr in a loop is not showing improvement when compared > >> to the proposed patch. > >>> > > Evidence? > > > > I agree with Ondrej here. What tests are you doing to show there is > no performance improvement? Can we see those tests? Can you contribute > them to the microbenchmark so other Power vendors can use them against > their particular hardware to validate performance? > I thought this was addressed with https://www.sourceware.org/ml/libc-alpha/2015-05/msg00276.html Which includes results from benchtest/bench-strstr.c. Why do we need another micro-benchmark for this?
On 05/21/2015 11:27 AM, Steven Munroe wrote: > On Thu, 2015-05-21 at 11:08 -0400, Carlos O'Donell wrote: >> On 05/12/2015 04:49 AM, Ondřej Bílka wrote: >>>>> Following function should beat it. >>>>> >>>>> char *strstr (char *s, char *n) >>>>> { >>>>> if (n[0] == 0) >>>>> return s; >>>>> while (s = strchr (s, n[0])) >>>>> { >>>>> long i; >>>>> for (i=1; n[i] && n[i] == s[i]; i++); >>>>> if (n[i] == 0) >>>>> return s; >>>>> s++; >>>>> } >>>>> } >>>> Calling strchr in a loop is not showing improvement when compared >>>> to the proposed patch. >>>>> >>> Evidence? >>> >> >> I agree with Ondrej here. What tests are you doing to show there is >> no performance improvement? Can we see those tests? Can you contribute >> them to the microbenchmark so other Power vendors can use them against >> their particular hardware to validate performance? >> > > I thought this was addressed with > https://www.sourceware.org/ml/libc-alpha/2015-05/msg00276.html > > Which includes results from benchtest/bench-strstr.c. > > Why do we need another micro-benchmark for this? You don't. The above addresses my complaints. I must have missed the other post upthread. Cheers, Carlos.
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 17265bd..3b0e3a0 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -19,7 +19,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ strcmp-power8 strcmp-power7 strcmp-ppc64 \ strcat-power8 strcat-power7 strcat-ppc64 \ memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \ - strncpy-power8 + strncpy-power8 strstr-power7 strstr-ppc64 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index f5fdea5..9d808e0 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -322,5 +322,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ppc)) + /* Support sysdeps/powerpc/powerpc64/multiarch/strstr.c. */ + IFUNC_IMPL (i, name, strstr, + IFUNC_IMPL_ADD (array, i, strstr, + hwcap & PPC_FEATURE_HAS_VSX, + __strstr_power7) + IFUNC_IMPL_ADD (array, i, strstr, 1, + __strstr_ppc)) + return i; } diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S new file mode 100644 index 0000000..68b93b0 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strstr-power7.S @@ -0,0 +1,44 @@ +/* Optimized strstr implementation for POWER7. + Copyright (C) 2015-2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Redefine EALIGN/END so that including the POWER7 strstr.S below emits
+   its code under the __strstr_power7 symbol instead of strstr, for use
+   as one candidate of the ifunc selector in multiarch/strstr.c.  */
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strstr_power7)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strstr_power7):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strstr_power7)
+
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strstr_power7)					\
+  END_2(__strstr_power7)
+
+/* Suppress the __GI_strstr alias here; the internal alias is attached to
+   the generic fallback in strstr-ppc64.c instead.  */
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+/* Route the helper calls made by strstr.S to the POWER7 variants.  */
+#define STRLEN __strlen_power7
+#define STRNLEN __strnlen_power7
+#define STRCHR __strchr_power7
+
+#include <sysdeps/powerpc/powerpc64/power7/strstr.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strstr-ppc64.c
new file mode 100644
index 0000000..2ca62c4
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strstr-ppc64.c
@@ -0,0 +1,29 @@
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+/* Build the generic C strstr (string/strstr.c) as __strstr_ppc, the
+   fallback candidate for the ifunc selector.  In shared libc the
+   internal __GI_strstr alias points at this fallback.  */
+#define STRSTR __strstr_ppc
+#if IS_IN (libc) && defined(SHARED)
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+  __hidden_ver1(__strstr_ppc, __GI_strstr, __strstr_ppc);
+#endif
+
+extern __typeof (strstr) __strstr_ppc attribute_hidden;
+
+#include <string/strstr.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr.c b/sysdeps/powerpc/powerpc64/multiarch/strstr.c
new file mode 100644
index 0000000..7efc4b0
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strstr.c
@@ -0,0 +1,34 @@
+/* Multiple versions of strstr. PowerPC64 version.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for definition in libc.  */
+#if IS_IN (libc)
+# include <string.h>
+# include <shlib-compat.h>
+# include "init-arch.h"
+
+extern __typeof (strstr) __strstr_ppc attribute_hidden;
+extern __typeof (strstr) __strstr_power7 attribute_hidden;
+
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+   ifunc symbol properly.  Select the POWER7 version when the VSX
+   hwcap bit is present, otherwise fall back to the generic C code.  */
+libc_ifunc (strstr,
+	    (hwcap & PPC_FEATURE_HAS_VSX)
+	    ? __strstr_power7
+	    : __strstr_ppc);
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strstr.S b/sysdeps/powerpc/powerpc64/power7/strstr.S
new file mode 100644
index 0000000..e895d36
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strstr.S
@@ -0,0 +1,500 @@
+/* Optimized strstr implementation for PowerPC64/POWER7.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* char * [r3] strstr (char *s [r3], char * pat[r4])  */
+
+/*
+ * The performance gain is obtained using aligned memory access, load doubleword
+ * and usage of cmpb instruction for quicker comparison.
+ */
+
+#ifndef STRLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+   GLIBC symbol (created by libc_hidden_builtin_def).  */
+# ifdef SHARED
+# define STRLEN __GI_strlen
+# else
+# define STRLEN strlen
+# endif
+#endif
+
+#ifndef STRNLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+   GLIBC symbol (created by libc_hidden_builtin_def).
 */
+# ifdef SHARED
+# define STRNLEN __GI_strnlen
+# else
+# define STRNLEN strnlen
+# endif
+#endif
+
+#ifndef STRCHR
+# ifdef SHARED
+# define STRCHR __GI_strchr
+# else
+# define STRCHR strchr
+# endif
+#endif
+
+/* 32 bytes above the minimum frame hold the three saved non-volatiles
+   (r18/r19/r20) stored by the prologue below.  */
+#define FRAMESIZE (FRAME_MIN_SIZE+32)
+
+/* Register usage after the prologue:
+     r18 - saved haystack pointer, r19 - saved needle pointer,
+     r20 - needle length (result of STRLEN), r8 - current restart
+     position in the haystack, r0 - zero constant for cmpb NUL scans
+     (set at L(begin)/L(lessthan8)).
+   NOTE(review): on a mismatch the scan restarts from r8 + 1 (see
+   L(reset)), so the worst case is O(len(s) * len(pat)); the generic
+   fallback avoids this — confirm whether that is acceptable here.  */
+	.machine  power7
+EALIGN (strstr, 4, 0)
+	CALL_MCOUNT 2
+	mflr	r0			/* load link register LR to r0  */
+	std	r20, -8(r1)		/* save callers register , r20  */
+	std	r19, -16(r1)		/* save callers register , r19  */
+	std	r18, -24(r1)		/* save callers register , r18  */
+	std	r0, 16(r1)		/* store the link register  */
+	stdu	r1, -FRAMESIZE(r1)	/* create the stack frame  */
+
+	dcbt	0, r3
+	dcbt	0, r4
+
+	/* NOTE(review): this is meant as a combined NULL-pointer check,
+	   but the AND of two valid non-NULL pointers can also be zero
+	   (disjoint bit patterns), which would wrongly return NULL —
+	   confirm; two separate compares would be safe.  */
+	and	r0, r3, r4
+	cmpdi	cr7, r0, 0
+	beq	cr7, L(retnull)
+
+	mr	r18, r3
+	mr	r19, r4
+	mr	r3, r4
+	bl	STRLEN
+	nop
+
+	cmpdi	cr7, r3, 0	/* if search str is empty, return haystack  */
+	beq	cr7, L(ret_r3)
+	mr	r20, r3
+	mr	r4, r3
+	mr	r3, r18
+	bl	STRNLEN
+	nop
+
+	cmpd	cr7, r3, r20	/* if len(r3) < len(r4)  */
+	blt	cr7, L(retnull)
+	mr	r3, r18
+	lbz	r4, 0(r19)
+	bl	STRCHR
+	nop
+
+	mr	r11, r3
+	/* if first char of search str is not present  */
+	/* NOTE(review): ble is a signed compare on a pointer; a match
+	   address with the MSB set would be treated as "not found" —
+	   confirm user addresses never have it set on this target.  */
+	cmpdi	cr7, r3, 0
+	ble	cr7, L(end)
+
+	rldicl	r8, r3, 0, 52	/* page cross check  */
+	cmpldi	cr7, r8, 4096-16
+	bgt	cr7, L(bytebybyte)
+
+	rldicl	r8, r19, 0, 52
+	cmpldi	cr7, r8, 4096-16
+	bgt	cr7, L(bytebybyte)
+
+	/* if len(r4) < 8 handle in a different way  */
+	/* Shift position based on null and use cmpb  */
+	cmpdi	cr7, r20, 8
+	blt	cr7, L(lessthan8)
+
+	/* len(r4) >= 8 reaches here  */
+	mr	r8, r3	/* save r3 for future use  */
+L(begin):
+	mr	r3, r8
+	mr	r4, r19		/* restore r4  */
+	li	r0, 0		/* zero for cmpb NUL detection  */
+
+L(begin2):
+	rlwinm	r10, r19, 3, 26, 28	/* calculate padding in bits  */
+	clrrdi	r4, r4, 3		/* Make r4 aligned to 8  */
+	ld	r6, 0(r4)
+	addi	r4, r4, 8
+	cmpdi	cr7, r10, 0		/* check if its already aligned?  */
+	beq	cr7, L(begin1)
+#ifdef __LITTLE_ENDIAN__
+	srd	r6, r6, r10		/* Discard unwanted bits  */
+#else
+	sld	r6, r6, r10
+#endif
+	ld	r9, 0(r4)
+	subfic	r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+	sld	r9, r9, r10		/* Discard unwanted bits  */
+#else
+	srd	r9, r9, r10
+#endif
+	or	r6, r6, r9	/* Form complete search str  */
+L(begin1):
+	rlwinm	r10, r3, 3, 26, 28
+	clrrdi	r3, r3, 3
+	ld	r5, 0(r3)
+	cmpb	r9, r0, r6	/* check if input has null  */
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(return3)
+	cmpb	r9, r0, r5	/* check if input has null  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r9, r10
+#else
+	sld	r9, r9, r10
+#endif
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(return2)
+
+	cmpdi	cr7, r10, 0
+	beq	cr7, L(loop2)
+	ldu	r7, 8(r3)
+	mr	r12, r10
+	addi	r12, r12, -8
+	subfic	r11, r12, 64
+	srdi	r9, r11, 3
+	addi	r9, r9, -1
+	mtctr	r9
+	b	L(nextbyte1)
+
+L(loop2):
+	li	r9, 8
+	li	r12, -8		/* shift values  */
+	li	r11, 72		/* shift values  */
+	ldu	r7, 8(r3)	/* load next dw  */
+	mtctr	r9
+L(nextbyte1):
+	addi	r12, r12, 8	/* shift one byte and compare  */
+	addi	r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r5, r12	/* rotate based on mask  */
+	sld	r10, r7, r11
+#else
+	sld	r9, r5, r12
+	srd	r10, r7, r11
+#endif
+	/* Form single dw from few bytes on first load and second load  */
+	or	r10, r9, r10
+	/* Check for null in the formed dw  */
+	cmpb	r9, r0, r10
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(retnull)
+	/* cmpb search str and input str  */
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	beq	cr7, L(match)
+	bdnz	L(nextbyte1)
+	/* Check if input has null before next load  */
+	cmpb	r9, r0, r7
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(retnull)
+	mr	r5, r7
+	/* Proceed to next load  */
+	b	L(loop2)
+
+	.align	4
+L(match):
+	/* There is a match of 8 bytes, check next bytes  */
+	cmpdi	cr7, r20, 8
+	beq	cr7, L(return)
+	/* Update next starting point r8  */
+	srdi	r9, r11, 3
+	subf	r9, r9, r3
+	mr	r8, r9
+
+L(secondmatch):
+	mr	r5, r7
+	rlwinm	r10, r19, 3, 26, 28	/* calculate padding in bits  */
+	ld	r6, 0(r4)
+	addi	r4, r4, 8
+	cmpdi	cr7, r10, 0		/* check if its already aligned?  */
+	beq	cr7, L(proceed3)
+#ifdef __LITTLE_ENDIAN__
+	srd	r6, r6, r10		/* Discard unwanted bits  */
+	cmpb	r9, r0, r6
+	sld	r9, r9, r10
+#else
+	sld	r6, r6, r10
+	cmpb	r9, r0, r6
+	srd	r9, r9, r10
+#endif
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(proceed3)
+	ld	r9, 0(r4)
+	subfic	r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+	sld	r9, r9, r10		/* Discard unwanted bits  */
+#else
+	srd	r9, r9, r10
+#endif
+	or	r6, r6, r9	/* Form complete search str  */
+
+L(proceed3):
+	li	r7, 0
+	addi	r3, r3, 8
+	cmpb	r9, r0, r5
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(proceed4)
+	ld	r7, 0(r3)
+L(proceed4):
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r5, r12
+	sld	r10, r7, r11
+#else
+	sld	r9, r5, r12
+	srd	r10, r7, r11
+#endif
+	/* Form single dw with few bytes from first and second load  */
+	or	r10, r9, r10
+	cmpb	r9, r0, r6
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(return4)
+	/* Check for null in the formed dw  */
+	cmpb	r9, r0, r10
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(retnull)
+	/* If the next 8 bytes dont match, start search again  */
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	bne	cr7, L(reset)
+	/* If the next 8 bytes match, load and compare next 8  */
+	b	L(secondmatch)
+
+	.align	4
+L(reset):
+	/* start the search again  */
+	addi	r8, r8, 1
+	b	L(begin)
+
+	.align	4
+L(loadnext):
+	/* Update r3 and proceed  */
+	addi	r3, r3, 8
+	b	L(begin2)
+
+	.align	4
+L(return2):
+	mr	r3, r8
+	b	L(end)
+
+	.align	4
+L(return3):
+	/* Count leading zeros and compare partial dw  */
+#ifdef __LITTLE_ENDIAN__
+	addi	r7, r9, -1
+	andc	r7, r7, r9
+	popcntd	r7, r7
+	subfic	r7, r7, 64
+	sld	r10, r5, r7
+	sld	r6, r6, r7
+#else
+	cntlzd	r7, r9
+	subfic	r7, r7, 64
+	srd	r10, r5, r7
+	srd	r6, r6, r7
+#endif
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	addi	r8, r8, 1
+	/* start search again if there is no match  */
+	bne	cr7, L(begin)
+	/* if the words match, update return values  */
+	subfic	r7, r7, 64
+	srdi	r7, r7, 3
+	add	r3, r3, r7
+	subf	r3, r20, r3
+	b	L(end)
+
+	.align	4
+L(return4):
+	/* Count leading zeros and compare partial dw  */
+#ifdef __LITTLE_ENDIAN__
+	addi	r7, r9, -1
+	andc	r7, r7, r9
+	popcntd	r7, r7
+	subfic	r7, r7, 64
+	sld	r10, r10, r7
+	sld	r6, r6, r7
+#else
+	cntlzd	r7, r9
+	subfic	r7, r7, 64
+	srd	r10, r10, r7
+	srd	r6, r6, r7
+#endif
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	addi	r8, r8, 1
+	bne	cr7, L(begin)
+	subfic	r7, r7, 64
+	srdi	r11, r11, 3
+	subf	r3, r11, r3
+	srdi	r7, r7, 3
+	add	r3, r3, r7
+	subf	r3, r20, r3
+	b	L(end)
+
+	/* handle less than 8 search string  */
+	.align	4
+L(lessthan8):
+	mr	r4, r19
+	li	r0, 0		/* zero for cmpb NUL detection  */
+
+	rlwinm	r10, r4, 3, 26, 28	/* calculate padding in bits  */
+	srdi	r8, r10, 3		/* padding in bytes  */
+	clrrdi	r4, r4, 3		/* Make r4 aligned to 8  */
+	ld	r6, 0(r4)
+	cmpdi	cr7, r10, 0		/* check if its already aligned?  */
+	beq	cr7, L(proceed2)
+#ifdef __LITTLE_ENDIAN__
+	srd	r6, r6, r10		/* Discard unwanted bits  */
+#else
+	sld	r6, r6, r10
+#endif
+	subfic	r8, r8, 8
+	cmpd	cr7, r8, r20	/* Next load needed?  */
+	bge	cr7, L(proceed2)
+	ld	r7, 8(r4)
+	subfic	r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+	sld	r7, r7, r10	/* Discard unwanted bits  */
+#else
+	srd	r7, r7, r10
+#endif
+	or	r6, r6, r7	/* Form complete search str  */
+L(proceed2):
+	rlwinm	r10, r3, 3, 26, 28
+	clrrdi	r3, r3, 3	/* Make r3 aligned  */
+	ld	r5, 0(r3)
+	sldi	r8, r20, 3
+	subfic	r8, r8, 64
+#ifdef __LITTLE_ENDIAN__
+	sld	r6, r6, r8
+	cmpb	r9, r0, r5
+	srd	r9, r9, r10
+#else
+	srd	r6, r6, r8
+	cmpb	r9, r0, r5
+	sld	r9, r9, r10
+#endif
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(noload)
+	cmpdi	cr7, r10, 0
+	beq	cr7, L(continue)
+	ldu	r7, 8(r3)
+	mr	r12, r10
+	addi	r12, r12, -8
+	subfic	r11, r12, 64
+	srdi	r9, r11, 3
+	addi	r9, r9, -1
+	mtctr	r9
+	b	L(nextbyte)
+L(continue):
+	ldu	r7, 8(r3)
+L(continue1):
+	li	r9, 8
+	li	r12, -8		/* shift values  */
+	li	r11, 72		/* shift values  */
+	mtctr	r9
+L(nextbyte):
+	addi	r12, r12, 8	/* mask for rotation  */
+	addi	r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r5, r12
+	sld	r10, r7, r11
+	or	r10, r9, r10
+	sld	r10, r10, r8
+	cmpb	r9, r0, r10
+	srd	r9, r9, r8
+#else
+	sld	r9, r5, r12
+	srd	r10, r7, r11
+	or	r10, r9, r10
+	srd	r10, r10, r8
+	cmpb	r9, r0, r10
+	sld	r9, r9, r8
+#endif
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(retnull)
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	beq	cr7, L(return)
+	bdnz	L(nextbyte)
+	mr	r5, r7
+	/* Check null in loaded dw  */
+	cmpb	r9, r0, r7
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(noload)
+	b	L(continue)
+
+	.align	4
+L(noload):
+	/* Reached null in r3, so skip next load  */
+	addi	r3, r3, 8
+	li	r7, 0
+	b	L(continue1)
+
+	.align	4
+L(return):
+	/* Update return values  */
+	srdi	r9, r11, 3
+	subf	r3, r9, r3
+	b	L(end)
+
+	/* handling byte by byte  */
+	.align	4
+L(bytebybyte):
+	mr	r8, r3
+	addi	r8, r8, -1
+L(loop1):
+	addi	r8, r8, 1
+	mr	r3, r8
+	mr	r4, r19
+	lbz	r6, 0(r4)
+	cmpdi	cr7, r6, 0
+	beq	cr7, L(updater3)
+L(loop):
+	lbz	r5, 0(r3)
+	cmpdi	cr7, r5, 0
+	beq	cr7, L(retnull)
+	cmpld	cr7, r6, r5
+	bne	cr7, L(loop1)
+	addi	r3, r3, 1
+	addi	r4, r4, 1
+	lbz	r6, 0(r4)
+	cmpdi	cr7, r6, 0
+	beq	cr7, L(updater3)
+	b	L(loop)
+
+	/* handling return values  */
+	.align	4
+L(updater3):
+	subf	r3, r20, r3	/* reduce len of r4 from r3  */
+	b	L(end)
+
+	.align	4
+L(ret_r3):
+	mr	r3, r18		/* return r3  */
+	b	L(end)
+
+	.align	4
+L(retnull):
+	li	r3, 0		/* return NULL  */
+
+	.align	4
+L(end):
+	addi	r1, r1, FRAMESIZE	/* restore stack pointer  */
+	ld	r0, 16(r1)	/* read the saved link register  */
+	ld	r18, -24(r1)	/* restore callers save register, r18  */
+	ld	r19, -16(r1)	/* restore callers save register, r19  */
+	ld	r20, -8(r1)	/* restore callers save register, r20  */
+	mtlr	r0		/* restore link register  */
+	blr
+END (strstr)
+libc_hidden_builtin_def (strstr)