diff mbox series

[v7,6/6] elf: Optimize _dl_new_hash in dl-new-hash.h

Message ID 20220510233029.637071-6-goldstein.w.n@gmail.com
State New
Series [v7,1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked

Commit Message

Noah Goldstein May 10, 2022, 11:30 p.m. UTC
Unroll slightly and enforce good instruction scheduling. This improves
performance on out-of-order machines. Note that the unrolling allows
for pipelined multiplies, which helps a bit, but most of the gain
comes from enforcing better instruction scheduling for more ILP.
Unrolling further started to induce slowdowns for sizes [0, 4],
but it can help the loop, so if larger sizes are the target, further
unrolling can be beneficial.

Results for _dl_new_hash
Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz

Time as Geometric Mean of N=30 runs
Geometric mean over all benchmarks, New / Old: 0.674
  type, length, New Time, Old Time, New Time / Old Time
 fixed,      0,    2.865,     2.72,               1.053
 fixed,      1,    3.567,    2.489,               1.433
 fixed,      2,    2.577,    3.649,               0.706
 fixed,      3,    3.644,    5.983,               0.609
 fixed,      4,    4.211,    6.833,               0.616
 fixed,      5,    4.741,    9.372,               0.506
 fixed,      6,    5.415,    9.561,               0.566
 fixed,      7,    6.649,   10.789,               0.616
 fixed,      8,    8.081,   11.808,               0.684
 fixed,      9,    8.427,   12.935,               0.651
 fixed,     10,    8.673,   14.134,               0.614
 fixed,     11,    10.69,   15.408,               0.694
 fixed,     12,   10.789,   16.982,               0.635
 fixed,     13,   12.169,   18.411,               0.661
 fixed,     14,   12.659,   19.914,               0.636
 fixed,     15,   13.526,   21.541,               0.628
 fixed,     16,   14.211,   23.088,               0.616
 fixed,     32,   29.412,   52.722,               0.558
 fixed,     64,    65.41,  142.351,               0.459
 fixed,    128,  138.505,  295.625,               0.469
 fixed,    256,  291.707,  601.983,               0.485
random,      2,   12.698,   12.849,               0.988
random,      4,   16.065,   15.857,               1.013
random,      8,   19.564,   21.105,               0.927
random,     16,   23.919,   26.823,               0.892
random,     32,   31.987,   39.591,               0.808
random,     64,   49.282,   71.487,               0.689
random,    128,    82.23,  145.364,               0.566
random,    256,  152.209,  298.434,                0.51

Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
---
 elf/dl-new-hash.h | 66 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 5 deletions(-)
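
The unrolled loop relies on empty asm statements with "+r" constraints to
pin the evaluation order. Below is a minimal standalone sketch of that
barrier technique in isolation (the hash_two helper is illustrative and not
part of the patch); it compiles with GCC or Clang:

#include <stdint.h>
#include <stdio.h>

/* Fold two bytes c0 and c1 into the running hash h, i.e. compute
   (h * 33 + c0) * 33 + c1, while keeping the multiply and the adds
   independent so they can issue in parallel.  */
static uint32_t
hash_two (uint32_t h, unsigned int c0, unsigned int c1)
{
  c1 += c0;
  /* The empty asm makes c1 and c0 opaque to the optimizer at this point,
     so it cannot reassociate the additions below back into one dependency
     chain behind the multiply.  */
  asm ("" : "+r" (c1), "+r" (c0));
  h *= 33 * 33;
  c1 += c0 * 32;
  asm ("" : "+r" (c1));
  return h + c1;
}

int
main (void)
{
  /* Hash the string "ab" starting from the usual 5381 seed.  */
  printf ("%u\n", hash_two (5381, 'a', 'b'));
  return 0;
}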

Comments

H.J. Lu May 10, 2022, 11:46 p.m. UTC | #1
On Tue, May 10, 2022 at 4:30 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Unroll slightly and enforce good instruction scheduling. This improves
> performance on out-of-order machines. Note that the unrolling allows
> for pipelined multiplies, which helps a bit, but most of the gain
> comes from enforcing better instruction scheduling for more ILP.
> Unrolling further started to induce slowdowns for sizes [0, 4],
> but it can help the loop, so if larger sizes are the target, further
> unrolling can be beneficial.
>
> Results for _dl_new_hash
> Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
>
> Time as Geometric Mean of N=30 runs
> Geometric mean over all benchmarks, New / Old: 0.674
>   type, length, New Time, Old Time, New Time / Old Time
>  fixed,      0,    2.865,     2.72,               1.053
>  fixed,      1,    3.567,    2.489,               1.433
>  fixed,      2,    2.577,    3.649,               0.706
>  fixed,      3,    3.644,    5.983,               0.609
>  fixed,      4,    4.211,    6.833,               0.616
>  fixed,      5,    4.741,    9.372,               0.506
>  fixed,      6,    5.415,    9.561,               0.566
>  fixed,      7,    6.649,   10.789,               0.616
>  fixed,      8,    8.081,   11.808,               0.684
>  fixed,      9,    8.427,   12.935,               0.651
>  fixed,     10,    8.673,   14.134,               0.614
>  fixed,     11,    10.69,   15.408,               0.694
>  fixed,     12,   10.789,   16.982,               0.635
>  fixed,     13,   12.169,   18.411,               0.661
>  fixed,     14,   12.659,   19.914,               0.636
>  fixed,     15,   13.526,   21.541,               0.628
>  fixed,     16,   14.211,   23.088,               0.616
>  fixed,     32,   29.412,   52.722,               0.558
>  fixed,     64,    65.41,  142.351,               0.459
>  fixed,    128,  138.505,  295.625,               0.469
>  fixed,    256,  291.707,  601.983,               0.485
> random,      2,   12.698,   12.849,               0.988
> random,      4,   16.065,   15.857,               1.013
> random,      8,   19.564,   21.105,               0.927
> random,     16,   23.919,   26.823,               0.892
> random,     32,   31.987,   39.591,               0.808
> random,     64,   49.282,   71.487,               0.689
> random,    128,    82.23,  145.364,               0.566
> random,    256,  152.209,  298.434,                0.51
>
> Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> ---
>  elf/dl-new-hash.h | 66 +++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 61 insertions(+), 5 deletions(-)
>
> diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> index 40d88c81f9..c6d96a0452 100644
> --- a/elf/dl-new-hash.h
> +++ b/elf/dl-new-hash.h
> @@ -20,15 +20,71 @@
>  #define _DL_NEW_HASH_H 1
>
>  #include <stdint.h>
> +/* For __glibc_unlikely.  */
> +#include <sys/cdefs.h>
> +
> +/* The simplest implementation of _dl_new_hash is:
> +
> +_dl_new_hash (const char *s)
> +{
> +   uint32_t h = 5381;
> +   for (unsigned char c = *s; c != '\0'; c = *++s)
> +     h = h * 33 + c;
> +   return h;
> +}
> +
> +   We can get better performance, however, by slightly unrolling the
> +   loop and explicitly specifying order of operations to prevent
> +   reassociation of instructions and ensure ideal scheduling.  */
>
>  static inline uint32_t
>  __attribute__ ((unused))
> -_dl_new_hash (const char *s)
> +_dl_new_hash (const char *str)
>  {
> -  uint32_t h = 5381;
> -  for (unsigned char c = *s; c != '\0'; c = *++s)
> -    h = h * 33 + c;
> -  return h;
> +  const unsigned char *s = (const unsigned char *) str;
> +  unsigned int h = 5381;
> +  unsigned int c0, c1;
> +  for (;;)
> +    {
> +      c0 = (unsigned int) *s;

I prefer

c0 = s[0];

There is no need for a cast since "s" is a pointer to unsigned char.

> +      /* Unlikely length zero string so evens will be slightly less
                                                              even
> +        common.  */
> +      if (__glibc_unlikely (c0 == 0))
> +       return h;
> +
> +      c1 = (unsigned int) *(s + 1);

c1 = s[1];

> +      if (c1 == 0)
> +       {
> +         c0 += h;
> +         /* Ideal instruction scheduling is:
> +        c0 += h;
> +        h *= 32;
> +        h += c0;
> +
> +        The asm statements are to prevent reassociation that would result in
> +        more instruction interdependencies and worse scheduling.  */
> +         asm("" : "+r"(h) : "r"(c0));
              asm (  << A space after asm.
> +         h = h * 32 + c0;
> +         return h;
> +       }
> +
> +      /* Ideal instruction scheduling is:
> +        c1 += c0;
> +        h *= 33 * 33;
> +        c0 *= 32;
> +        c1 += c0;
> +        h  += c1;
> +
> +        The asm statements are to prevent reassociation that would result in
> +        more instruction interdependencies and worse scheduling.  */
> +      c1 += c0;
> +      asm("" : "+r"(c1), "+r"(c0));
            asm (  << A space after asm.
> +      h *= 33 * 33;
> +      c1 += c0 * 32;
> +      asm("" : "+r"(c1));
           asm (  << A space after asm.
> +      h += c1;
> +      s += 2;
> +    }
>  }

This is faster on x86.  Is this also true on other targets?  Should it be moved
to sysdeps/generic so that other targets can provide a different version?

>
> --
> 2.34.1
>
Noah Goldstein May 11, 2022, 3:07 a.m. UTC | #2
On Tue, May 10, 2022 at 6:47 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, May 10, 2022 at 4:30 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Unroll slightly and enforce good instruction scheduling. This improves
> > performance on out-of-order machines. Note that the unrolling allows
> > for pipelined multiplies, which helps a bit, but most of the gain
> > comes from enforcing better instruction scheduling for more ILP.
> > Unrolling further started to induce slowdowns for sizes [0, 4],
> > but it can help the loop, so if larger sizes are the target, further
> > unrolling can be beneficial.
> >
> > Results for _dl_new_hash
> > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
> >
> > Time as Geometric Mean of N=30 runs
> > Geometric mean over all benchmarks, New / Old: 0.674
> >   type, length, New Time, Old Time, New Time / Old Time
> >  fixed,      0,    2.865,     2.72,               1.053
> >  fixed,      1,    3.567,    2.489,               1.433
> >  fixed,      2,    2.577,    3.649,               0.706
> >  fixed,      3,    3.644,    5.983,               0.609
> >  fixed,      4,    4.211,    6.833,               0.616
> >  fixed,      5,    4.741,    9.372,               0.506
> >  fixed,      6,    5.415,    9.561,               0.566
> >  fixed,      7,    6.649,   10.789,               0.616
> >  fixed,      8,    8.081,   11.808,               0.684
> >  fixed,      9,    8.427,   12.935,               0.651
> >  fixed,     10,    8.673,   14.134,               0.614
> >  fixed,     11,    10.69,   15.408,               0.694
> >  fixed,     12,   10.789,   16.982,               0.635
> >  fixed,     13,   12.169,   18.411,               0.661
> >  fixed,     14,   12.659,   19.914,               0.636
> >  fixed,     15,   13.526,   21.541,               0.628
> >  fixed,     16,   14.211,   23.088,               0.616
> >  fixed,     32,   29.412,   52.722,               0.558
> >  fixed,     64,    65.41,  142.351,               0.459
> >  fixed,    128,  138.505,  295.625,               0.469
> >  fixed,    256,  291.707,  601.983,               0.485
> > random,      2,   12.698,   12.849,               0.988
> > random,      4,   16.065,   15.857,               1.013
> > random,      8,   19.564,   21.105,               0.927
> > random,     16,   23.919,   26.823,               0.892
> > random,     32,   31.987,   39.591,               0.808
> > random,     64,   49.282,   71.487,               0.689
> > random,    128,    82.23,  145.364,               0.566
> > random,    256,  152.209,  298.434,                0.51
> >
> > Co-authored-by: Alexander Monakov <amonakov@ispras.ru>
> > ---
> >  elf/dl-new-hash.h | 66 +++++++++++++++++++++++++++++++++++++++++++----
> >  1 file changed, 61 insertions(+), 5 deletions(-)
> >
> > diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
> > index 40d88c81f9..c6d96a0452 100644
> > --- a/elf/dl-new-hash.h
> > +++ b/elf/dl-new-hash.h
> > @@ -20,15 +20,71 @@
> >  #define _DL_NEW_HASH_H 1
> >
> >  #include <stdint.h>
> > +/* For __glibc_unlikely.  */
> > +#include <sys/cdefs.h>
> > +
> > +/* The simplest implementation of _dl_new_hash is:
> > +
> > +_dl_new_hash (const char *s)
> > +{
> > +   uint32_t h = 5381;
> > +   for (unsigned char c = *s; c != '\0'; c = *++s)
> > +     h = h * 33 + c;
> > +   return h;
> > +}
> > +
> > +   We can get better performance, however, by slightly unrolling the
> > +   loop and explicitly specifying order of operations to prevent
> > +   reassociation of instructions and ensure ideal scheduling.  */
> >
> >  static inline uint32_t
> >  __attribute__ ((unused))
> > -_dl_new_hash (const char *s)
> > +_dl_new_hash (const char *str)
> >  {
> > -  uint32_t h = 5381;
> > -  for (unsigned char c = *s; c != '\0'; c = *++s)
> > -    h = h * 33 + c;
> > -  return h;
> > +  const unsigned char *s = (const unsigned char *) str;
> > +  unsigned int h = 5381;
> > +  unsigned int c0, c1;
> > +  for (;;)
> > +    {
> > +      c0 = (unsigned int) *s;
>
> I prefer
>
> c0 = s[0];
>
> There is no need for a cast since "s" is a pointer to unsigned char.

Fixed in V8.
>
> > +      /* Unlikely length zero string so evens will be slightly less
>                                                               even
> > +        common.  */
> > +      if (__glibc_unlikely (c0 == 0))
> > +       return h;
> > +
> > +      c1 = (unsigned int) *(s + 1);
>
> c1 = s[1];

Fixed in V8.
>
> > +      if (c1 == 0)
> > +       {
> > +         c0 += h;
> > +         /* Ideal instruction scheduling is:
> > +        c0 += h;
> > +        h *= 32;
> > +        h += c0;
> > +
> > +        The asm statements are to prevent reassociation that would result in
> > +        more instruction interdependencies and worse scheduling.  */
> > +         asm("" : "+r"(h) : "r"(c0));
>               asm (  << A space after asm.
> > +         h = h * 32 + c0;
> > +         return h;
> > +       }
> > +
> > +      /* Ideal instruction scheduling is:
> > +        c1 += c0;
> > +        h *= 33 * 33;
> > +        c0 *= 32;
> > +        c1 += c0;
> > +        h  += c1;
> > +
> > +        The asm statements are to prevent reassociation that would result in
> > +        more instruction interdependencies and worse scheduling.  */
> > +      c1 += c0;
> > +      asm("" : "+r"(c1), "+r"(c0));
>             asm (  << A space after asm.
> > +      h *= 33 * 33;
> > +      c1 += c0 * 32;
> > +      asm("" : "+r"(c1));
>            asm (  << A space after asm.
> > +      h += c1;
> > +      s += 2;
> > +    }
> >  }
>
> This is faster on x86.  Is this also true on other targets?  Should it be moved
> to sysdeps/generic so that other targets can provide a different version?

I haven't tested any other targets, but the optimizations are pretty generic, so
I would imagine similar results on other architectures.
>
> >
> > --
> > 2.34.1
> >
>
>
> --
> H.J.

Patch

diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h
index 40d88c81f9..c6d96a0452 100644
--- a/elf/dl-new-hash.h
+++ b/elf/dl-new-hash.h
@@ -20,15 +20,71 @@ 
 #define _DL_NEW_HASH_H 1
 
 #include <stdint.h>
+/* For __glibc_unlikely.  */
+#include <sys/cdefs.h>
+
+/* The simplest implementation of _dl_new_hash is:
+
+_dl_new_hash (const char *s)
+{
+   uint32_t h = 5381;
+   for (unsigned char c = *s; c != '\0'; c = *++s)
+     h = h * 33 + c;
+   return h;
+}
+
+   We can get better performance, however, by slightly unrolling the
+   loop and explicitly specifying order of operations to prevent
+   reassociation of instructions and ensure ideal scheduling.  */
 
 static inline uint32_t
 __attribute__ ((unused))
-_dl_new_hash (const char *s)
+_dl_new_hash (const char *str)
 {
-  uint32_t h = 5381;
-  for (unsigned char c = *s; c != '\0'; c = *++s)
-    h = h * 33 + c;
-  return h;
+  const unsigned char *s = (const unsigned char *) str;
+  unsigned int h = 5381;
+  unsigned int c0, c1;
+  for (;;)
+    {
+      c0 = (unsigned int) *s;
+      /* Unlikely length zero string so evens will be slightly less
+	 common.  */
+      if (__glibc_unlikely (c0 == 0))
+	return h;
+
+      c1 = (unsigned int) *(s + 1);
+      if (c1 == 0)
+	{
+	  c0 += h;
+	  /* Ideal instruction scheduling is:
+	 c0 += h;
+	 h *= 32;
+	 h += c0;
+
+	 The asm statements are to prevent reassociation that would result in
+	 more instruction interdependencies and worse scheduling.  */
+	  asm("" : "+r"(h) : "r"(c0));
+	  h = h * 32 + c0;
+	  return h;
+	}
+
+      /* Ideal instruction scheduling is:
+	 c1 += c0;
+	 h *= 33 * 33;
+	 c0 *= 32;
+	 c1 += c0;
+	 h  += c1;
+
+	 The asm statements are to prevent reassosiation that would result in
+	 more instruction interdependencies and worse scheduling.  */
+      c1 += c0;
+      asm("" : "+r"(c1), "+r"(c0));
+      h *= 33 * 33;
+      c1 += c0 * 32;
+      asm("" : "+r"(c1));
+      h += c1;
+      s += 2;
+    }
 }
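
Since the refactor must not change the computed hash values, a small
self-contained cross-check can be useful. The sketch below is not the test
added earlier in this series; the names ref_hash and unrolled_hash are
illustrative. It compares the simple reference loop against a
two-bytes-per-iteration variant like the one above, with the asm scheduling
barriers omitted since they do not affect the computed result:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* The straightforward h = h * 33 + c loop from the header comment.  */
static uint32_t
ref_hash (const char *s)
{
  uint32_t h = 5381;
  for (unsigned char c = *s; c != '\0'; c = *++s)
    h = h * 33 + c;
  return h;
}

/* Two characters per iteration; algebraically identical to ref_hash.  */
static uint32_t
unrolled_hash (const char *str)
{
  const unsigned char *s = (const unsigned char *) str;
  uint32_t h = 5381;
  for (;;)
    {
      unsigned int c0 = s[0];
      if (c0 == 0)
        return h;
      unsigned int c1 = s[1];
      if (c1 == 0)
        return h * 33 + c0;
      /* (h * 33 + c0) * 33 + c1, expanded.  */
      h = h * (33 * 33) + c0 * 33 + c1;
      s += 2;
    }
}

int
main (void)
{
  const char *tests[] = { "", "a", "ab", "abc", "_dl_new_hash", "printf" };
  for (unsigned int i = 0; i < sizeof tests / sizeof tests[0]; i++)
    assert (ref_hash (tests[i]) == unrolled_hash (tests[i]));
  puts ("ok");
  return 0;
}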