Message ID | 20220510233029.637071-6-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v7,1/6] elf: Refactor dl_new_hash so it can be tested / benchmarked | expand |
On Tue, May 10, 2022 at 4:30 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Unroll slightly and enforce good instruction scheduling. This improves > performance on out-of-order machines. Note the unrolling allows > for pipelined multiplies which helps a bit, but most of the gain > is from enforcing better instruction scheduling for more ILP. > Unrolling further started to induce slowdowns for sizes [0, 4] > but can help the loop so if larger sizes are the target further > unrolling can be beneficial. > > Results for _dl_new_hash > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz > > Time as Geometric Mean of N=30 runs > Geometric of all benchmark New / Old: 0.674 > type, length, New Time, Old Time, New Time / Old Time > fixed, 0, 2.865, 2.72, 1.053 > fixed, 1, 3.567, 2.489, 1.433 > fixed, 2, 2.577, 3.649, 0.706 > fixed, 3, 3.644, 5.983, 0.609 > fixed, 4, 4.211, 6.833, 0.616 > fixed, 5, 4.741, 9.372, 0.506 > fixed, 6, 5.415, 9.561, 0.566 > fixed, 7, 6.649, 10.789, 0.616 > fixed, 8, 8.081, 11.808, 0.684 > fixed, 9, 8.427, 12.935, 0.651 > fixed, 10, 8.673, 14.134, 0.614 > fixed, 11, 10.69, 15.408, 0.694 > fixed, 12, 10.789, 16.982, 0.635 > fixed, 13, 12.169, 18.411, 0.661 > fixed, 14, 12.659, 19.914, 0.636 > fixed, 15, 13.526, 21.541, 0.628 > fixed, 16, 14.211, 23.088, 0.616 > fixed, 32, 29.412, 52.722, 0.558 > fixed, 64, 65.41, 142.351, 0.459 > fixed, 128, 138.505, 295.625, 0.469 > fixed, 256, 291.707, 601.983, 0.485 > random, 2, 12.698, 12.849, 0.988 > random, 4, 16.065, 15.857, 1.013 > random, 8, 19.564, 21.105, 0.927 > random, 16, 23.919, 26.823, 0.892 > random, 32, 31.987, 39.591, 0.808 > random, 64, 49.282, 71.487, 0.689 > random, 128, 82.23, 145.364, 0.566 > random, 256, 152.209, 298.434, 0.51 > > Co-authored-by: Alexander Monakov <amonakov@ispras.ru> > --- > elf/dl-new-hash.h | 66 +++++++++++++++++++++++++++++++++++++++++++---- > 1 file changed, 61 insertions(+), 5 deletions(-) > > diff --git a/elf/dl-new-hash.h 
b/elf/dl-new-hash.h > index 40d88c81f9..c6d96a0452 100644 > --- a/elf/dl-new-hash.h > +++ b/elf/dl-new-hash.h > @@ -20,15 +20,71 @@ > #define _DL_NEW_HASH_H 1 > > #include <stdint.h> > +/* For __glibc_unlikely. */ > +#include <sys/cdefs.h> > + > +/* The simplest implementation of _dl_new_hash is: > + > +_dl_new_hash (const char *s) > +{ > + uint32_t h = 5381; > + for (unsigned char c = *s; c != '\0'; c = *++s) > + h = h * 33 + c; > + return h; > +} > + > + We can get better performance, however, by slightly unrolling the > + loop and explicitly specifying order of operations to prevent > + reassosiation of instructions and ensure ideal scheduling. */ > > static inline uint32_t > __attribute__ ((unused)) > -_dl_new_hash (const char *s) > +_dl_new_hash (const char *str) > { > - uint32_t h = 5381; > - for (unsigned char c = *s; c != '\0'; c = *++s) > - h = h * 33 + c; > - return h; > + const unsigned char *s = (const unsigned char *) str; > + unsigned int h = 5381; > + unsigned int c0, c1; > + for (;;) > + { > + c0 = (unsigned int) *s; I prefer c0 = s[0]; There is no need for case since "s" is a pointer to unsigned char. > + /* Unlikely length zero string so evens will be slightly less even > + common. */ > + if (__glibc_unlikely (c0 == 0)) > + return h; > + > + c1 = (unsigned int) *(s + 1); c1 = s[1]; > + if (c1 == 0) > + { > + c0 += h; > + /* Ideal instruction scheduling is: > + c0 += h; > + h *= 32; > + h += c0; > + > + The asm statements are to prevent reassosiation that would result in > + more instruction interdependencies and worse scheduling. */ > + asm("" : "+r"(h) : "r"(c0)); asm ( << A space after asm. > + h = h * 32 + c0; > + return h; > + } > + > + /* Ideal instruction scheduling is: > + c1 += c0; > + h *= 33 * 33; > + c0 *= 32; > + c1 += c0; > + h += c1; > + > + The asm statements are to prevent reassosiation that would result in > + more instruction interdependencies and worse scheduling. 
*/ > + c1 += c0; > + asm("" : "+r"(c1), "+r"(c0)); asm ( << A space after asm. > + h *= 33 * 33; > + c1 += c0 * 32; > + asm("" : "+r"(c1)); asm ( << A space after asm. > + h += c1; > + s += 2; > + } > } This is faster on x86. Is this also true on other targets? Should it be moved to sysdeps/generic so that other targets can provide a different version? > > -- > 2.34.1 >
On Tue, May 10, 2022 at 6:47 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Tue, May 10, 2022 at 4:30 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > Unroll slightly and enforce good instruction scheduling. This improves > > performance on out-of-order machines. Note the unrolling allows > > for pipelined multiplies which helps a bit, but most of the gain > > is from enforcing better instruction scheduling for more ILP. > > Unrolling further started to induce slowdowns for sizes [0, 4] > > but can help the loop so if larger sizes are the target further > > unrolling can be beneficial. > > > > Results for _dl_new_hash > > Benchmarked on Tigerlake: 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz > > > > Time as Geometric Mean of N=30 runs > > Geometric of all benchmark New / Old: 0.674 > > type, length, New Time, Old Time, New Time / Old Time > > fixed, 0, 2.865, 2.72, 1.053 > > fixed, 1, 3.567, 2.489, 1.433 > > fixed, 2, 2.577, 3.649, 0.706 > > fixed, 3, 3.644, 5.983, 0.609 > > fixed, 4, 4.211, 6.833, 0.616 > > fixed, 5, 4.741, 9.372, 0.506 > > fixed, 6, 5.415, 9.561, 0.566 > > fixed, 7, 6.649, 10.789, 0.616 > > fixed, 8, 8.081, 11.808, 0.684 > > fixed, 9, 8.427, 12.935, 0.651 > > fixed, 10, 8.673, 14.134, 0.614 > > fixed, 11, 10.69, 15.408, 0.694 > > fixed, 12, 10.789, 16.982, 0.635 > > fixed, 13, 12.169, 18.411, 0.661 > > fixed, 14, 12.659, 19.914, 0.636 > > fixed, 15, 13.526, 21.541, 0.628 > > fixed, 16, 14.211, 23.088, 0.616 > > fixed, 32, 29.412, 52.722, 0.558 > > fixed, 64, 65.41, 142.351, 0.459 > > fixed, 128, 138.505, 295.625, 0.469 > > fixed, 256, 291.707, 601.983, 0.485 > > random, 2, 12.698, 12.849, 0.988 > > random, 4, 16.065, 15.857, 1.013 > > random, 8, 19.564, 21.105, 0.927 > > random, 16, 23.919, 26.823, 0.892 > > random, 32, 31.987, 39.591, 0.808 > > random, 64, 49.282, 71.487, 0.689 > > random, 128, 82.23, 145.364, 0.566 > > random, 256, 152.209, 298.434, 0.51 > > > > Co-authored-by: Alexander Monakov <amonakov@ispras.ru> > > --- > > 
elf/dl-new-hash.h | 66 +++++++++++++++++++++++++++++++++++++++++++---- > > 1 file changed, 61 insertions(+), 5 deletions(-) > > > > diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h > > index 40d88c81f9..c6d96a0452 100644 > > --- a/elf/dl-new-hash.h > > +++ b/elf/dl-new-hash.h > > @@ -20,15 +20,71 @@ > > #define _DL_NEW_HASH_H 1 > > > > #include <stdint.h> > > +/* For __glibc_unlikely. */ > > +#include <sys/cdefs.h> > > + > > +/* The simplest implementation of _dl_new_hash is: > > + > > +_dl_new_hash (const char *s) > > +{ > > + uint32_t h = 5381; > > + for (unsigned char c = *s; c != '\0'; c = *++s) > > + h = h * 33 + c; > > + return h; > > +} > > + > > + We can get better performance, however, by slightly unrolling the > > + loop and explicitly specifying order of operations to prevent > > + reassosiation of instructions and ensure ideal scheduling. */ > > > > static inline uint32_t > > __attribute__ ((unused)) > > -_dl_new_hash (const char *s) > > +_dl_new_hash (const char *str) > > { > > - uint32_t h = 5381; > > - for (unsigned char c = *s; c != '\0'; c = *++s) > > - h = h * 33 + c; > > - return h; > > + const unsigned char *s = (const unsigned char *) str; > > + unsigned int h = 5381; > > + unsigned int c0, c1; > > + for (;;) > > + { > > + c0 = (unsigned int) *s; > > I prefer > > c0 = s[0]; > > There is no need for case since "s" is a pointer to unsigned char. Fixed in V8. > > > + /* Unlikely length zero string so evens will be slightly less > even > > + common. */ > > + if (__glibc_unlikely (c0 == 0)) > > + return h; > > + > > + c1 = (unsigned int) *(s + 1); > > c1 = s[1]; Fixed in V8. > > > + if (c1 == 0) > > + { > > + c0 += h; > > + /* Ideal instruction scheduling is: > > + c0 += h; > > + h *= 32; > > + h += c0; > > + > > + The asm statements are to prevent reassosiation that would result in > > + more instruction interdependencies and worse scheduling. */ > > + asm("" : "+r"(h) : "r"(c0)); > asm ( << A space after asm. 
> > + h = h * 32 + c0; > > + return h; > > + } > > + > > + /* Ideal instruction scheduling is: > > + c1 += c0; > > + h *= 33 * 33; > > + c0 *= 32; > > + c1 += c0; > > + h += c1; > > + > > + The asm statements are to prevent reassosiation that would result in > > + more instruction interdependencies and worse scheduling. */ > > + c1 += c0; > > + asm("" : "+r"(c1), "+r"(c0)); > asm ( << A space after asm. > > + h *= 33 * 33; > > + c1 += c0 * 32; > > + asm("" : "+r"(c1)); > asm ( << A space after asm. > > + h += c1; > > + s += 2; > > + } > > } > > This is faster on x86. Is this also true on other targets? Should it be moved > to sysdeps/generic so that other targets can provide a different version? I haven't tested any other targets but the optimizations are pretty generic so would imagine similar results on other architectures > > > > > -- > > 2.34.1 > > > > > -- > H.J.
diff --git a/elf/dl-new-hash.h b/elf/dl-new-hash.h index 40d88c81f9..c6d96a0452 100644 --- a/elf/dl-new-hash.h +++ b/elf/dl-new-hash.h @@ -20,15 +20,71 @@ #define _DL_NEW_HASH_H 1 #include <stdint.h> +/* For __glibc_unlikely. */ +#include <sys/cdefs.h> + +/* The simplest implementation of _dl_new_hash is: + +_dl_new_hash (const char *s) +{ + uint32_t h = 5381; + for (unsigned char c = *s; c != '\0'; c = *++s) + h = h * 33 + c; + return h; +} + + We can get better performance, however, by slightly unrolling the + loop and explicitly specifying order of operations to prevent + reassociation of instructions and ensure ideal scheduling. */ static inline uint32_t __attribute__ ((unused)) -_dl_new_hash (const char *s) +_dl_new_hash (const char *str) { - uint32_t h = 5381; - for (unsigned char c = *s; c != '\0'; c = *++s) - h = h * 33 + c; - return h; + const unsigned char *s = (const unsigned char *) str; + unsigned int h = 5381; + unsigned int c0, c1; + for (;;) + { + c0 = (unsigned int) *s; + /* Unlikely length zero string so evens will be slightly less + common. */ + if (__glibc_unlikely (c0 == 0)) + return h; + + c1 = (unsigned int) *(s + 1); + if (c1 == 0) + { + c0 += h; + /* Ideal instruction scheduling is: + c0 += h; + h *= 32; + h += c0; + + The asm statements are to prevent reassociation that would result in + more instruction interdependencies and worse scheduling. */ + asm("" : "+r"(h) : "r"(c0)); + h = h * 32 + c0; + return h; + } + + /* Ideal instruction scheduling is: + c1 += c0; + h *= 33 * 33; + c0 *= 32; + c1 += c0; + h += c1; + + The asm statements are to prevent reassociation that would result in + more instruction interdependencies and worse scheduling. */ + c1 += c0; + asm("" : "+r"(c1), "+r"(c0)); + h *= 33 * 33; + c1 += c0 * 32; + asm("" : "+r"(c1)); + h += c1; + s += 2; + } }