Message ID | 20230207001618.458947-19-christoph.muellner@vrull.eu |
---|---|
State | New |
Headers | show |
Series | riscv: ifunc support with optimized mem*/str*/cpu_relax routines | expand |
On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner <christoph.muellner@vrull.eu> wrote: > > From: Christoph Müllner <christoph.muellner@vrull.eu> > > The implementation of strncmp() can be accelerated using Zbb's orc.b > instruction. Let's add an optimized implementation that makes use > of this instruction. > > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu> Not necessary, but imo performance patches should have at least some reference to the expected speedup versus the existing alternatives. > --- > sysdeps/riscv/multiarch/Makefile | 3 +- > sysdeps/riscv/multiarch/ifunc-impl-list.c | 1 + > sysdeps/riscv/multiarch/strncmp.c | 6 +- > sysdeps/riscv/multiarch/strncmp_zbb.S | 119 ++++++++++++++++++++++ > 4 files changed, 127 insertions(+), 2 deletions(-) > create mode 100644 sysdeps/riscv/multiarch/strncmp_zbb.S > > diff --git a/sysdeps/riscv/multiarch/Makefile b/sysdeps/riscv/multiarch/Makefile > index 056ce2ffc0..9f22e31b99 100644 > --- a/sysdeps/riscv/multiarch/Makefile > +++ b/sysdeps/riscv/multiarch/Makefile > @@ -14,5 +14,6 @@ sysdep_routines += \ > strcmp_generic \ > strcmp_zbb \ > strcmp_zbb_unaligned \ > - strncmp_generic > + strncmp_generic \ > + strncmp_zbb > endif > diff --git a/sysdeps/riscv/multiarch/ifunc-impl-list.c b/sysdeps/riscv/multiarch/ifunc-impl-list.c > index eb37ed6017..82fd34d010 100644 > --- a/sysdeps/riscv/multiarch/ifunc-impl-list.c > +++ b/sysdeps/riscv/multiarch/ifunc-impl-list.c > @@ -64,6 +64,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_generic)) > > IFUNC_IMPL (i, name, strncmp, > + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_zbb) > IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_generic)) > return i; > } > diff --git a/sysdeps/riscv/multiarch/strncmp.c b/sysdeps/riscv/multiarch/strncmp.c > index 970aeb8b85..5b0fe08e98 100644 > --- a/sysdeps/riscv/multiarch/strncmp.c > +++ b/sysdeps/riscv/multiarch/strncmp.c > @@ -30,8 +30,12 @@ > > 
extern __typeof (__redirect_strncmp) __libc_strncmp; > extern __typeof (__redirect_strncmp) __strncmp_generic attribute_hidden; > +extern __typeof (__redirect_strncmp) __strncmp_zbb attribute_hidden; > > -libc_ifunc (__libc_strncmp, __strncmp_generic); > +libc_ifunc (__libc_strncmp, > + HAVE_RV(zbb) > + ? __strncmp_zbb > + : __strncmp_generic); > > # undef strncmp > strong_alias (__libc_strncmp, strncmp); > diff --git a/sysdeps/riscv/multiarch/strncmp_zbb.S b/sysdeps/riscv/multiarch/strncmp_zbb.S > new file mode 100644 > index 0000000000..29cff30def > --- /dev/null > +++ b/sysdeps/riscv/multiarch/strncmp_zbb.S > @@ -0,0 +1,119 @@ > +/* Copyright (C) 2022 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > +#include <sys/asm.h> > + > +/* Assumptions: rvi_zbb. 
*/ > + > +#define src1 a0 > +#define result a0 > +#define src2 a1 > +#define len a2 > +#define data1 a2 > +#define data2 a3 > +#define align a4 > +#define data1_orcb t0 > +#define limit t1 > +#define fast_limit t2 > +#define m1 t3 > + > +#if __riscv_xlen == 64 > +# define REG_L ld > +# define SZREG 8 > +# define PTRLOG 3 > +#else > +# define REG_L lw > +# define SZREG 4 > +# define PTRLOG 2 > +#endif > + > +#ifndef STRNCMP > +# define STRNCMP __strncmp_zbb > +#endif > + > +.option push > +.option arch,+zbb > + > +ENTRY_ALIGN (STRNCMP, 6) > + beqz len, L(equal) > + or align, src1, src2 > + and align, align, SZREG-1 > + add limit, src1, len > + bnez align, L(simpleloop) > + li m1, -1 > + > + /* Adjust limit for fast-path. */ > + andi fast_limit, limit, -SZREG > + > + /* Main loop for aligned string. */ > + .p2align 3 > +L(loop): > + bge src1, fast_limit, L(simpleloop) > + REG_L data1, 0(src1) > + REG_L data2, 0(src2) > + orc.b data1_orcb, data1 > + bne data1_orcb, m1, L(foundnull) > + addi src1, src1, SZREG > + addi src2, src2, SZREG > + beq data1, data2, L(loop) > + > + /* Words don't match, and no null byte in the first > + * word. Get bytes in big-endian order and compare. */ > +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ > + rev8 data1, data1 > + rev8 data2, data2 > +#endif > + /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */ > + sltu result, data1, data2 > + neg result, result > + ori result, result, 1 > + ret > + > +L(foundnull): > + /* Found a null byte. > + * If words don't match, fall back to simple loop. */ > + bne data1, data2, L(simpleloop) > + > + /* Otherwise, strings are equal. */ > + li result, 0 > + ret > + > + /* Simple loop for misaligned strings. 
*/ > + .p2align 3 > +L(simpleloop): > + bge src1, limit, L(equal) > + lbu data1, 0(src1) > + addi src1, src1, 1 > + lbu data2, 0(src2) > + addi src2, src2, 1 > + bne data1, data2, L(sub) > + bnez data1, L(simpleloop) > + > +L(sub): > + sub result, data1, data2 > + ret > + > +L(equal): > + li result, 0 > + ret > + > +.option pop > + > +END (STRNCMP) > +libc_hidden_builtin_def (STRNCMP) > -- > 2.39.1 >
On Tue, 7 Feb 2023 at 02:20, Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner > <christoph.muellner@vrull.eu> wrote: > > > > From: Christoph Müllner <christoph.muellner@vrull.eu> > > > > The implementation of strncmp() can be accelerated using Zbb's orc.b > > instruction. Let's add an optimized implementation that makes use > > of this instruction. > > > > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu> > > Not necessary, but imo performance patches should have at least some reference > to the expected speedup versus the existing alternatives. Given that this is effectively a SWAR-like optimization (orc.b allows us to test 8 bytes in parallel for a NUL byte), we should be able to show the benefit through a reduction in dynamic instructions. Would this be considered reasonable reference data? > > --- > > sysdeps/riscv/multiarch/Makefile | 3 +- > > sysdeps/riscv/multiarch/ifunc-impl-list.c | 1 + > > sysdeps/riscv/multiarch/strncmp.c | 6 +- > > sysdeps/riscv/multiarch/strncmp_zbb.S | 119 ++++++++++++++++++++++ > > 4 files changed, 127 insertions(+), 2 deletions(-) > > create mode 100644 sysdeps/riscv/multiarch/strncmp_zbb.S > > > > diff --git a/sysdeps/riscv/multiarch/Makefile b/sysdeps/riscv/multiarch/Makefile > > index 056ce2ffc0..9f22e31b99 100644 > > --- a/sysdeps/riscv/multiarch/Makefile > > +++ b/sysdeps/riscv/multiarch/Makefile > > @@ -14,5 +14,6 @@ sysdep_routines += \ > > strcmp_generic \ > > strcmp_zbb \ > > strcmp_zbb_unaligned \ > > - strncmp_generic > > + strncmp_generic \ > > + strncmp_zbb > > endif > > diff --git a/sysdeps/riscv/multiarch/ifunc-impl-list.c b/sysdeps/riscv/multiarch/ifunc-impl-list.c > > index eb37ed6017..82fd34d010 100644 > > --- a/sysdeps/riscv/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/riscv/multiarch/ifunc-impl-list.c > > @@ -64,6 +64,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > IFUNC_IMPL_ADD (array, i, strcmp, 1, 
__strcmp_generic)) > > > > IFUNC_IMPL (i, name, strncmp, > > + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_zbb) > > IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_generic)) > > return i; > > } > > diff --git a/sysdeps/riscv/multiarch/strncmp.c b/sysdeps/riscv/multiarch/strncmp.c > > index 970aeb8b85..5b0fe08e98 100644 > > --- a/sysdeps/riscv/multiarch/strncmp.c > > +++ b/sysdeps/riscv/multiarch/strncmp.c > > @@ -30,8 +30,12 @@ > > > > extern __typeof (__redirect_strncmp) __libc_strncmp; > > extern __typeof (__redirect_strncmp) __strncmp_generic attribute_hidden; > > +extern __typeof (__redirect_strncmp) __strncmp_zbb attribute_hidden; > > > > -libc_ifunc (__libc_strncmp, __strncmp_generic); > > +libc_ifunc (__libc_strncmp, > > + HAVE_RV(zbb) > > + ? __strncmp_zbb > > + : __strncmp_generic); > > > > # undef strncmp > > strong_alias (__libc_strncmp, strncmp); > > diff --git a/sysdeps/riscv/multiarch/strncmp_zbb.S b/sysdeps/riscv/multiarch/strncmp_zbb.S > > new file mode 100644 > > index 0000000000..29cff30def > > --- /dev/null > > +++ b/sysdeps/riscv/multiarch/strncmp_zbb.S > > @@ -0,0 +1,119 @@ > > +/* Copyright (C) 2022 Free Software Foundation, Inc. > > + > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library. If not, see > > + <https://www.gnu.org/licenses/>. 
*/ > > + > > +#include <sysdep.h> > > +#include <sys/asm.h> > > + > > +/* Assumptions: rvi_zbb. */ > > + > > +#define src1 a0 > > +#define result a0 > > +#define src2 a1 > > +#define len a2 > > +#define data1 a2 > > +#define data2 a3 > > +#define align a4 > > +#define data1_orcb t0 > > +#define limit t1 > > +#define fast_limit t2 > > +#define m1 t3 > > + > > +#if __riscv_xlen == 64 > > +# define REG_L ld > > +# define SZREG 8 > > +# define PTRLOG 3 > > +#else > > +# define REG_L lw > > +# define SZREG 4 > > +# define PTRLOG 2 > > +#endif > > + > > +#ifndef STRNCMP > > +# define STRNCMP __strncmp_zbb > > +#endif > > + > > +.option push > > +.option arch,+zbb > > + > > +ENTRY_ALIGN (STRNCMP, 6) > > + beqz len, L(equal) > > + or align, src1, src2 > > + and align, align, SZREG-1 > > + add limit, src1, len > > + bnez align, L(simpleloop) > > + li m1, -1 > > + > > + /* Adjust limit for fast-path. */ > > + andi fast_limit, limit, -SZREG > > + > > + /* Main loop for aligned string. */ > > + .p2align 3 > > +L(loop): > > + bge src1, fast_limit, L(simpleloop) > > + REG_L data1, 0(src1) > > + REG_L data2, 0(src2) > > + orc.b data1_orcb, data1 > > + bne data1_orcb, m1, L(foundnull) > > + addi src1, src1, SZREG > > + addi src2, src2, SZREG > > + beq data1, data2, L(loop) > > + > > + /* Words don't match, and no null byte in the first > > + * word. Get bytes in big-endian order and compare. */ > > +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ > > + rev8 data1, data1 > > + rev8 data2, data2 > > +#endif > > + /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */ > > + sltu result, data1, data2 > > + neg result, result > > + ori result, result, 1 > > + ret > > + > > +L(foundnull): > > + /* Found a null byte. > > + * If words don't match, fall back to simple loop. */ > > + bne data1, data2, L(simpleloop) > > + > > + /* Otherwise, strings are equal. */ > > + li result, 0 > > + ret > > + > > + /* Simple loop for misaligned strings. 
*/ > > + .p2align 3 > > +L(simpleloop): > > + bge src1, limit, L(equal) > > + lbu data1, 0(src1) > > + addi src1, src1, 1 > > + lbu data2, 0(src2) > > + addi src2, src2, 1 > > + bne data1, data2, L(sub) > > + bnez data1, L(simpleloop) > > + > > +L(sub): > > + sub result, data1, data2 > > + ret > > + > > +L(equal): > > + li result, 0 > > + ret > > + > > +.option pop > > + > > +END (STRNCMP) > > +libc_hidden_builtin_def (STRNCMP) > > -- > > 2.39.1 > >
On Wed, 08 Feb 2023 07:13:44 PST (-0800), philipp.tomsich@vrull.eu wrote: > On Tue, 7 Feb 2023 at 02:20, Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner >> <christoph.muellner@vrull.eu> wrote: >> > >> > From: Christoph Müllner <christoph.muellner@vrull.eu> >> > >> > The implementation of strncmp() can be accelerated using Zbb's orc.b >> > instruction. Let's add an optimized implementation that makes use >> > of this instruction. >> > >> > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu> >> >> Not necessary, but imo performance patches should have at least some reference >> to the expected speedup versus the existing alternatives. > > Given that this is effectively a SWAR-like optimization (orc.b allows > us to test 8 bytes in parallel for a NUL byte), we should be able to > show the benefit through a reduction in dynamic instructions. Would > this be considered reasonable reference data? Generally for performance improvements the only metrics that count come from real hardware. Processor implementation is complex and it's not generally true that reducing dynamic instructions results in better performance (particularly when more complex flavors of instructions replace simpler ones). We've not been so good about this on the RISC-V side of things, though. I think that's largely because we didn't have all that much complexity around this, but there's a ton of stuff showing up right now. The general theory has been that Zbb instructions will execute faster than their corresponding I sequences, but nobody has proved that. I believe the new JH7110 has Zba and Zbb, so maybe the right answer there is to just benchmark things before merging them? That way we can get back to doing things sanely before we go too far down the premature optimization rabbit hole. 
FWIW: we had a pretty similar discussion in Linux land around these and nobody could get the JH7110 to boot, but given that we have ~6 months until glibc releases again hopefully that will be sorted out. There's a bunch of ongoing work looking at the more core issues like probing, so maybe it's best to focus on getting that all sorted out first? It's kind of awkward to have a bunch of routines posted in a whole new framework that's not sorting out all the probing dependencies. >> > --- >> > sysdeps/riscv/multiarch/Makefile | 3 +- >> > sysdeps/riscv/multiarch/ifunc-impl-list.c | 1 + >> > sysdeps/riscv/multiarch/strncmp.c | 6 +- >> > sysdeps/riscv/multiarch/strncmp_zbb.S | 119 ++++++++++++++++++++++ >> > 4 files changed, 127 insertions(+), 2 deletions(-) >> > create mode 100644 sysdeps/riscv/multiarch/strncmp_zbb.S >> > >> > diff --git a/sysdeps/riscv/multiarch/Makefile b/sysdeps/riscv/multiarch/Makefile >> > index 056ce2ffc0..9f22e31b99 100644 >> > --- a/sysdeps/riscv/multiarch/Makefile >> > +++ b/sysdeps/riscv/multiarch/Makefile >> > @@ -14,5 +14,6 @@ sysdep_routines += \ >> > strcmp_generic \ >> > strcmp_zbb \ >> > strcmp_zbb_unaligned \ >> > - strncmp_generic >> > + strncmp_generic \ >> > + strncmp_zbb >> > endif >> > diff --git a/sysdeps/riscv/multiarch/ifunc-impl-list.c b/sysdeps/riscv/multiarch/ifunc-impl-list.c >> > index eb37ed6017..82fd34d010 100644 >> > --- a/sysdeps/riscv/multiarch/ifunc-impl-list.c >> > +++ b/sysdeps/riscv/multiarch/ifunc-impl-list.c >> > @@ -64,6 +64,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_generic)) >> > >> > IFUNC_IMPL (i, name, strncmp, >> > + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_zbb) >> > IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_generic)) >> > return i; >> > } >> > diff --git a/sysdeps/riscv/multiarch/strncmp.c b/sysdeps/riscv/multiarch/strncmp.c >> > index 970aeb8b85..5b0fe08e98 100644 >> > --- 
a/sysdeps/riscv/multiarch/strncmp.c >> > +++ b/sysdeps/riscv/multiarch/strncmp.c >> > @@ -30,8 +30,12 @@ >> > >> > extern __typeof (__redirect_strncmp) __libc_strncmp; >> > extern __typeof (__redirect_strncmp) __strncmp_generic attribute_hidden; >> > +extern __typeof (__redirect_strncmp) __strncmp_zbb attribute_hidden; >> > >> > -libc_ifunc (__libc_strncmp, __strncmp_generic); >> > +libc_ifunc (__libc_strncmp, >> > + HAVE_RV(zbb) >> > + ? __strncmp_zbb >> > + : __strncmp_generic); >> > >> > # undef strncmp >> > strong_alias (__libc_strncmp, strncmp); >> > diff --git a/sysdeps/riscv/multiarch/strncmp_zbb.S b/sysdeps/riscv/multiarch/strncmp_zbb.S >> > new file mode 100644 >> > index 0000000000..29cff30def >> > --- /dev/null >> > +++ b/sysdeps/riscv/multiarch/strncmp_zbb.S >> > @@ -0,0 +1,119 @@ >> > +/* Copyright (C) 2022 Free Software Foundation, Inc. >> > + >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library. If not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > +#include <sysdep.h> >> > +#include <sys/asm.h> >> > + >> > +/* Assumptions: rvi_zbb. 
*/ >> > + >> > +#define src1 a0 >> > +#define result a0 >> > +#define src2 a1 >> > +#define len a2 >> > +#define data1 a2 >> > +#define data2 a3 >> > +#define align a4 >> > +#define data1_orcb t0 >> > +#define limit t1 >> > +#define fast_limit t2 >> > +#define m1 t3 >> > + >> > +#if __riscv_xlen == 64 >> > +# define REG_L ld >> > +# define SZREG 8 >> > +# define PTRLOG 3 >> > +#else >> > +# define REG_L lw >> > +# define SZREG 4 >> > +# define PTRLOG 2 >> > +#endif >> > + >> > +#ifndef STRNCMP >> > +# define STRNCMP __strncmp_zbb >> > +#endif >> > + >> > +.option push >> > +.option arch,+zbb >> > + >> > +ENTRY_ALIGN (STRNCMP, 6) >> > + beqz len, L(equal) >> > + or align, src1, src2 >> > + and align, align, SZREG-1 >> > + add limit, src1, len >> > + bnez align, L(simpleloop) >> > + li m1, -1 >> > + >> > + /* Adjust limit for fast-path. */ >> > + andi fast_limit, limit, -SZREG >> > + >> > + /* Main loop for aligned string. */ >> > + .p2align 3 >> > +L(loop): >> > + bge src1, fast_limit, L(simpleloop) >> > + REG_L data1, 0(src1) >> > + REG_L data2, 0(src2) >> > + orc.b data1_orcb, data1 >> > + bne data1_orcb, m1, L(foundnull) >> > + addi src1, src1, SZREG >> > + addi src2, src2, SZREG >> > + beq data1, data2, L(loop) >> > + >> > + /* Words don't match, and no null byte in the first >> > + * word. Get bytes in big-endian order and compare. */ >> > +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ >> > + rev8 data1, data1 >> > + rev8 data2, data2 >> > +#endif >> > + /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */ >> > + sltu result, data1, data2 >> > + neg result, result >> > + ori result, result, 1 >> > + ret >> > + >> > +L(foundnull): >> > + /* Found a null byte. >> > + * If words don't match, fall back to simple loop. */ >> > + bne data1, data2, L(simpleloop) >> > + >> > + /* Otherwise, strings are equal. */ >> > + li result, 0 >> > + ret >> > + >> > + /* Simple loop for misaligned strings. 
*/ >> > + .p2align 3 >> > +L(simpleloop): >> > + bge src1, limit, L(equal) >> > + lbu data1, 0(src1) >> > + addi src1, src1, 1 >> > + lbu data2, 0(src2) >> > + addi src2, src2, 1 >> > + bne data1, data2, L(sub) >> > + bnez data1, L(simpleloop) >> > + >> > +L(sub): >> > + sub result, data1, data2 >> > + ret >> > + >> > +L(equal): >> > + li result, 0 >> > + ret >> > + >> > +.option pop >> > + >> > +END (STRNCMP) >> > +libc_hidden_builtin_def (STRNCMP) >> > -- >> > 2.39.1 >> >
On Wed, Feb 8, 2023 at 9:13 AM Philipp Tomsich <philipp.tomsich@vrull.eu> wrote: > > On Tue, 7 Feb 2023 at 02:20, Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner > > <christoph.muellner@vrull.eu> wrote: > > > > > > From: Christoph Müllner <christoph.muellner@vrull.eu> > > > > > > The implementation of strncmp() can be accelerated using Zbb's orc.b > > > instruction. Let's add an optimized implementation that makes use > > > of this instruction. > > > > > > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu> > > > > Not necessary, but imo performance patches should have at least some reference > > to the expected speedup versus the existing alternatives. > > Given that this is effectively a SWAR-like optimization (orc.b allows > us to test 8 bytes in parallel for a NUL byte), we should be able to > show the benefit through a reduction in dynamic instructions. Would > this be considered reasonable reference data? > GLIBC has a benchmark suite for all the string/memory functions so would expect improvement in those results compared to the generic implementations. 
> > > --- > > > sysdeps/riscv/multiarch/Makefile | 3 +- > > > sysdeps/riscv/multiarch/ifunc-impl-list.c | 1 + > > > sysdeps/riscv/multiarch/strncmp.c | 6 +- > > > sysdeps/riscv/multiarch/strncmp_zbb.S | 119 ++++++++++++++++++++++ > > > 4 files changed, 127 insertions(+), 2 deletions(-) > > > create mode 100644 sysdeps/riscv/multiarch/strncmp_zbb.S > > > > > > diff --git a/sysdeps/riscv/multiarch/Makefile b/sysdeps/riscv/multiarch/Makefile > > > index 056ce2ffc0..9f22e31b99 100644 > > > --- a/sysdeps/riscv/multiarch/Makefile > > > +++ b/sysdeps/riscv/multiarch/Makefile > > > @@ -14,5 +14,6 @@ sysdep_routines += \ > > > strcmp_generic \ > > > strcmp_zbb \ > > > strcmp_zbb_unaligned \ > > > - strncmp_generic > > > + strncmp_generic \ > > > + strncmp_zbb > > > endif > > > diff --git a/sysdeps/riscv/multiarch/ifunc-impl-list.c b/sysdeps/riscv/multiarch/ifunc-impl-list.c > > > index eb37ed6017..82fd34d010 100644 > > > --- a/sysdeps/riscv/multiarch/ifunc-impl-list.c > > > +++ b/sysdeps/riscv/multiarch/ifunc-impl-list.c > > > @@ -64,6 +64,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_generic)) > > > > > > IFUNC_IMPL (i, name, strncmp, > > > + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_zbb) > > > IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_generic)) > > > return i; > > > } > > > diff --git a/sysdeps/riscv/multiarch/strncmp.c b/sysdeps/riscv/multiarch/strncmp.c > > > index 970aeb8b85..5b0fe08e98 100644 > > > --- a/sysdeps/riscv/multiarch/strncmp.c > > > +++ b/sysdeps/riscv/multiarch/strncmp.c > > > @@ -30,8 +30,12 @@ > > > > > > extern __typeof (__redirect_strncmp) __libc_strncmp; > > > extern __typeof (__redirect_strncmp) __strncmp_generic attribute_hidden; > > > +extern __typeof (__redirect_strncmp) __strncmp_zbb attribute_hidden; > > > > > > -libc_ifunc (__libc_strncmp, __strncmp_generic); > > > +libc_ifunc (__libc_strncmp, > > > + HAVE_RV(zbb) > > > + ? 
__strncmp_zbb > > > + : __strncmp_generic); > > > > > > # undef strncmp > > > strong_alias (__libc_strncmp, strncmp); > > > diff --git a/sysdeps/riscv/multiarch/strncmp_zbb.S b/sysdeps/riscv/multiarch/strncmp_zbb.S > > > new file mode 100644 > > > index 0000000000..29cff30def > > > --- /dev/null > > > +++ b/sysdeps/riscv/multiarch/strncmp_zbb.S > > > @@ -0,0 +1,119 @@ > > > +/* Copyright (C) 2022 Free Software Foundation, Inc. > > > + > > > + This file is part of the GNU C Library. > > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library. If not, see > > > + <https://www.gnu.org/licenses/>. */ > > > + > > > +#include <sysdep.h> > > > +#include <sys/asm.h> > > > + > > > +/* Assumptions: rvi_zbb. 
*/ > > > + > > > +#define src1 a0 > > > +#define result a0 > > > +#define src2 a1 > > > +#define len a2 > > > +#define data1 a2 > > > +#define data2 a3 > > > +#define align a4 > > > +#define data1_orcb t0 > > > +#define limit t1 > > > +#define fast_limit t2 > > > +#define m1 t3 > > > + > > > +#if __riscv_xlen == 64 > > > +# define REG_L ld > > > +# define SZREG 8 > > > +# define PTRLOG 3 > > > +#else > > > +# define REG_L lw > > > +# define SZREG 4 > > > +# define PTRLOG 2 > > > +#endif > > > + > > > +#ifndef STRNCMP > > > +# define STRNCMP __strncmp_zbb > > > +#endif > > > + > > > +.option push > > > +.option arch,+zbb > > > + > > > +ENTRY_ALIGN (STRNCMP, 6) > > > + beqz len, L(equal) > > > + or align, src1, src2 > > > + and align, align, SZREG-1 > > > + add limit, src1, len > > > + bnez align, L(simpleloop) > > > + li m1, -1 > > > + > > > + /* Adjust limit for fast-path. */ > > > + andi fast_limit, limit, -SZREG > > > + > > > + /* Main loop for aligned string. */ > > > + .p2align 3 > > > +L(loop): > > > + bge src1, fast_limit, L(simpleloop) > > > + REG_L data1, 0(src1) > > > + REG_L data2, 0(src2) > > > + orc.b data1_orcb, data1 > > > + bne data1_orcb, m1, L(foundnull) > > > + addi src1, src1, SZREG > > > + addi src2, src2, SZREG > > > + beq data1, data2, L(loop) > > > + > > > + /* Words don't match, and no null byte in the first > > > + * word. Get bytes in big-endian order and compare. */ > > > +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ > > > + rev8 data1, data1 > > > + rev8 data2, data2 > > > +#endif > > > + /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */ > > > + sltu result, data1, data2 > > > + neg result, result > > > + ori result, result, 1 > > > + ret > > > + > > > +L(foundnull): > > > + /* Found a null byte. > > > + * If words don't match, fall back to simple loop. */ > > > + bne data1, data2, L(simpleloop) > > > + > > > + /* Otherwise, strings are equal. 
*/ > > > + li result, 0 > > > + ret > > > + > > > + /* Simple loop for misaligned strings. */ > > > + .p2align 3 > > > +L(simpleloop): > > > + bge src1, limit, L(equal) > > > + lbu data1, 0(src1) > > > + addi src1, src1, 1 > > > + lbu data2, 0(src2) > > > + addi src2, src2, 1 > > > + bne data1, data2, L(sub) > > > + bnez data1, L(simpleloop) > > > + > > > +L(sub): > > > + sub result, data1, data2 > > > + ret > > > + > > > +L(equal): > > > + li result, 0 > > > + ret > > > + > > > +.option pop > > > + > > > +END (STRNCMP) > > > +libc_hidden_builtin_def (STRNCMP) > > > -- > > > 2.39.1 > > >
On 08/02/23 14:55, Palmer Dabbelt wrote: > On Wed, 08 Feb 2023 07:13:44 PST (-0800), philipp.tomsich@vrull.eu wrote: >> On Tue, 7 Feb 2023 at 02:20, Noah Goldstein <goldstein.w.n@gmail.com> wrote: >>> >>> On Mon, Feb 6, 2023 at 6:23 PM Christoph Muellner >>> <christoph.muellner@vrull.eu> wrote: >>> > >>> > From: Christoph Müllner <christoph.muellner@vrull.eu> >>> > >>> > The implementation of strncmp() can be accelerated using Zbb's orc.b >>> > instruction. Let's add an optimized implementation that makes use >>> > of this instruction. >>> > >>> > Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu> >>> >>> Not necessary, but imo performance patches should have at least some reference >>> to the expected speedup versus the existing alternatives. >> >> Given that this is effectively a SWAR-like optimization (orc.b allows >> us to test 8 bytes in parallel for a NUL byte), we should be able to >> show the benefit through a reduction in dynamic instructions. Would >> this be considered reasonable reference data? > > Generally for performance improvements the only metrics that count come from real hardware. Processor implementation is complex and it's not generally true that reducing dynamic instructions results in better performance (particularly when more complex flavors instructions replace simpler ones). > I agree with Noah here that we need to have some baseline performance number, even though we are comparing naive implementations (what glibc used to have for implementations). > We've not been so good about this on the RISC-V side of things, though. I think that's largely because we didn't have all that much complexity around this, but there's a ton of stuff showing up right now. The general theory has been that Zbb instructions will execute faster than their corresponding I sequences, but nobody has proved that. I believe the new JH7110 has Zba and Zbb, so maybe the right answer there is to just benchmark things before merging them? 
That way we can get back to doing things sanely before we go too far down the premature optimization rabbit hole. > > FWIW: we had a pretty similar discussion in Linux land around these and nobody could get the JH7110 to boot, but given that we have ~6 months until glibc releases again hopefully that will be sorted out. There's a bunch of ongoing work looking at the more core issues like probing, so maybe it's best to focus on getting that all sorted out first? It's kind of awkward to have a bunch of routines posted in a whole new framework that's not sorting out all the probing dependencies. Just a heads up that with latest generic string routines optimization, all str* routines should now use new zbb extensions (if compiler is instructed to do so). I think you might squeeze some cycles with hand crafted assembly routine, but I would rather focus on trying to optimize code generation instead. The generic routines still assume that hardware can't, or it is prohibitively expensive to, issue unaligned memory access. However, I think we move toward this direction to start adding unaligned variants when it makes sense. Another usual tuning is loop unrolling, which depends on underlying hardware. Unfortunately we need to explicitly force gcc to unroll some loop construction (for instance check sysdeps/powerpc/powerpc64/power4/Makefile), so this might be another approach you might use to tune RISCV routines. The memcpy, memmove, memset, memcmp are a slightly different subject. Although current generic mem routines do use some explicit unrolling, they also do not take into consideration unaligned access, vector instructions, or special instruction (such as cache clear one). And these usually make a lot of difference. What I would expect is maybe we can use a similar strategy Google is doing with llvm libc, which based its work on the automemcpy paper [1]. It means that for unaligned, each architecture will reimplement the memory routine block. 
Although the project focus on static compiling, I think using hooks over assembly routines might be a better approach (you might reuse code blocks or try different strategies more easily). [1] https://storage.googleapis.com/pub-tools-public-publication-data/pdf/4f7c3da72d557ed418828823a8e59942859d677f.pdf
diff --git a/sysdeps/riscv/multiarch/Makefile b/sysdeps/riscv/multiarch/Makefile index 056ce2ffc0..9f22e31b99 100644 --- a/sysdeps/riscv/multiarch/Makefile +++ b/sysdeps/riscv/multiarch/Makefile @@ -14,5 +14,6 @@ sysdep_routines += \ strcmp_generic \ strcmp_zbb \ strcmp_zbb_unaligned \ - strncmp_generic + strncmp_generic \ + strncmp_zbb endif diff --git a/sysdeps/riscv/multiarch/ifunc-impl-list.c b/sysdeps/riscv/multiarch/ifunc-impl-list.c index eb37ed6017..82fd34d010 100644 --- a/sysdeps/riscv/multiarch/ifunc-impl-list.c +++ b/sysdeps/riscv/multiarch/ifunc-impl-list.c @@ -64,6 +64,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_generic)) IFUNC_IMPL (i, name, strncmp, + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_zbb) IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_generic)) return i; } diff --git a/sysdeps/riscv/multiarch/strncmp.c b/sysdeps/riscv/multiarch/strncmp.c index 970aeb8b85..5b0fe08e98 100644 --- a/sysdeps/riscv/multiarch/strncmp.c +++ b/sysdeps/riscv/multiarch/strncmp.c @@ -30,8 +30,12 @@ extern __typeof (__redirect_strncmp) __libc_strncmp; extern __typeof (__redirect_strncmp) __strncmp_generic attribute_hidden; +extern __typeof (__redirect_strncmp) __strncmp_zbb attribute_hidden; -libc_ifunc (__libc_strncmp, __strncmp_generic); +libc_ifunc (__libc_strncmp, + HAVE_RV(zbb) + ? __strncmp_zbb + : __strncmp_generic); # undef strncmp strong_alias (__libc_strncmp, strncmp); diff --git a/sysdeps/riscv/multiarch/strncmp_zbb.S b/sysdeps/riscv/multiarch/strncmp_zbb.S new file mode 100644 index 0000000000..29cff30def --- /dev/null +++ b/sysdeps/riscv/multiarch/strncmp_zbb.S @@ -0,0 +1,119 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <sys/asm.h> + +/* Assumptions: rvi_zbb. */ + +#define src1 a0 +#define result a0 +#define src2 a1 +#define len a2 +#define data1 a2 +#define data2 a3 +#define align a4 +#define data1_orcb t0 +#define limit t1 +#define fast_limit t2 +#define m1 t3 + +#if __riscv_xlen == 64 +# define REG_L ld +# define SZREG 8 +# define PTRLOG 3 +#else +# define REG_L lw +# define SZREG 4 +# define PTRLOG 2 +#endif + +#ifndef STRNCMP +# define STRNCMP __strncmp_zbb +#endif + +.option push +.option arch,+zbb + +ENTRY_ALIGN (STRNCMP, 6) + beqz len, L(equal) + or align, src1, src2 + and align, align, SZREG-1 + add limit, src1, len + bnez align, L(simpleloop) + li m1, -1 + + /* Adjust limit for fast-path. */ + andi fast_limit, limit, -SZREG + + /* Main loop for aligned string. */ + .p2align 3 +L(loop): + bge src1, fast_limit, L(simpleloop) + REG_L data1, 0(src1) + REG_L data2, 0(src2) + orc.b data1_orcb, data1 + bne data1_orcb, m1, L(foundnull) + addi src1, src1, SZREG + addi src2, src2, SZREG + beq data1, data2, L(loop) + + /* Words don't match, and no null byte in the first + * word. Get bytes in big-endian order and compare. */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + rev8 data1, data1 + rev8 data2, data2 +#endif + /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. 
*/ + sltu result, data1, data2 + neg result, result + ori result, result, 1 + ret + +L(foundnull): + /* Found a null byte. + * If words don't match, fall back to simple loop. */ + bne data1, data2, L(simpleloop) + + /* Otherwise, strings are equal. */ + li result, 0 + ret + + /* Simple loop for misaligned strings. */ + .p2align 3 +L(simpleloop): + bge src1, limit, L(equal) + lbu data1, 0(src1) + addi src1, src1, 1 + lbu data2, 0(src2) + addi src2, src2, 1 + bne data1, data2, L(sub) + bnez data1, L(simpleloop) + +L(sub): + sub result, data1, data2 + ret + +L(equal): + li result, 0 + ret + +.option pop + +END (STRNCMP) +libc_hidden_builtin_def (STRNCMP)