Message ID | DB8PR08MB503633A48CF226FB3C5FF4F683610@DB8PR08MB5036.eurprd08.prod.outlook.com
---|---
State | New
Series | AArch64: Add optimized Q-register memcpy
On 7/14/20 12:33 PM, Wilco Dijkstra wrote:
> Add a new memcpy using 128-bit Q registers - this is faster on modern
> cores and reduces codesize. Similar to the generic memcpy, small cases
> include copies up to 32 bytes. 64-128 byte copies are split into two
> cases to improve performance of 64-96 byte copies. Large copies align
> the source rather than the destination.
>
> bench-memcpy-random is ~9% faster than memcpy_falkor on Neoverse N1,
> so make this memcpy the default on N1 (on Centriq it is 15% faster than
> memcpy_falkor).
>
> Passes GLIBC regression tests. OK for commit?

As release manager this is OK for 2.32 if Szabolcs says it's OK.

> ---
> [...]
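The "copies up to 32 bytes" cases mentioned in the commit message all use the same overlapping trick: one load from each end of the buffer covers every length in a power-of-two range, because the two accesses may overlap in the middle. Below is a minimal C sketch of the 16..32 byte case; it is an illustration under assumptions, not the glibc code — the helper name is invented, and memcpy stands in for the unaligned 16-byte vector accesses.

    #include <string.h>

    /* Sketch of the ldr A_q,[src] / ldr B_q,[srcend,-16] pair: for any
       count in 16..32, one 16-byte copy from each end covers the whole
       buffer; the two regions may overlap in the middle.  Both loads
       happen before any store, which is why the same sequence is also
       safe on overlapping buffers in the memmove entry point.  */
    static void
    copy_16_32 (char *dst, const char *src, size_t count)
    {
      char head[16], tail[16];
      memcpy (head, src, 16);              /* ldr A_q, [src] */
      memcpy (tail, src + count - 16, 16); /* ldr B_q, [srcend, -16] */
      memcpy (dst, head, 16);              /* str A_q, [dstin] */
      memcpy (dst + count - 16, tail, 16); /* str B_q, [dstend, -16] */
    }

The 8-15 and 4-7 byte cases in the patch are the same idea with X and W registers.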
The 07/14/2020 16:17, Carlos O'Donell wrote:
> On 7/14/20 12:33 PM, Wilco Dijkstra wrote:
> > Add a new memcpy using 128-bit Q registers - this is faster on modern
> > cores and reduces codesize.
> > [...]
> > Passes GLIBC regression tests. OK for commit?
>
> As release manager this is OK for 2.32 if Szabolcs says it's OK.

thanks. this is ok to commit.

> > ---
> > [...]
>
> --
> Cheers,
> Carlos.
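A detail of the memmove path in the patch below that is easy to miss: the overlap check at L(move_long) is a single unsigned comparison. A hedged C equivalent follows (the function name is invented for illustration; the pointer subtraction mirrors the assembly's raw address arithmetic).

    #include <stddef.h>
    #include <stdint.h>

    /* C rendering of:
         sub  tmp1, dstin, src
         cmp  tmp1, count
         b.hs L(copy_long)
       A forward copy is safe whenever dst is not inside [src, src+count).
       When dst < src the unsigned subtraction wraps to a huge value, so
       one comparison handles both directions.  dst == src is handled
       separately in the assembly (cbz), since 0 >= count is false for
       any nonzero count even though no copy is needed.  */
    static int
    forward_copy_is_safe (const char *dst, const char *src, size_t count)
    {
      return (uintptr_t) (dst - src) >= count;
    }

This is why the commit message can claim the overlap check costs almost nothing: it is two instructions, and only on the large-copy path.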
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 4377df0735287c210efd661188f9e6e3923c8003..e93c21e764a8d02b9f07f5030c31836a3f03f3e1 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,5 +1,5 @@
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
+sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
                    memcpy_falkor \
                    memcpy_new \
                    memset_generic memset_falkor memset_emag memset_kunpeng \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 0ccaf53e555e410569eb2be76ec7d5b4d7bc64a5..09feea97ea37ab923cf4a8557197d46adcd49204 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -42,11 +42,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
+              IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
               IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx2)
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
+              IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
   IFUNC_IMPL (i, name, memset,
               /* Enable this on non-falkor processors too so that other cores
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 2fafefd5d23fc1528031b5fe52098218ed603b89..e6f3ae116701097d71a02e2a1f6bfdadc1eec34a 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,6 +29,7 @@
 extern __typeof (__redirect_memcpy) __libc_memcpy;
 
 extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
@@ -36,11 +37,11 @@ extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
 libc_ifunc (__libc_memcpy,
             (IS_THUNDERX (midr)
              ? __memcpy_thunderx
-             : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_ARES (midr) || IS_KUNPENG920 (midr)
+             : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_KUNPENG920 (midr)
                ? __memcpy_falkor
                : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
                   ? __memcpy_thunderx2
-                  : __memcpy_generic))));
+                  : (IS_ARES (midr) ? __memcpy_simd : __memcpy_generic)))));
 
 # undef memcpy
 strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
new file mode 100644
index 0000000000000000000000000000000000000000..d4ba74777744c8bb5a83e43ab2d63ad8dab35203
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
@@ -0,0 +1,247 @@
+/* Generic optimized memcpy using SIMD.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#define dstin   x0
+#define src     x1
+#define count   x2
+#define dst     x3
+#define srcend  x4
+#define dstend  x5
+#define A_l     x6
+#define A_lw    w6
+#define A_h     x7
+#define B_l     x8
+#define B_lw    w8
+#define B_h     x9
+#define C_lw    w10
+#define tmp1    x14
+
+#define A_q     q0
+#define B_q     q1
+#define C_q     q2
+#define D_q     q3
+#define E_q     q4
+#define F_q     q5
+#define G_q     q6
+#define H_q     q7
+
+
+/* This implementation supports both memcpy and memmove and shares most code.
+   It uses unaligned accesses and branchless sequences to keep the code small,
+   simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check in memmove is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per
+   iteration.  The destination pointer is 16-byte aligned to minimize
+   unaligned accesses.  The loop tail is handled by always copying 64 bytes
+   from the end.  */
+
+ENTRY (__memcpy_simd)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(copy_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	/* Small copies: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
+	ldr	A_q, [src]
+	ldr	B_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstend, -16]
+	ret
+
+	/* Copy 8-15 bytes.  */
+L(copy16):
+	tbz	count, 3, L(copy8)
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+
+	/* Copy 4-7 bytes.  */
+L(copy8):
+	tbz	count, 2, L(copy4)
+	ldr	A_lw, [src]
+	ldr	B_lw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	B_lw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+	cbz	count, L(copy0)
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	C_lw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	C_lw, [dstend, -1]
+L(copy0):
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	ldp	A_q, B_q, [src]
+	ldp	C_q, D_q, [srcend, -32]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_q, B_q, [dstin]
+	stp	C_q, D_q, [dstend, -32]
+	ret
+
+	.p2align 4
+	/* Copy 65..128 bytes.  */
+L(copy128):
+	ldp	E_q, F_q, [src, 32]
+	cmp	count, 96
+	b.ls	L(copy96)
+	ldp	G_q, H_q, [srcend, -64]
+	stp	G_q, H_q, [dstend, -64]
+L(copy96):
+	stp	A_q, B_q, [dstin]
+	stp	E_q, F_q, [dstin, 32]
+	stp	C_q, D_q, [dstend, -32]
+	ret
+
+	/* Align loop64 below to 16 bytes.  */
+	nop
+
+	/* Copy more than 128 bytes.  */
+L(copy_long):
+	/* Copy 16 bytes and then align src to 16-byte alignment.  */
+	ldr	D_q, [src]
+	and	tmp1, src, 15
+	bic	src, src, 15
+	sub	dst, dstin, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_q, B_q, [src, 16]
+	str	D_q, [dstin]
+	ldp	C_q, D_q, [src, 48]
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(copy64_from_end)
+L(loop64):
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [src, 80]
+	stp	C_q, D_q, [dst, 48]
+	ldp	C_q, D_q, [src, 112]
+	add	src, src, 64
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+	ldp	E_q, F_q, [srcend, -64]
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [srcend, -32]
+	stp	C_q, D_q, [dst, 48]
+	stp	E_q, F_q, [dstend, -64]
+	stp	A_q, B_q, [dstend, -32]
+	ret
+
+END (__memcpy_simd)
+libc_hidden_builtin_def (__memcpy_simd)
+
+
+ENTRY (__memmove_simd)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(move_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	/* Small moves: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
+	ldr	A_q, [src]
+	ldr	B_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstend, -16]
+	ret
+
+L(move_long):
+	/* Only use backward copy if there is an overlap.  */
+	sub	tmp1, dstin, src
+	cbz	tmp1, L(move0)
+	cmp	tmp1, count
+	b.hs	L(copy_long)
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
+L(copy_long_backwards):
+	ldr	D_q, [srcend, -16]
+	and	tmp1, srcend, 15
+	bic	srcend, srcend, 15
+	sub	count, count, tmp1
+	ldp	A_q, B_q, [srcend, -32]
+	str	D_q, [dstend, -16]
+	ldp	C_q, D_q, [srcend, -64]
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	L(copy64_from_start)
+
+L(loop64_backwards):
+	stp	A_q, B_q, [dstend, -32]
+	ldp	A_q, B_q, [srcend, -96]
+	stp	C_q, D_q, [dstend, -64]
+	ldp	C_q, D_q, [srcend, -128]
+	sub	srcend, srcend, 64
+	sub	dstend, dstend, 64
+	subs	count, count, 64
+	b.hi	L(loop64_backwards)
+
+	/* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+	ldp	E_q, F_q, [src, 32]
+	stp	A_q, B_q, [dstend, -32]
+	ldp	A_q, B_q, [src]
+	stp	C_q, D_q, [dstend, -64]
+	stp	E_q, F_q, [dstin, 32]
+	stp	A_q, B_q, [dstin]
+L(move0):
+	ret
+
+END (__memmove_simd)
+libc_hidden_builtin_def (__memmove_simd)
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index ed5a47f6f83e7b0afcec60cb9fa0f09999eaacae..1229f8b89296eddd2e711490bb7fc0b35726b6f5 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,6 +29,7 @@
 extern __typeof (__redirect_memmove) __libc_memmove;
 
 extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
@@ -40,7 +41,7 @@ libc_ifunc (__libc_memmove,
              ? __memmove_falkor
              : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
                 ? __memmove_thunderx2
-                : __memmove_generic))));
+                : (IS_ARES (midr) ? __memmove_simd : __memmove_generic)))));
 
 # undef memmove
 strong_alias (__libc_memmove, memmove);
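Finally, the L(copy4) path above copies 1..3 bytes without branching on the exact length. A C rendering of the same indexing trick follows (illustrative only; the helper name is an assumption, and the byte-at-a-time accesses stand in for the ldrb/strb pairs).

    #include <stddef.h>

    /* C rendering of L(copy4): mid = count >> 1 is 0 for length 1 and
       1 for lengths 2 and 3, so the three positions 0, mid and
       count - 1 cover every length in 1..3 (they may alias each
       other).  As in the assembly, all loads happen before any store,
       so overlapping src/dst buffers are also handled.  */
    static void
    copy_0_3 (unsigned char *dst, const unsigned char *src, size_t count)
    {
      if (count == 0)
        return;
      size_t mid = count >> 1;
      unsigned char a = src[0];
      unsigned char b = src[mid];
      unsigned char c = src[count - 1];
      dst[0] = a;
      dst[mid] = b;
      dst[count - 1] = c;
    }

Redundant overlapping stores of this kind are what the file's header comment means by "branchless sequences": a few extra byte stores are cheaper than a chain of unpredictable size branches.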