Message ID | DB6PR0801MB1879336AFC5EF1B90792F02383D69@DB6PR0801MB1879.eurprd08.prod.outlook.com
---|---
State | New
Series | AArch64: Add SVE memcpy
The 05/25/2022 17:04, Wilco Dijkstra wrote:
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 483fbbdb523c9a5b697a9b7f63153d2cefb482ac..e12c1a9fad3c2b1061f3f9d10accfe61ccc5ea50 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -45,6 +45,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>   IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
> #if HAVE_AARCH64_SVE_ASM
>   IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
> +  IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
> #endif
>   IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
>   IFUNC_IMPL (i, name, memmove,
> @@ -54,6 +55,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>   IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
> #if HAVE_AARCH64_SVE_ASM
>   IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
> +  IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
> #endif
>   IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
>   IFUNC_IMPL (i, name, memset,

i'd expect MAX_IFUNC to be updated whenever we add a new memcpy.

> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> index a476dd548fd04fddcd0b39cc3263866769bb309a..6172fddcb83baf91e78f621cae7cf4dd9aff822a 100644
> --- a/sysdeps/aarch64/multiarch/memcpy.c
> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> @@ -35,6 +35,7 @@ extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
>  # if HAVE_AARCH64_SVE_ASM
>  extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
> +extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden;
>  # endif
>
>  libc_ifunc (__libc_memcpy,
> @@ -45,15 +46,12 @@ libc_ifunc (__libc_memcpy,
>      : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
>         ? __memcpy_thunderx2
>         : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
> -          || IS_NEOVERSE_V1 (midr)
>            ? __memcpy_simd
> -# if HAVE_AARCH64_SVE_ASM
> -          : (IS_A64FX (midr) && sve
> -             ? __memcpy_a64fx
> -             : __memcpy_generic))))));
> -# else
> -          : __memcpy_generic)))));
> -# endif
> +          : (sve && HAVE_AARCH64_SVE_ASM
> +             ? (IS_A64FX (midr)
> +                ? __memcpy_a64fx
> +                : __memcpy_sve)
> +             : __memcpy_generic))))));

i think if you remove the ifdef here, then the declaration should be
outside ifdef too. same for memmove.
Hi Szabolcs,
> i'd expect MAX_IFUNC to be updated whenever we add a new memcpy.
Yes, this macro is quite broken; the value 7 should be correct, but I've set it to 8 for now. The correct approach is to check for 'max' inside IFUNC_IMPL_ADD.
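Roughly, the idea would be a macro that only writes an entry when it still fits in the caller's array, along these lines (a sketch with a simplified struct and a hypothetical IFUNC_IMPL_ADD_CHECKED name, not the actual glibc definitions):

#include <stdbool.h>
#include <stddef.h>

/* Sketch of the array element; the real layout is glibc's
   struct libc_ifunc_impl in ifunc-impl-list.h.  */
struct ifunc_impl_sketch
{
  const char *name;    /* Implementation name, e.g. "__memcpy_sve".  */
  void (*fn) (void);   /* Address of the implementation.  */
  bool usable;         /* True if usable on this machine.  */
};

/* Hypothetical bounds-checked add: only write the entry when it fits in
   the caller-supplied array of MAX elements, so forgetting to bump
   MAX_IFUNC cannot write past the end of the array.  */
#define IFUNC_IMPL_ADD_CHECKED(array, i, max, usable, impl)                  \
  do                                                                         \
    {                                                                        \
      if ((i) < (max))                                                       \
        (array)[i] = (struct ifunc_impl_sketch) { #impl,                     \
                                                  (void (*) (void)) (impl),  \
                                                  (usable) };                \
      (i)++;                                                                 \
    }                                                                        \
  while (0)

With that, a stale MAX_IFUNC would at worst drop entries from the reported list instead of overflowing the array.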
Here is v2 (which also improves the readability of the ifunc selectors):
Add an initial SVE memcpy implementation. Copies up to 32 bytes use SVE vectors,
which improves the random memcpy benchmark significantly. Clean up the memcpy
and memmove ifunc selectors.
---
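For reference, the small-copy case boils down to a single predicated SVE load/store; here is a rough C sketch using ACLE intrinsics (illustration only, not part of the patch; the assembly additionally handles a 16-byte vector length by using a second predicated vector for copies up to 32 bytes, the L(vlen128) path below):

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

/* Copy 0..VL bytes with one predicated load/store: the while-predicate
   masks off lanes at and beyond 'count', so no length-dependent branches
   or overlapping accesses are needed.  */
static void
small_copy_sve (uint8_t *dst, const uint8_t *src, size_t count)
{
  svbool_t p = svwhilelt_b8_u64 (0, count);  /* Lanes 0..count-1 active.  */
  svuint8_t v = svld1_u8 (p, src);           /* Inactive lanes load as zero.  */
  svst1_u8 (p, dst, v);                      /* Inactive lanes are not stored.  */
}
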
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 7500cf1e9369443daed1c5a938903dcff9d0a45e..490aaa0faf1e74fb7014e2ed6b574d92b8b83637 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,6 +1,6 @@
ifeq ($(subdir),string)
sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
- memcpy_falkor memcpy_a64fx \
+ memcpy_falkor memcpy_a64fx memcpy_sve \
memset_generic memset_falkor memset_emag memset_kunpeng \
memset_a64fx \
memchr_generic memchr_nosimd \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 483fbbdb523c9a5b697a9b7f63153d2cefb482ac..f6c6d008da285122cf3549efa5adf05d5e098680 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,7 +25,7 @@
#include <stdio.h>
/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 7
+#define MAX_IFUNC 8
size_t
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -45,6 +45,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
#if HAVE_AARCH64_SVE_ASM
IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
+ IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,
@@ -54,6 +55,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
#if HAVE_AARCH64_SVE_ASM
IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
+ IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
#endif
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
IFUNC_IMPL (i, name, memset,
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index a476dd548fd04fddcd0b39cc3263866769bb309a..0486213f08db8ab08a903c4b9e6bdd19df6dccec 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -33,27 +33,38 @@ extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
-# if HAVE_AARCH64_SVE_ASM
extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
-# endif
-
-libc_ifunc (__libc_memcpy,
- (IS_THUNDERX (midr)
- ? __memcpy_thunderx
- : (IS_FALKOR (midr) || IS_PHECDA (midr)
- ? __memcpy_falkor
- : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
- ? __memcpy_thunderx2
- : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
- || IS_NEOVERSE_V1 (midr)
- ? __memcpy_simd
-# if HAVE_AARCH64_SVE_ASM
- : (IS_A64FX (midr) && sve
- ? __memcpy_a64fx
- : __memcpy_generic))))));
-# else
- : __memcpy_generic)))));
-# endif
+extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden;
+
+static inline __typeof (__redirect_memcpy) *
+select_memcpy_ifunc (void)
+{
+ INIT_ARCH ();
+
+ if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
+ return __memcpy_simd;
+
+ if (sve && HAVE_AARCH64_SVE_ASM)
+ {
+ if (IS_A64FX (midr))
+ return __memcpy_a64fx;
+ return __memcpy_sve;
+ }
+
+ if (IS_THUNDERX (midr))
+ return __memcpy_thunderx;
+
+ if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
+ return __memcpy_thunderx2;
+
+ if (IS_FALKOR (midr) || IS_PHECDA (midr))
+ return __memcpy_falkor;
+
+ return __memcpy_generic;
+}
+
+libc_ifunc (__libc_memcpy, select_memcpy_ifunc ());
+
# undef memcpy
strong_alias (__libc_memcpy, memcpy);
#endif
diff --git a/sysdeps/aarch64/multiarch/memcpy_sve.S b/sysdeps/aarch64/multiarch/memcpy_sve.S
new file mode 100644
index 0000000000000000000000000000000000000000..a70907ec5595114e4da35c54e932279c22e53cc6
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcpy_sve.S
@@ -0,0 +1,218 @@
+/* Optimized memcpy for SVE.
+ Copyright (C) 2021-2022 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ *
+ */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define tmp1 x6
+#define vlen x6
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
+/* This implementation supports both memcpy and memmove and shares most code.
+ It uses unaligned accesses and branchless sequences to keep the code small,
+ simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check in memmove is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The source pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+#if HAVE_AARCH64_SVE_ASM
+
+ .arch armv8.2-a+sve
+
+ENTRY (__memcpy_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ whilelo p0.b, xzr, count
+ cntb vlen
+ tbnz vlen, 4, L(vlen128)
+ ld1b z0.b, p0/z, [src]
+ st1b z0.b, p0, [dstin]
+ ret
+
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ add srcend, src, count
+ add dstend, dstin, count
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_q, F_q, [src, 32]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
+L(copy96):
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
+ ret
+
+L(vlen128):
+ whilelo p1.b, vlen, count
+ ld1b z0.b, p0/z, [src, 0, mul vl]
+ ld1b z1.b, p1/z, [src, 1, mul vl]
+ st1b z0.b, p0, [dstin, 0, mul vl]
+ st1b z1.b, p1, [dstin, 1, mul vl]
+ ret
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ add srcend, src, count
+ add dstend, dstin, count
+
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+L(loop64):
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
+ ret
+
+END (__memcpy_sve)
+libc_hidden_builtin_def (__memcpy_sve)
+
+
+ENTRY (__memmove_sve)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+
+ cmp count, 128
+ b.hi L(move_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ whilelo p0.b, xzr, count
+ cntb vlen
+ tbnz vlen, 4, L(vlen128)
+ ld1b z0.b, p0/z, [src]
+ st1b z0.b, p0, [dstin]
+ ret
+
+ .p2align 4
+L(move_long):
+ add srcend, src, count
+ add dstend, dstin, count
+ /* Only use backward copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(return)
+ cmp tmp1, count
+ b.hs L(copy_long)
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
+ sub count, count, tmp1
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+L(return):
+ ret
+
+END (__memmove_sve)
+libc_hidden_builtin_def (__memmove_sve)
+#endif
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 4f7d7eedfd943d1b6a54e90dc973406b30cc8db9..261996ecc4b36bdaf3c50598d910e9c7dcace08b 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -33,27 +33,38 @@ extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
-# if HAVE_AARCH64_SVE_ASM
extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
-# endif
-
-libc_ifunc (__libc_memmove,
- (IS_THUNDERX (midr)
- ? __memmove_thunderx
- : (IS_FALKOR (midr) || IS_PHECDA (midr)
- ? __memmove_falkor
- : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
- ? __memmove_thunderx2
- : (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr)
- || IS_NEOVERSE_V1 (midr)
- ? __memmove_simd
-# if HAVE_AARCH64_SVE_ASM
- : (IS_A64FX (midr) && sve
- ? __memmove_a64fx
- : __memmove_generic))))));
-# else
- : __memmove_generic)))));
-# endif
+extern __typeof (__redirect_memmove) __memmove_sve attribute_hidden;
+
+static inline __typeof (__redirect_memmove) *
+select_memmove_ifunc (void)
+{
+ INIT_ARCH ();
+
+ if (IS_NEOVERSE_N1 (midr) || IS_NEOVERSE_N2 (midr))
+ return __memmove_simd;
+
+ if (sve && HAVE_AARCH64_SVE_ASM)
+ {
+ if (IS_A64FX (midr))
+ return __memmove_a64fx;
+ return __memmove_sve;
+ }
+
+ if (IS_THUNDERX (midr))
+ return __memmove_thunderx;
+
+ if (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr))
+ return __memmove_thunderx2;
+
+ if (IS_FALKOR (midr) || IS_PHECDA (midr))
+ return __memmove_falkor;
+
+ return __memmove_generic;
+}
+
+libc_ifunc (__libc_memmove, select_memmove_ifunc ());
+
# undef memmove
strong_alias (__libc_memmove, memmove);
#endif
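
To illustrate the large-copy tail handling described in the memcpy_sve.S comment, here is a rough C sketch of the coverage idea, using plain memcpy for the 16- and 64-byte chunks rather than the software-pipelined loads and stores of the assembly:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the copy_long shape for count > 128: copy a 16-byte head,
   round src down to 16-byte alignment, copy 64 bytes per iteration from
   the aligned source, then unconditionally copy the last 64 bytes from
   the end.  */
static void
copy_large_sketch (uint8_t *dstin, const uint8_t *src, size_t count)
{
  const uint8_t *srcend = src + count;
  uint8_t *dstend = dstin + count;

  /* Head: first 16 bytes, then skew both pointers so src is aligned.  */
  memcpy (dstin, src, 16);
  size_t skew = (uintptr_t) src & 15;
  uint8_t *dst = dstin - skew;
  src -= skew;
  count += skew;

  /* Body: 64 bytes per iteration while more than 64 bytes remain.  */
  size_t offset = 16;
  while (count - offset > 64)
    {
      memcpy (dst + offset, src + offset, 64);
      offset += 64;
    }

  /* Tail: always copy the last 64 bytes from the end; this may overlap
     the final body stores, which is harmless when src and dst do not
     overlap.  */
  memcpy (dstend - 64, srcend - 64, 64);
}

The memmove variant only falls through to this forward path when dst - src is at least count, so the same reasoning applies there; otherwise it takes the backwards loop.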
The 06/01/2022 17:03, Wilco Dijkstra wrote:
> Hi Szabolcs,
>
> > i'd expect MAX_IFUNC to be updated whenever we add a new memcpy.
>
> Yes, this macro is quite broken; the value 7 should be correct, but I've set it to 8 for now.
> The correct approach is to check for 'max' inside IFUNC_IMPL_ADD.
>
> Here is v2 (which also improves the readability of the ifunc selectors):
>
> Add an initial SVE memcpy implementation. Copies up to 32 bytes use SVE vectors,
> which improves the random memcpy benchmark significantly. Clean up the memcpy
> and memmove ifunc selectors.
>

thanks, this looks good.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>