diff mbox series

AArch64: Add support for MOPS memcpy/memmove/memset

Message ID PAWPR08MB89821F65EE83CF28A17B3F4B83D4A@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Headers show
Series AArch64: Add support for MOPS memcpy/memmove/memset | expand

Commit Message

Wilco Dijkstra Oct. 19, 2023, 3:37 p.m. UTC
Add support for MOPS in cpu_features and INIT_ARCH.  Add ifuncs using MOPS for
memcpy, memmove and memset (use .inst for now so it works with all binutils
versions without needing complex configure and conditional compilation).

OK for commit?
---

Comments

Adhemerval Zanella Netto Oct. 19, 2023, 5:03 p.m. UTC | #1
On 19/10/23 12:37, Wilco Dijkstra wrote:
> Add support for MOPS in cpu_features and INIT_ARCH.  Add ifuncs using MOPS for
> memcpy, memmove and memset (use .inst for now so it works with all binutils
> versions without needing complex configure and conditional compilation).
> 
> OK for commit?
> ---
> 
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index 223777d94e350fdfd1bb82a4b38eea4653d63057..8ecf9bdd3c6facad11adb96b3713f2fcb5167db2 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,5 +1,6 @@
>  ifeq ($(subdir),string)
>  sysdep_routines += \
> +  mem_mops \
>    memchr_generic \
>    memchr_nosimd \
>    memcpy_a64fx \
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index d274f01fdbd1c99d7f872bbdcdc7b2a1cbc7b042..da7f1153778efd41a15ef5aa900252f9e492b18d 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -41,6 +41,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
>  	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
>  #endif
> +	      IFUNC_IMPL_ADD (array, i, memcpy, mops, __memcpy_mops)
>  	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
>    IFUNC_IMPL (i, name, memmove,
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
> @@ -50,6 +51,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
>  	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
>  #endif
> +	      IFUNC_IMPL_ADD (array, i, memmove, mops, __memmove_mops)
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
>    IFUNC_IMPL (i, name, memset,
>  	      /* Enable this on non-falkor processors too so that other cores
> @@ -60,6 +62,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  #if HAVE_AARCH64_SVE_ASM
>  	      IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 256, __memset_a64fx)
>  #endif
> +	      IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops)
>  	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>    IFUNC_IMPL (i, name, memchr,
>  	      IFUNC_IMPL_ADD (array, i, memchr, !mte, __memchr_nosimd)
> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index 6de081e3814d55812572c889fb8749e0e4e6222a..e23e6ff29042a68ce5499eddcc35b4f7f2ffa14d 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -35,4 +35,6 @@
>    bool __attribute__((unused)) mte =					      \
>      MTE_ENABLED ();							      \
>    bool __attribute__((unused)) sve =					      \
> -    GLRO(dl_aarch64_cpu_features).sve;
> +    GLRO(dl_aarch64_cpu_features).sve;					      \
> +  bool __attribute__((unused)) mops =					      \
> +    GLRO(dl_aarch64_cpu_features).mops;
> diff --git a/sysdeps/aarch64/multiarch/mem_mops.S b/sysdeps/aarch64/multiarch/mem_mops.S
> new file mode 100644
> index 0000000000000000000000000000000000000000..36316f14e369c1dddb3fc3f234160195861cc8da
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/mem_mops.S
> @@ -0,0 +1,69 @@
> +/* Optimized memcpy/memmove/memset for MOPS.
> +   Copyright (C) 2023 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64, MOPS.
> + *
> + */
> +
> +ENTRY (__memcpy_mops)
> +	PTR_ARG (0)
> +	PTR_ARG (1)
> +	SIZE_ARG (2)
> +
> +	mov	x3, x0
> +	.inst	0x19010443	/* cpyfp   [x3]!, [x1]!, x2!  */
> +	.inst	0x19410443	/* cpyfm   [x3]!, [x1]!, x2!  */
> +	.inst	0x19810443	/* cpyfe   [x3]!, [x1]!, x2!  */
> +	ret
> +
> +END (__memcpy_mops)
> +libc_hidden_builtin_def (__memcpy_mops)
> +
> +
> +ENTRY (__memmove_mops)
> +	PTR_ARG (0)
> +	PTR_ARG (1)
> +	SIZE_ARG (2)
> +
> +	mov	x3, x0
> +	.inst	0x1d010443	/* cpyp    [x3]!, [x1]!, x2!  */
> +	.inst	0x1d410443	/* cpym    [x3]!, [x1]!, x2!  */
> +	.inst	0x1d810443	/* cpye    [x3]!, [x1]!, x2!  */
> +	ret
> +
> +END (__memmove_mops)
> +libc_hidden_builtin_def (__memmove_mops)
> +

I think it would be better to move each function to its own file.

Also, the libc_hidden_builtin_def is superfluous here: libc does not use
the internal names anywhere (the other aarch64 implementations have the
same directive).

The libc_hidden_builtin_def does not really make the symbol hidden
in assembly, but rather adds an alias to a __GI_##name symbol.  To
actually make the symbol hidden you will need to add a .hidden symbol
directive.

> +
> +ENTRY (__memset_mops)
> +	PTR_ARG (0)
> +	SIZE_ARG (2)
> +
> +	mov     x3, x0
> +	.inst   0x19c10443	/* setp    [x3]!, x2!, x1  */
> +	.inst   0x19c14443	/* setm    [x3]!, x2!, x1  */
> +	.inst   0x19c18443	/* sete    [x3]!, x2!, x1  */
> +	ret
> +
> +END (__memset_mops)
> +libc_hidden_builtin_def (__memset_mops)
> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> index 3aae915c5f479fe5426399fbaa2f951d5771a997..9aace954cbfd1eb3e2b35e570e4eb31bbb3c6cfe 100644
> --- a/sysdeps/aarch64/multiarch/memcpy.c
> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> @@ -34,12 +34,16 @@ extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
>  extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden;
> +extern __typeof (__redirect_memcpy) __memcpy_mops attribute_hidden;
>  
>  static inline __typeof (__redirect_memcpy) *
>  select_memcpy_ifunc (void)
>  {
>    INIT_ARCH ();
>  
> +  if (mops)
> +    return __memcpy_mops;
> +
>    if (sve && HAVE_AARCH64_SVE_ASM)
>      {
>        if (IS_A64FX (midr))
> diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
> index 312f90f111868c71dd1f32f4a175a28e93c948ee..fd346e7b73a86a076ba8e1cdd7fd588098333f48 100644
> --- a/sysdeps/aarch64/multiarch/memmove.c
> +++ b/sysdeps/aarch64/multiarch/memmove.c
> @@ -34,12 +34,16 @@ extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
>  extern __typeof (__redirect_memmove) __memmove_sve attribute_hidden;
> +extern __typeof (__redirect_memmove) __memmove_mops attribute_hidden;
>  
>  static inline __typeof (__redirect_memmove) *
>  select_memmove_ifunc (void)
>  {
>    INIT_ARCH ();
>  
> +  if (mops)
> +    return __memmove_mops;
> +
>    if (sve && HAVE_AARCH64_SVE_ASM)
>      {
>        if (IS_A64FX (midr))
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> index f9c81d3d8e46ec86d9f100835e1c43210899770a..23fc66e15879847557b0e4f6941f03bc7ac5cab9 100644
> --- a/sysdeps/aarch64/multiarch/memset.c
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -33,12 +33,16 @@ extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
>  extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
>  extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
>  extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
>  
>  static inline __typeof (__redirect_memset) *
>  select_memset_ifunc (void)
>  {
>    INIT_ARCH ();
>  
> +  if (mops)
> +    return __memset_mops;
> +
>    if (sve && HAVE_AARCH64_SVE_ASM)
>      {
>        if (IS_A64FX (midr) && zva_size == 256)
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> index d67d286b536ea599dcd8873fa2b882c5c27e8b05..40b709677d86f040c653315199f62677425abc58 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> @@ -76,6 +76,7 @@ struct cpu_features
>    /* Currently, the GLIBC memory tagging tunable only defines 8 bits.  */
>    uint8_t mte_state;
>    bool sve;
> +  bool mops;
>  };
>  
>  #endif /* _CPU_FEATURES_AARCH64_H  */
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index dc09c1c8274855c8215c2c6d57af9a2b5f8a7e2f..233d5b2407e2b792805b7fa661852f59fca0cb71 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -120,4 +120,7 @@ init_cpu_features (struct cpu_features *cpu_features)
>  
>    /* Check if SVE is supported.  */
>    cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
> +
> +  /* Check if MOPS is supported.  */
> +  cpu_features->mops = GLRO (dl_hwcap2) & HWCAP2_MOPS;
>  }
>
Wilco Dijkstra Oct. 20, 2023, 3:19 p.m. UTC | #2
Hi Adhemerval,

> I think it would be better to move each function to its own file.

I can do that, but it's just extra effort, more files, more maintenance for no gain...

> Also, the libc_hidden_builtin_def is superflous here, libc does not use
> internal names in any place (other aarch64 implementation have the same
> directive).
>
> The libc_hidden_builtin_def does not really make the symbol hidden
> on assembly, but rather add an alias to a __GI_##name symbol.  To
> actually sets the symbol hidden you will need to add a .hidden symbol
> directive.

It's unclear to me what all these magic defines do... So you're saying all
of the internal implementations should use .hidden instead of one of the
[libc_]hidden_[builtin_]def macros? I.e. they should not be used inside
the multiarch directory at all?

Cheers,
Wilco
Adhemerval Zanella Netto Oct. 20, 2023, 3:56 p.m. UTC | #3
On 20/10/23 12:19, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
>> I think it would be better to move each function to its own file.
> 
> I can do that, but it's just extra effort, more files, more maintenance for no gain...

It helps with static linking (although, due to the function size, not that
much).  The multiarch directory is usually quite messy already (x86_64 is an
example), so I am not sure that packing implementations that are not really
related into one file is the best way forward.

> 
>> Also, the libc_hidden_builtin_def is superflous here, libc does not use
>> internal names in any place (other aarch64 implementation have the same
>> directive).
>>
>> The libc_hidden_builtin_def does not really make the symbol hidden
>> on assembly, but rather add an alias to a __GI_##name symbol.  To
>> actually sets the symbol hidden you will need to add a .hidden symbol
>> directive.
> 
> It's unclear to me what all these magic defines do... So you're saying all
> of the internal implementations should use .hidden instead of one of the
> [libc_]hidden_[builtin_]def macros? Ie. they should not be used inside
> the multiarch directory at all?

The issue is that libc_hidden_def is different for assembly implementations,
compared to its C counterpart.  The C macro will create a __GI_##symbol
global hidden symbol alias, whereas for assembly it will create a global
symbol alias.  It should not really matter if the caller always sees
the libc_hidden_proto (so the compiler will call the __GI_ symbol).
Szabolcs Nagy Oct. 20, 2023, 4:04 p.m. UTC | #4
The 10/20/2023 16:19, Wilco Dijkstra wrote:
> Hi Adhemerval,
> 
> > I think it would be better to move each function to its own file.
> 
> I can do that, but it's just extra effort, more files, more maintenance for no gain...
> 
> > Also, the libc_hidden_builtin_def is superflous here, libc does not use
> > internal names in any place (other aarch64 implementation have the same
> > directive).
> >
> > The libc_hidden_builtin_def does not really make the symbol hidden
> > on assembly, but rather add an alias to a __GI_##name symbol.  To
> > actually sets the symbol hidden you will need to add a .hidden symbol
> > directive.
> 
> It's unclear to me what all these magic defines do... So you're saying all
> of the internal implementations should use .hidden instead of one of the
> [libc_]hidden_[builtin_]def macros? Ie. they should not be used inside
> the multiarch directory at all?

we don't need .hidden: all symbols are hidden in libc.so unless
the Versions files export the symbol explicitly (and for static
linking we don't really care about hidden).

i think the __GI_foo alias to foo is only needed if foo is an
exported sym, then it can avoid a PLT when foo is called internally.

i think we should just drop these macros in the multiarch asm.
(and fix things up e.g. if asm code is directly included across
multiarch and generic code such that only generic exported syms
get the alias)
Florian Weimer Oct. 21, 2023, 10:12 p.m. UTC | #5
* Szabolcs Nagy:

> i think the __GI_foo alias to foo is only needed if foo is an
> exported sym, then it can avoid a PLT when foo called internally.
>
> i think we should just drop these macros in the multiarch asm.
> (and fix things up e.g. if asm code is directly included across
> multiarch and generic code such that only generic exported syms
> get the alias)

We might want to start exporting the alternative implementations under
GLIBC_PRIVATE names, so that we can find them in the dynamic symbol
table.  Then we could have an LD_DEBUG mode that prints these function
symbols, translated from the function addresses returned from IFUNC
resolvers.  I think this would be a really useful diagnostic tool.

Thanks,
Florian
Wilco Dijkstra Oct. 23, 2023, 12:35 p.m. UTC | #6
Hi,

>> It's unclear to me what all these magic defines do... So you're saying all
>> of the internal implementations should use .hidden instead of one of the
>> [libc_]hidden_[builtin_]def macros? Ie. they should not be used inside
>> the multiarch directory at all?
>
> we don't need .hidden: all symbols are hidden in libc.so unless
> the Versions files export the symbol explicitly (and for static
> linking we don't really care about hidden).

OK, I'll remove these defines in the next version then.

> i think we should just drop these macros in the multiarch asm.
> (and fix things up e.g. if asm code is directly included across
> multiarch and generic code such that only generic exported syms
> get the alias)

I'll do a quick cleanup of the existing multiarch directory in a separate
patch. There are many unnecessary differences between the various
implementations.

Cheers,
Wilco
diff mbox series

Patch

diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 223777d94e350fdfd1bb82a4b38eea4653d63057..8ecf9bdd3c6facad11adb96b3713f2fcb5167db2 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,5 +1,6 @@ 
 ifeq ($(subdir),string)
 sysdep_routines += \
+  mem_mops \
   memchr_generic \
   memchr_nosimd \
   memcpy_a64fx \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index d274f01fdbd1c99d7f872bbdcdc7b2a1cbc7b042..da7f1153778efd41a15ef5aa900252f9e492b18d 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -41,6 +41,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)
 	      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_sve)
 #endif
+	      IFUNC_IMPL_ADD (array, i, memcpy, mops, __memcpy_mops)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
   IFUNC_IMPL (i, name, memmove,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
@@ -50,6 +51,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_a64fx)
 	      IFUNC_IMPL_ADD (array, i, memmove, sve, __memmove_sve)
 #endif
+	      IFUNC_IMPL_ADD (array, i, memmove, mops, __memmove_mops)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
   IFUNC_IMPL (i, name, memset,
 	      /* Enable this on non-falkor processors too so that other cores
@@ -60,6 +62,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #if HAVE_AARCH64_SVE_ASM
 	      IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 256, __memset_a64fx)
 #endif
+	      IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
   IFUNC_IMPL (i, name, memchr,
 	      IFUNC_IMPL_ADD (array, i, memchr, !mte, __memchr_nosimd)
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index 6de081e3814d55812572c889fb8749e0e4e6222a..e23e6ff29042a68ce5499eddcc35b4f7f2ffa14d 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -35,4 +35,6 @@ 
   bool __attribute__((unused)) mte =					      \
     MTE_ENABLED ();							      \
   bool __attribute__((unused)) sve =					      \
-    GLRO(dl_aarch64_cpu_features).sve;
+    GLRO(dl_aarch64_cpu_features).sve;					      \
+  bool __attribute__((unused)) mops =					      \
+    GLRO(dl_aarch64_cpu_features).mops;
diff --git a/sysdeps/aarch64/multiarch/mem_mops.S b/sysdeps/aarch64/multiarch/mem_mops.S
new file mode 100644
index 0000000000000000000000000000000000000000..36316f14e369c1dddb3fc3f234160195861cc8da
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/mem_mops.S
@@ -0,0 +1,69 @@ 
+/* Optimized memcpy/memmove/memset for MOPS.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, MOPS.
+ *
+ */
+
+ENTRY (__memcpy_mops)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	mov	x3, x0
+	.inst	0x19010443	/* cpyfp   [x3]!, [x1]!, x2!  */
+	.inst	0x19410443	/* cpyfm   [x3]!, [x1]!, x2!  */
+	.inst	0x19810443	/* cpyfe   [x3]!, [x1]!, x2!  */
+	ret
+
+END (__memcpy_mops)
+libc_hidden_builtin_def (__memcpy_mops)
+
+
+ENTRY (__memmove_mops)
+	PTR_ARG (0)
+	PTR_ARG (1)
+	SIZE_ARG (2)
+
+	mov	x3, x0
+	.inst	0x1d010443	/* cpyp    [x3]!, [x1]!, x2!  */
+	.inst	0x1d410443	/* cpym    [x3]!, [x1]!, x2!  */
+	.inst	0x1d810443	/* cpye    [x3]!, [x1]!, x2!  */
+	ret
+
+END (__memmove_mops)
+libc_hidden_builtin_def (__memmove_mops)
+
+
+ENTRY (__memset_mops)
+	PTR_ARG (0)
+	SIZE_ARG (2)
+
+	mov     x3, x0
+	.inst   0x19c10443	/* setp    [x3]!, x2!, x1  */
+	.inst   0x19c14443	/* setm    [x3]!, x2!, x1  */
+	.inst   0x19c18443	/* sete    [x3]!, x2!, x1  */
+	ret
+
+END (__memset_mops)
+libc_hidden_builtin_def (__memset_mops)
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 3aae915c5f479fe5426399fbaa2f951d5771a997..9aace954cbfd1eb3e2b35e570e4eb31bbb3c6cfe 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -34,12 +34,16 @@  extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_a64fx attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_sve attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_mops attribute_hidden;
 
 static inline __typeof (__redirect_memcpy) *
 select_memcpy_ifunc (void)
 {
   INIT_ARCH ();
 
+  if (mops)
+    return __memcpy_mops;
+
   if (sve && HAVE_AARCH64_SVE_ASM)
     {
       if (IS_A64FX (midr))
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 312f90f111868c71dd1f32f4a175a28e93c948ee..fd346e7b73a86a076ba8e1cdd7fd588098333f48 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -34,12 +34,16 @@  extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_a64fx attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_sve attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_mops attribute_hidden;
 
 static inline __typeof (__redirect_memmove) *
 select_memmove_ifunc (void)
 {
   INIT_ARCH ();
 
+  if (mops)
+    return __memmove_mops;
+
   if (sve && HAVE_AARCH64_SVE_ASM)
     {
       if (IS_A64FX (midr))
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index f9c81d3d8e46ec86d9f100835e1c43210899770a..23fc66e15879847557b0e4f6941f03bc7ac5cab9 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -33,12 +33,16 @@  extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
 extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
 extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
 extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
+extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
 
 static inline __typeof (__redirect_memset) *
 select_memset_ifunc (void)
 {
   INIT_ARCH ();
 
+  if (mops)
+    return __memset_mops;
+
   if (sve && HAVE_AARCH64_SVE_ASM)
     {
       if (IS_A64FX (midr) && zva_size == 256)
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index d67d286b536ea599dcd8873fa2b882c5c27e8b05..40b709677d86f040c653315199f62677425abc58 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -76,6 +76,7 @@  struct cpu_features
   /* Currently, the GLIBC memory tagging tunable only defines 8 bits.  */
   uint8_t mte_state;
   bool sve;
+  bool mops;
 };
 
 #endif /* _CPU_FEATURES_AARCH64_H  */
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index dc09c1c8274855c8215c2c6d57af9a2b5f8a7e2f..233d5b2407e2b792805b7fa661852f59fca0cb71 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -120,4 +120,7 @@  init_cpu_features (struct cpu_features *cpu_features)
 
   /* Check if SVE is supported.  */
   cpu_features->sve = GLRO (dl_hwcap) & HWCAP_SVE;
+
+  /* Check if MOPS is supported.  */
+  cpu_features->mops = GLRO (dl_hwcap2) & HWCAP2_MOPS;
 }