
[4/4] Libatomic: Clean up AArch64 `atomic_16.S' implementation file

Message ID: 20240516133647.2453141-5-victor.donascimento@arm.com
State: New
Series: Libatomic: Cleanup ifunc selector and aliasing

Commit Message

Victor Do Nascimento May 16, 2024, 1:36 p.m. UTC
At present, `atomic_16.S' groups the different implementations of each
function together in the file.  As an example, the LSE128
implementation of `exchange_16' immediately follows its core
implementation, as does the LSE128 implementation of `fetch_or_16'.

Such extension-specific implementations rely on both ifunc support and
assembler support for the relevant architectural extension.  Each may
therefore need to be guarded by two preprocessor conditionals, e.g.
`#if HAVE_IFUNC' and `#if HAVE_FEAT_LSE128'.

Having to apply these guards on a per-function basis adds unnecessary
clutter to the file and makes its maintenance more error-prone.

We therefore reorganize the layout of the file so that all core
implementations needing no `#ifdef's are placed first, followed by all
ifunc-dependent implementations, which can then be guarded by a single
`#if HAVE_IFUNC'.  Within that guard, the functions are further
subdivided by architectural extension requirements so that, for
example, all LSE128-specific functions can be guarded by a single
`#if HAVE_FEAT_LSE128', greatly reducing the overall number of
required `#ifdef' conditionals, as sketched below.
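
Schematically, the reorganized file then looks as follows.  This is an
illustrative sketch only (function bodies elided; the actual code is
in the patch below):

    /* Core implementations: always built, no guards needed.  */
    ENTRY (load_16) ... END (load_16)
    ENTRY (store_16) ... END (store_16)
    ...

    #if HAVE_IFUNC
    /* ifunc-selected variants needing only LSE/LSE2 support.  */
    ENTRY_FEAT (load_16, LSE2) ... END_FEAT (load_16, LSE2)
    ENTRY_FEAT (compare_exchange_16, LSE) ... END_FEAT (compare_exchange_16, LSE)

    # if HAVE_FEAT_LSE128
    /* Variants additionally needing LSE128 assembler support.  */
    ENTRY_FEAT (exchange_16, LSE128) ... END_FEAT (exchange_16, LSE128)
    ENTRY_FEAT (fetch_or_16, LSE128) ... END_FEAT (fetch_or_16, LSE128)
    ...
    # endif /* HAVE_FEAT_LSE128 */
    #endif /* HAVE_IFUNC */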

libatomic/ChangeLog:

	* config/linux/aarch64/atomic_16.S: Reshuffle functions.
---
 libatomic/config/linux/aarch64/atomic_16.S | 583 ++++++++++-----------
 1 file changed, 288 insertions(+), 295 deletions(-)

Patch

diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S
index 16ff03057ab..27363f82b75 100644
--- a/libatomic/config/linux/aarch64/atomic_16.S
+++ b/libatomic/config/linux/aarch64/atomic_16.S
@@ -40,15 +40,12 @@ 
 
 #include "auto-config.h"
 
-#if !HAVE_IFUNC
-# undef HAVE_FEAT_LSE128
-# define HAVE_FEAT_LSE128 0
-#endif
-
-#define HAVE_FEAT_LSE2	HAVE_IFUNC
-
-#if HAVE_FEAT_LSE128
+#if HAVE_IFUNC
+# if HAVE_FEAT_LSE128
 	.arch	armv9-a+lse128
+# else
+	.arch	armv8-a+lse
+# endif
 #else
 	.arch	armv8-a+lse
 #endif
@@ -124,6 +121,8 @@  NAME:				\
 #define ACQ_REL 4
 #define SEQ_CST 5
 
+/* Core atomic operation implementations.  These are available irrespective of
+   ifunc support or the presence of additional architectural extensions.  */
 
 ENTRY (load_16)
 	mov	x5, x0
@@ -143,31 +142,6 @@  ENTRY (load_16)
 END (load_16)
 
 
-#if HAVE_FEAT_LSE2
-ENTRY_FEAT (load_16, LSE2)
-	cbnz	w1, 1f
-
-	/* RELAXED.  */
-	ldp	res0, res1, [x0]
-	ret
-1:
-	cmp	w1, SEQ_CST
-	b.eq	2f
-
-	/* ACQUIRE/CONSUME (Load-AcquirePC semantics).  */
-	ldp	res0, res1, [x0]
-	dmb	ishld
-	ret
-
-	/* SEQ_CST.  */
-2:	ldar	tmp0, [x0]	/* Block reordering with Store-Release instr.  */
-	ldp	res0, res1, [x0]
-	dmb	ishld
-	ret
-END_FEAT (load_16, LSE2)
-#endif
-
-
 ENTRY (store_16)
 	cbnz	w4, 2f
 
@@ -185,23 +159,6 @@  ENTRY (store_16)
 END (store_16)
 
 
-#if HAVE_FEAT_LSE2
-ENTRY_FEAT (store_16, LSE2)
-	cbnz	w4, 1f
-
-	/* RELAXED.  */
-	stp	in0, in1, [x0]
-	ret
-
-	/* RELEASE/SEQ_CST.  */
-1:	ldxp	xzr, tmp0, [x0]
-	stlxp	w4, in0, in1, [x0]
-	cbnz	w4, 1b
-	ret
-END_FEAT (store_16, LSE2)
-#endif
-
-
 ENTRY (exchange_16)
 	mov	x5, x0
 	cbnz	w4, 2f
@@ -229,31 +186,6 @@  ENTRY (exchange_16)
 END (exchange_16)
 
 
-#if HAVE_FEAT_LSE128
-ENTRY_FEAT (exchange_16, LSE128)
-	mov	tmp0, x0
-	mov	res0, in0
-	mov	res1, in1
-	cbnz	w4, 1f
-
-	/* RELAXED.  */
-	swpp	res0, res1, [tmp0]
-	ret
-1:
-	cmp	w4, ACQUIRE
-	b.hi	2f
-
-	/* ACQUIRE/CONSUME.  */
-	swppa	res0, res1, [tmp0]
-	ret
-
-	/* RELEASE/ACQ_REL/SEQ_CST.  */
-2:	swppal	res0, res1, [tmp0]
-	ret
-END_FEAT (exchange_16, LSE128)
-#endif
-
-
 ENTRY (compare_exchange_16)
 	ldp	exp0, exp1, [x1]
 	cbz	w4, 3f
@@ -301,43 +233,97 @@  ENTRY (compare_exchange_16)
 END (compare_exchange_16)
 
 
-#if HAVE_FEAT_LSE2
-ENTRY_FEAT (compare_exchange_16, LSE)
-	ldp	exp0, exp1, [x1]
-	mov	tmp0, exp0
-	mov	tmp1, exp1
-	cbz	w4, 2f
-	cmp	w4, RELEASE
-	b.hs	3f
+ENTRY (fetch_or_16)
+	mov	x5, x0
+	cbnz	w4, 2f
 
-	/* ACQUIRE/CONSUME.  */
-	caspa	exp0, exp1, in0, in1, [x0]
-0:
-	cmp	exp0, tmp0
-	ccmp	exp1, tmp1, 0, eq
-	bne	1f
-	mov	x0, 1
+	/* RELAXED.  */
+1:	ldxp	res0, res1, [x5]
+	orr	tmp0, res0, in0
+	orr	tmp1, res1, in1
+	stxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 1b
 	ret
-1:
-	stp	exp0, exp1, [x1]
-	mov	x0, 0
+
+	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+2:	ldaxp	res0, res1, [x5]
+	orr	tmp0, res0, in0
+	orr	tmp1, res1, in1
+	stlxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 2b
 	ret
+END (fetch_or_16)
+
+
+ENTRY (or_fetch_16)
+	mov	x5, x0
+	cbnz	w4, 2f
 
 	/* RELAXED.  */
-2:	casp	exp0, exp1, in0, in1, [x0]
-	b	0b
+1:	ldxp	res0, res1, [x5]
+	orr	res0, res0, in0
+	orr	res1, res1, in1
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
+	ret
 
-	/* RELEASE.  */
-3:	b.hi	4f
-	caspl	exp0, exp1, in0, in1, [x0]
-	b	0b
+	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+2:	ldaxp	res0, res1, [x5]
+	orr	res0, res0, in0
+	orr	res1, res1, in1
+	stlxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
+	ret
+END (or_fetch_16)
+
+
+ENTRY (fetch_and_16)
+	mov	x5, x0
+	cbnz	w4, 2f
+
+	/* RELAXED.  */
+1:	ldxp	res0, res1, [x5]
+	and	tmp0, res0, in0
+	and	tmp1, res1, in1
+	stxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 1b
+	ret
+
+	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+2:	ldaxp	res0, res1, [x5]
+	and	tmp0, res0, in0
+	and	tmp1, res1, in1
+	stlxp	w4, tmp0, tmp1, [x5]
+	cbnz	w4, 2b
+	ret
+END (fetch_and_16)
+
+
+ENTRY (and_fetch_16)
+	mov	x5, x0
+	cbnz	w4, 2f
+
+	/* RELAXED.  */
+1:	ldxp	res0, res1, [x5]
+	and	res0, res0, in0
+	and	res1, res1, in1
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
+	ret
+
+	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+2:	ldaxp	res0, res1, [x5]
+	and	res0, res0, in0
+	and	res1, res1, in1
+	stlxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
+	ret
+END (and_fetch_16)
 
-	/* ACQ_REL/SEQ_CST.  */
-4:	caspal	exp0, exp1, in0, in1, [x0]
-	b	0b
-END_FEAT (compare_exchange_16, LSE)
-#endif
 
+/* The following functions are currently single-implementation operations,
+   so they are never assigned an ifunc selector.  As such, they must be
+   reachable from __atomic_* entrypoints.  */
 
 ENTRY_ALIASED (fetch_add_16)
 	mov	x5, x0
@@ -427,309 +413,316 @@  ENTRY_ALIASED (sub_fetch_16)
 END (sub_fetch_16)
 
 
-ENTRY (fetch_or_16)
+ENTRY_ALIASED (fetch_xor_16)
 	mov	x5, x0
 	cbnz	w4, 2f
 
 	/* RELAXED.  */
 1:	ldxp	res0, res1, [x5]
-	orr	tmp0, res0, in0
-	orr	tmp1, res1, in1
+	eor	tmp0, res0, in0
+	eor	tmp1, res1, in1
 	stxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 1b
 	ret
 
 	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
 2:	ldaxp	res0, res1, [x5]
-	orr	tmp0, res0, in0
-	orr	tmp1, res1, in1
+	eor	tmp0, res0, in0
+	eor	tmp1, res1, in1
 	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (fetch_or_16)
+END (fetch_xor_16)
 
 
-#if HAVE_FEAT_LSE128
-ENTRY_FEAT (fetch_or_16, LSE128)
-	mov	tmp0, x0
-	mov	res0, in0
-	mov	res1, in1
-	cbnz	w4, 1f
+ENTRY_ALIASED (xor_fetch_16)
+	mov	x5, x0
+	cbnz	w4, 2f
 
 	/* RELAXED.  */
-	ldsetp	res0, res1, [tmp0]
-	ret
-1:
-	cmp	w4, ACQUIRE
-	b.hi	2f
-
-	/* ACQUIRE/CONSUME.  */
-	ldsetpa	res0, res1, [tmp0]
+1:	ldxp	res0, res1, [x5]
+	eor	res0, res0, in0
+	eor	res1, res1, in1
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
 	ret
 
-	/* RELEASE/ACQ_REL/SEQ_CST.  */
-2:	ldsetpal	res0, res1, [tmp0]
+	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+2:	ldaxp	res0, res1, [x5]
+	eor	res0, res0, in0
+	eor	res1, res1, in1
+	stlxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
 	ret
-END_FEAT (fetch_or_16, LSE128)
-#endif
+END (xor_fetch_16)
 
 
-ENTRY (or_fetch_16)
+ENTRY_ALIASED (fetch_nand_16)
 	mov	x5, x0
+	mvn	in0, in0
+	mvn	in1, in1
 	cbnz	w4, 2f
 
 	/* RELAXED.  */
 1:	ldxp	res0, res1, [x5]
-	orr	res0, res0, in0
-	orr	res1, res1, in1
-	stxp	w4, res0, res1, [x5]
+	orn	tmp0, in0, res0
+	orn	tmp1, in1, res1
+	stxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 1b
 	ret
 
 	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
 2:	ldaxp	res0, res1, [x5]
-	orr	res0, res0, in0
-	orr	res1, res1, in1
-	stlxp	w4, res0, res1, [x5]
+	orn	tmp0, in0, res0
+	orn	tmp1, in1, res1
+	stlxp	w4, tmp0, tmp1, [x5]
 	cbnz	w4, 2b
 	ret
-END (or_fetch_16)
+END (fetch_nand_16)
 
 
-#if HAVE_FEAT_LSE128
-ENTRY_FEAT (or_fetch_16, LSE128)
-	cbnz	w4, 1f
-	mov	tmp0, in0
-	mov	tmp1, in1
+ENTRY_ALIASED (nand_fetch_16)
+	mov	x5, x0
+	mvn	in0, in0
+	mvn	in1, in1
+	cbnz	w4, 2f
 
 	/* RELAXED.  */
-	ldsetp	in0, in1, [x0]
-	orr	res0, in0, tmp0
-	orr	res1, in1, tmp1
+1:	ldxp	res0, res1, [x5]
+	orn	res0, in0, res0
+	orn	res1, in1, res1
+	stxp	w4, res0, res1, [x5]
+	cbnz	w4, 1b
 	ret
-1:
-	cmp	w4, ACQUIRE
-	b.hi	2f
 
-	/* ACQUIRE/CONSUME.  */
-	ldsetpa	in0, in1, [x0]
-	orr	res0, in0, tmp0
-	orr	res1, in1, tmp1
+	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
+2:	ldaxp	res0, res1, [x5]
+	orn	res0, in0, res0
+	orn	res1, in1, res1
+	stlxp	w4, res0, res1, [x5]
+	cbnz	w4, 2b
 	ret
+END (nand_fetch_16)
 
-	/* RELEASE/ACQ_REL/SEQ_CST.  */
-2:	ldsetpal	in0, in1, [x0]
-	orr	res0, in0, tmp0
-	orr	res1, in1, tmp1
-	ret
-END_FEAT (or_fetch_16, LSE128)
-#endif
 
+/* __atomic_test_and_set is always inlined, so this entry is unused and
+   only required for completeness.  */
+ENTRY_ALIASED (test_and_set_16)
 
-ENTRY (fetch_and_16)
+	/* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
 	mov	x5, x0
-	cbnz	w4, 2f
-
-	/* RELAXED.  */
-1:	ldxp	res0, res1, [x5]
-	and	tmp0, res0, in0
-	and	tmp1, res1, in1
-	stxp	w4, tmp0, tmp1, [x5]
+1:	ldaxrb	w0, [x5]
+	stlxrb	w4, w2, [x5]
 	cbnz	w4, 1b
 	ret
+END (test_and_set_16)
 
-	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-2:	ldaxp	res0, res1, [x5]
-	and	tmp0, res0, in0
-	and	tmp1, res1, in1
-	stlxp	w4, tmp0, tmp1, [x5]
-	cbnz	w4, 2b
-	ret
-END (fetch_and_16)
-
+/* Ensure extension-specific implementations are not included unless ifunc
+   support is present, along with necessary assembler support.  */
 
-#if HAVE_FEAT_LSE128
-ENTRY_FEAT (fetch_and_16, LSE128)
-	mov	tmp0, x0
-	mvn	res0, in0
-	mvn	res1, in1
-	cbnz	w4, 1f
+#if HAVE_IFUNC
+ENTRY_FEAT (load_16, LSE2)
+	cbnz	w1, 1f
 
 	/* RELAXED.  */
-	ldclrp	res0, res1, [tmp0]
+	ldp	res0, res1, [x0]
 	ret
-
 1:
-	cmp	w4, ACQUIRE
-	b.hi	2f
+	cmp	w1, SEQ_CST
+	b.eq	2f
 
-	/* ACQUIRE/CONSUME.  */
-	ldclrpa res0, res1, [tmp0]
+	/* ACQUIRE/CONSUME (Load-AcquirePC semantics).  */
+	ldp	res0, res1, [x0]
+	dmb	ishld
 	ret
 
-	/* RELEASE/ACQ_REL/SEQ_CST.  */
-2:	ldclrpal	res0, res1, [tmp0]
+	/* SEQ_CST.  */
+2:	ldar	tmp0, [x0]	/* Block reordering with Store-Release instr.  */
+	ldp	res0, res1, [x0]
+	dmb	ishld
 	ret
-END_FEAT (fetch_and_16, LSE128)
-#endif
+END_FEAT (load_16, LSE2)
 
 
-ENTRY (and_fetch_16)
-	mov	x5, x0
-	cbnz	w4, 2f
+ENTRY_FEAT (store_16, LSE2)
+	cbnz	w4, 1f
 
 	/* RELAXED.  */
-1:	ldxp	res0, res1, [x5]
-	and	res0, res0, in0
-	and	res1, res1, in1
-	stxp	w4, res0, res1, [x5]
+	stp	in0, in1, [x0]
+	ret
+
+	/* RELEASE/SEQ_CST.  */
+1:	ldxp	xzr, tmp0, [x0]
+	stlxp	w4, in0, in1, [x0]
 	cbnz	w4, 1b
 	ret
+END_FEAT (store_16, LSE2)
 
-	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-2:	ldaxp	res0, res1, [x5]
-	and	res0, res0, in0
-	and	res1, res1, in1
-	stlxp	w4, res0, res1, [x5]
-	cbnz	w4, 2b
+
+ENTRY_FEAT (compare_exchange_16, LSE)
+	ldp	exp0, exp1, [x1]
+	mov	tmp0, exp0
+	mov	tmp1, exp1
+	cbz	w4, 2f
+	cmp	w4, RELEASE
+	b.hs	3f
+
+	/* ACQUIRE/CONSUME.  */
+	caspa	exp0, exp1, in0, in1, [x0]
+0:
+	cmp	exp0, tmp0
+	ccmp	exp1, tmp1, 0, eq
+	bne	1f
+	mov	x0, 1
 	ret
-END (and_fetch_16)
+1:
+	stp	exp0, exp1, [x1]
+	mov	x0, 0
+	ret
+
+	/* RELAXED.  */
+2:	casp	exp0, exp1, in0, in1, [x0]
+	b	0b
+
+	/* RELEASE.  */
+3:	b.hi	4f
+	caspl	exp0, exp1, in0, in1, [x0]
+	b	0b
+
+	/* ACQ_REL/SEQ_CST.  */
+4:	caspal	exp0, exp1, in0, in1, [x0]
+	b	0b
+END_FEAT (compare_exchange_16, LSE)
 
 
 #if HAVE_FEAT_LSE128
-ENTRY_FEAT (and_fetch_16, LSE128)
-	mvn	tmp0, in0
-	mvn	tmp0, in1
+ENTRY_FEAT (exchange_16, LSE128)
+	mov	tmp0, x0
+	mov	res0, in0
+	mov	res1, in1
 	cbnz	w4, 1f
 
 	/* RELAXED.  */
-	ldclrp	tmp0, tmp1, [x0]
-	and	res0, tmp0, in0
-	and	res1, tmp1, in1
+	swpp	res0, res1, [tmp0]
 	ret
-
 1:
 	cmp	w4, ACQUIRE
 	b.hi	2f
 
 	/* ACQUIRE/CONSUME.  */
-	ldclrpa tmp0, tmp1, [x0]
-	and	res0, tmp0, in0
-	and	res1, tmp1, in1
+	swppa	res0, res1, [tmp0]
 	ret
 
 	/* RELEASE/ACQ_REL/SEQ_CST.  */
-2:	ldclrpal	tmp0, tmp1, [x5]
-	and	res0, tmp0, in0
-	and	res1, tmp1, in1
+2:	swppal	res0, res1, [tmp0]
 	ret
-END_FEAT (and_fetch_16, LSE128)
-#endif
+END_FEAT (exchange_16, LSE128)
 
 
-ENTRY_ALIASED (fetch_xor_16)
-	mov	x5, x0
-	cbnz	w4, 2f
+ENTRY_FEAT (fetch_or_16, LSE128)
+	mov	tmp0, x0
+	mov	res0, in0
+	mov	res1, in1
+	cbnz	w4, 1f
 
 	/* RELAXED.  */
-1:	ldxp	res0, res1, [x5]
-	eor	tmp0, res0, in0
-	eor	tmp1, res1, in1
-	stxp	w4, tmp0, tmp1, [x5]
-	cbnz	w4, 1b
+	ldsetp	res0, res1, [tmp0]
 	ret
+1:
+	cmp	w4, ACQUIRE
+	b.hi	2f
 
-	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-2:	ldaxp	res0, res1, [x5]
-	eor	tmp0, res0, in0
-	eor	tmp1, res1, in1
-	stlxp	w4, tmp0, tmp1, [x5]
-	cbnz	w4, 2b
+	/* ACQUIRE/CONSUME.  */
+	ldsetpa	res0, res1, [tmp0]
 	ret
-END (fetch_xor_16)
 
+	/* RELEASE/ACQ_REL/SEQ_CST.  */
+2:	ldsetpal	res0, res1, [tmp0]
+	ret
+END_FEAT (fetch_or_16, LSE128)
 
-ENTRY_ALIASED (xor_fetch_16)
-	mov	x5, x0
-	cbnz	w4, 2f
+
+ENTRY_FEAT (or_fetch_16, LSE128)
+	cbnz	w4, 1f
+	mov	tmp0, in0
+	mov	tmp1, in1
 
 	/* RELAXED.  */
-1:	ldxp	res0, res1, [x5]
-	eor	res0, res0, in0
-	eor	res1, res1, in1
-	stxp	w4, res0, res1, [x5]
-	cbnz	w4, 1b
+	ldsetp	in0, in1, [x0]
+	orr	res0, in0, tmp0
+	orr	res1, in1, tmp1
 	ret
+1:
+	cmp	w4, ACQUIRE
+	b.hi	2f
 
-	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-2:	ldaxp	res0, res1, [x5]
-	eor	res0, res0, in0
-	eor	res1, res1, in1
-	stlxp	w4, res0, res1, [x5]
-	cbnz	w4, 2b
+	/* ACQUIRE/CONSUME.  */
+	ldsetpa	in0, in1, [x0]
+	orr	res0, in0, tmp0
+	orr	res1, in1, tmp1
 	ret
-END (xor_fetch_16)
 
+	/* RELEASE/ACQ_REL/SEQ_CST.  */
+2:	ldsetpal	in0, in1, [x0]
+	orr	res0, in0, tmp0
+	orr	res1, in1, tmp1
+	ret
+END_FEAT (or_fetch_16, LSE128)
 
-ENTRY_ALIASED (fetch_nand_16)
-	mov	x5, x0
-	mvn	in0, in0
-	mvn	in1, in1
-	cbnz	w4, 2f
+
+ENTRY_FEAT (fetch_and_16, LSE128)
+	mov	tmp0, x0
+	mvn	res0, in0
+	mvn	res1, in1
+	cbnz	w4, 1f
 
 	/* RELAXED.  */
-1:	ldxp	res0, res1, [x5]
-	orn	tmp0, in0, res0
-	orn	tmp1, in1, res1
-	stxp	w4, tmp0, tmp1, [x5]
-	cbnz	w4, 1b
+	ldclrp	res0, res1, [tmp0]
 	ret
 
-	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-2:	ldaxp	res0, res1, [x5]
-	orn	tmp0, in0, res0
-	orn	tmp1, in1, res1
-	stlxp	w4, tmp0, tmp1, [x5]
-	cbnz	w4, 2b
+1:
+	cmp	w4, ACQUIRE
+	b.hi	2f
+
+	/* ACQUIRE/CONSUME.  */
+	ldclrpa res0, res1, [tmp0]
 	ret
-END (fetch_nand_16)
 
+	/* RELEASE/ACQ_REL/SEQ_CST.  */
+2:	ldclrpal	res0, res1, [tmp0]
+	ret
+END_FEAT (fetch_and_16, LSE128)
 
-ENTRY_ALIASED (nand_fetch_16)
-	mov	x5, x0
-	mvn	in0, in0
-	mvn	in1, in1
-	cbnz	w4, 2f
 
-	/* RELAXED.  */
-1:	ldxp	res0, res1, [x5]
-	orn	res0, in0, res0
-	orn	res1, in1, res1
-	stxp	w4, res0, res1, [x5]
-	cbnz	w4, 1b
-	ret
+ENTRY_FEAT (and_fetch_16, LSE128)
+	mvn	tmp0, in0
+	mvn	tmp0, in1
+	cbnz	w4, 1f
 
-	/* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-2:	ldaxp	res0, res1, [x5]
-	orn	res0, in0, res0
-	orn	res1, in1, res1
-	stlxp	w4, res0, res1, [x5]
-	cbnz	w4, 2b
+	/* RELAXED.  */
+	ldclrp	tmp0, tmp1, [x0]
+	and	res0, tmp0, in0
+	and	res1, tmp1, in1
 	ret
-END (nand_fetch_16)
 
+1:
+	cmp	w4, ACQUIRE
+	b.hi	2f
 
-/* __atomic_test_and_set is always inlined, so this entry is unused and
-   only required for completeness.  */
-ENTRY_ALIASED (test_and_set_16)
+	/* ACQUIRE/CONSUME.  */
+	ldclrpa tmp0, tmp1, [x0]
+	and	res0, tmp0, in0
+	and	res1, tmp1, in1
+	ret
 
-	/* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST.  */
-	mov	x5, x0
-1:	ldaxrb	w0, [x5]
-	stlxrb	w4, w2, [x5]
-	cbnz	w4, 1b
+	/* RELEASE/ACQ_REL/SEQ_CST.  */
+2:	ldclrpal	tmp0, tmp1, [x5]
+	and	res0, tmp0, in0
+	and	res1, tmp1, in1
 	ret
-END (test_and_set_16)
+END_FEAT (and_fetch_16, LSE128)
+#endif /* HAVE_FEAT_LSE128 */
+#endif /* HAVE_IFUNC */
 
 
 /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code.  */