aarch64: Optimized implementation of pthread_spin_lock and unlock
diff mbox series

Message ID 20191028133911.8612-1-zhangxuelei4@huawei.com
State New
Headers show
Series
  • aarch64: Optimized implementation of pthread_spin_lock and unlock
Related show

Commit Message

Xuelei Zhang Oct. 28, 2019, 1:39 p.m. UTC
An optimized algorithm of spin_lock is implemented to wait for a
random time in the case that multiple threads want to store the
spinlock at the same time. This makes the operations of different
threads asynchronous, thereby reducing bus conflicts and further
improving the overall performance, which benefits aarch64 in
particular because of its multi-core features.

In addition, the assembly version of spin_unlock is also implemented.
---
 sysdeps/aarch64/nptl/pthread_spin_lock.S   | 103 +++++++++++++++++++++++++++++
 sysdeps/aarch64/nptl/pthread_spin_unlock.S |  34 ++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 sysdeps/aarch64/nptl/pthread_spin_lock.S
 create mode 100644 sysdeps/aarch64/nptl/pthread_spin_unlock.S

Comments

Szabolcs Nagy Oct. 28, 2019, 4:17 p.m. UTC | #1
On 28/10/2019 13:39, Xuelei Zhang wrote:
> An optimized algorithm of spin_lock is implemented to wait for
> random time in the case that multi-threads want to store spinlock
> at the same time. This way can make the operation of different
> threads asynchronous, thereby reducing bus conflicts, and futherly
> improving the overall performance, which benefits more on aarch64
> for its multi-core features.
> 
> In addition, the assembly version of spin_unlock is also implemented.

this has many issues

spinlock benchmarking is non-trivial (workload dependent,
uarch dependent and memory system dependent) and you need
to demonstrate that the new code is significantly better
than the generic c code, otherwise we don't want to
maintain asm in the aarch64 port.

the generic c code may gain improvements that outperform
whatever asm implementation (e.g. numa aware spinlock)
and allows compiling the code to different targets
(e.g. armv8.1-a+lse).

the Arm Architecture Reference Manual (the "Arm ARM") has a spin
lock implementation in the "barrier litmus tests" section using wfe
and sevl; if i had to add asm i'd probably start with that.

> +ENTRY (pthread_spin_lock)
> +	DELOUSE (0)
> +
> +	mov	w1, #0x1
> +
> +L(spin):
> +	prfm	pldl1strm, [x0]
> +    /* A count to distinguish wating time.
> +    The larger the value, the more intense the current lock
> +    conflicts, and the longer the waiting time. */
> +	add	w3, w3, #0x1

w3 is not initialized, i would not call that "wait for
random time", since the lock contention is most likely
between the exact same code path executing in different
threads with identical w3 at function entry.

> +	ldaxr	w2, [x0]
> +	cbnz	w2, L(spin)
> +	stxr	w2, w1, [x0]
> +	cbnz	w2, 1f
> +	b	L(end)
> +
> +L(end):
> +	mov w0, #0x0
> +	ret
> +
> +    /* Set the loop times of L(wait) from 1000 to 7000 cycles,
> +    equals waiting 1~2us per 1000 cycles. */
> +	.p2align 4
> +7:
> +	mov	w6, #0x1b58
> +	b	L(wait_init)
> +6:
> +	mov	w6, #0x1770
> +	b	L(wait_init)
> +5:
> +	mov	w6, #0x1388
> +	b	L(wait_init)
> +4:
> +	mov	w6, #0x0fa0
> +	b	L(wait_init)
> +3:
> +	mov	w6, #0x0bb8
> +	b	L(wait_init)
> +2:
> +	mov	w6, #0x07d0
> +	b	L(wait_init)
> +1:
> +	mov	w6, #0x03e8
> +	b	L(wait_init)
> +
> +L(wait_init):
> +	mov	w5, #0x0
> +L(wait):
> +	add	w5, w5, #0x1
> +	cmp	w5, w6
> +	b.lt	L(wait) /* Wait ends when w5 equals w6. */
> +
> +L(stxr_try):
> +	ldr	w2, [x0]
> +	cbz	w2, L(spin)
> +	and	w3, w3, #0x07
> +    /* 8 kinds of distinguish wating time 
> +    based on the lower three bits of w3. */
> +	cmp	w3, #0x01
> +	beq	1b
> +	cmp	w3, #0x02
> +	beq	2b
> +	cmp	w3, #0x03
> +	beq	3b
> +	cmp	w3, #0x04
> +	beq	4b
> +	cmp	w3, #0x05
> +	beq	5b
> +	cmp	w3, #0x06
> +	beq	6b
> +	cmp	w3, #0x07
> +	beq	7b
> +
> +	wfe /* w3=0x000: the most intense situation so to wait 50 us */
> +	b	L(stxr_try)
> +
> +
> +END (pthread_spin_lock)
> +libc_hidden_builtin_def (pthread_spin_lock)
> +weak_alias (pthread_spin_lock, index)

that weak_alias is wrong.

> +ENTRY (pthread_spin_unlock)
> +	DELOUSE (0)
> +	stlr	wzr, [x0]

note: current glibc code would generate

  dmb ish
  str wzr, [x0]

instead of stlr, which is faster on some cores,
but has weaker ordering guarantees.

> +	mov	w0, #0x0
> +	ret
> +
> +END (pthread_spin_unlock)
> +libc_hidden_builtin_def (pthread_spin_unlock)
> +weak_alias (pthread_spin_unlock, index)

this weak_alias is wrong too.
Andrew Pinski Oct. 28, 2019, 5:46 p.m. UTC | #2
On Mon, Oct 28, 2019 at 6:39 AM Xuelei Zhang <zhangxuelei4@huawei.com> wrote:
>
> An optimized algorithm of spin_lock is implemented to wait for
> random time in the case that multi-threads want to store spinlock
> at the same time. This way can make the operation of different
> threads asynchronous, thereby reducing bus conflicts, and futherly
> improving the overall performance, which benefits more on aarch64
> for its multi-core features.
>
> In addition, the assembly version of spin_unlock is also implemented.
> ---
>  sysdeps/aarch64/nptl/pthread_spin_lock.S   | 103 +++++++++++++++++++++++++++++
>  sysdeps/aarch64/nptl/pthread_spin_unlock.S |  34 ++++++++++
>  2 files changed, 137 insertions(+)
>  create mode 100644 sysdeps/aarch64/nptl/pthread_spin_lock.S
>  create mode 100644 sysdeps/aarch64/nptl/pthread_spin_unlock.S
>
> diff --git a/sysdeps/aarch64/nptl/pthread_spin_lock.S b/sysdeps/aarch64/nptl/pthread_spin_lock.S
> new file mode 100644
> index 00000000000..662410c5fa3
> --- /dev/null
> +++ b/sysdeps/aarch64/nptl/pthread_spin_lock.S
> @@ -0,0 +1,103 @@
> +/* pthread_spin_lock -- lock a spin lock. Generic version.
> +   Copyright (C) 2012-2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64
> + */
> +
> +ENTRY (pthread_spin_lock)
> +       DELOUSE (0)
> +
> +       mov     w1, #0x1
> +
> +L(spin):
> +       prfm    pldl1strm, [x0]
> +    /* A count to distinguish wating time.
> +    The larger the value, the more intense the current lock
> +    conflicts, and the longer the waiting time. */
> +       add     w3, w3, #0x1
> +       ldaxr   w2, [x0]
> +       cbnz    w2, L(spin)
> +       stxr    w2, w1, [x0]
> +       cbnz    w2, 1f
> +       b       L(end)
> +
> +L(end):
> +       mov w0, #0x0
> +       ret
> +
> +    /* Set the loop times of L(wait) from 1000 to 7000 cycles,
> +    equals waiting 1~2us per 1000 cycles. */
> +       .p2align 4
> +7:
> +       mov     w6, #0x1b58
> +       b       L(wait_init)
> +6:
> +       mov     w6, #0x1770
> +       b       L(wait_init)
> +5:
> +       mov     w6, #0x1388
> +       b       L(wait_init)
> +4:
> +       mov     w6, #0x0fa0
> +       b       L(wait_init)
> +3:
> +       mov     w6, #0x0bb8
> +       b       L(wait_init)
> +2:
> +       mov     w6, #0x07d0
> +       b       L(wait_init)
> +1:
> +       mov     w6, #0x03e8
> +       b       L(wait_init)
> +
> +L(wait_init):
> +       mov     w5, #0x0
> +L(wait):
> +       add     w5, w5, #0x1
> +       cmp     w5, w6
> +       b.lt    L(wait) /* Wait ends when w5 equals w6. */

I could see this loop even taking less than one cycle per iteration
with a micro-op loop cache.

But really my bet is that w3 will be 0 in 99% of cases, so it won't be that random after all.
If this had the ability to become a ticket lock, then you could use
the difference between the ticket and the next ticket for the loop
time.  But since the ABI is set in stone (that is the unlocked value
needs to be all zeros), it does not have the ability to become a
ticket spin lock.

Maybe one way is to use the lower bits of the thread ID.  I don't know
if that would be fast enough to get though.  Maybe you could do
popcount on an TLS address; though that takes a few cycles due to
using the vector registers (no scalar popcount instruction).

Thanks,
Andrew Pinski

> +
> +L(stxr_try):
> +       ldr     w2, [x0]
> +       cbz     w2, L(spin)
> +       and     w3, w3, #0x07
> +    /* 8 kinds of distinguish wating time
> +    based on the lower three bits of w3. */
> +       cmp     w3, #0x01
> +       beq     1b
> +       cmp     w3, #0x02
> +       beq     2b
> +       cmp     w3, #0x03
> +       beq     3b
> +       cmp     w3, #0x04
> +       beq     4b
> +       cmp     w3, #0x05
> +       beq     5b
> +       cmp     w3, #0x06
> +       beq     6b
> +       cmp     w3, #0x07
> +       beq     7b
> +
> +       wfe /* w3=0x000: the most intense situation so to wait 50 us */
> +       b       L(stxr_try)
> +
> +
> +END (pthread_spin_lock)
> +libc_hidden_builtin_def (pthread_spin_lock)
> +weak_alias (pthread_spin_lock, index)
> diff --git a/sysdeps/aarch64/nptl/pthread_spin_unlock.S b/sysdeps/aarch64/nptl/pthread_spin_unlock.S
> new file mode 100644
> index 00000000000..57a6bca86fe
> --- /dev/null
> +++ b/sysdeps/aarch64/nptl/pthread_spin_unlock.S
> @@ -0,0 +1,34 @@
> +/* pthread_spin_unlock -- unlock a spin lock. Generic version.
> +   Copyright (C) 2012-2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64
> + */
> +
> +ENTRY (pthread_spin_unlock)
> +       DELOUSE (0)
> +       stlr    wzr, [x0]
> +       mov     w0, #0x0
> +       ret
> +
> +END (pthread_spin_unlock)
> +libc_hidden_builtin_def (pthread_spin_unlock)
> +weak_alias (pthread_spin_unlock, index)
> --
> 2.14.1.windows.1
>
>
Andrew Pinski Oct. 28, 2019, 5:53 p.m. UTC | #3
On Mon, Oct 28, 2019 at 6:39 AM Xuelei Zhang <zhangxuelei4@huawei.com> wrote:
>
> An optimized algorithm of spin_lock is implemented to wait for
> random time in the case that multi-threads want to store spinlock
> at the same time. This way can make the operation of different
> threads asynchronous, thereby reducing bus conflicts, and futherly
> improving the overall performance, which benefits more on aarch64
> for its multi-core features.
>
> In addition, the assembly version of spin_unlock is also implemented.
> ---
>  sysdeps/aarch64/nptl/pthread_spin_lock.S   | 103 +++++++++++++++++++++++++++++
>  sysdeps/aarch64/nptl/pthread_spin_unlock.S |  34 ++++++++++
>  2 files changed, 137 insertions(+)
>  create mode 100644 sysdeps/aarch64/nptl/pthread_spin_lock.S
>  create mode 100644 sysdeps/aarch64/nptl/pthread_spin_unlock.S
>
> diff --git a/sysdeps/aarch64/nptl/pthread_spin_lock.S b/sysdeps/aarch64/nptl/pthread_spin_lock.S
> new file mode 100644
> index 00000000000..662410c5fa3
> --- /dev/null
> +++ b/sysdeps/aarch64/nptl/pthread_spin_lock.S
> @@ -0,0 +1,103 @@
> +/* pthread_spin_lock -- lock a spin lock. Generic version.
> +   Copyright (C) 2012-2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64
> + */
> +
> +ENTRY (pthread_spin_lock)
> +       DELOUSE (0)
> +
> +       mov     w1, #0x1
> +
> +L(spin):
> +       prfm    pldl1strm, [x0]
> +    /* A count to distinguish wating time.
> +    The larger the value, the more intense the current lock
> +    conflicts, and the longer the waiting time. */
> +       add     w3, w3, #0x1
> +       ldaxr   w2, [x0]
> +       cbnz    w2, L(spin)
> +       stxr    w2, w1, [x0]
> +       cbnz    w2, 1f
> +       b       L(end)
> +
> +L(end):
> +       mov w0, #0x0
> +       ret
> +
> +    /* Set the loop times of L(wait) from 1000 to 7000 cycles,
> +    equals waiting 1~2us per 1000 cycles. */
> +       .p2align 4
> +7:
> +       mov     w6, #0x1b58
> +       b       L(wait_init)
> +6:
> +       mov     w6, #0x1770
> +       b       L(wait_init)
> +5:
> +       mov     w6, #0x1388
> +       b       L(wait_init)
> +4:
> +       mov     w6, #0x0fa0
> +       b       L(wait_init)
> +3:
> +       mov     w6, #0x0bb8
> +       b       L(wait_init)
> +2:
> +       mov     w6, #0x07d0
> +       b       L(wait_init)
> +1:
> +       mov     w6, #0x03e8
> +       b       L(wait_init)
> +
> +L(wait_init):
> +       mov     w5, #0x0
> +L(wait):
> +       add     w5, w5, #0x1
> +       cmp     w5, w6
> +       b.lt    L(wait) /* Wait ends when w5 equals w6. */
> +
> +L(stxr_try):
> +       ldr     w2, [x0]
> +       cbz     w2, L(spin)
> +       and     w3, w3, #0x07
> +    /* 8 kinds of distinguish wating time
> +    based on the lower three bits of w3. */
> +       cmp     w3, #0x01
> +       beq     1b
> +       cmp     w3, #0x02
> +       beq     2b
> +       cmp     w3, #0x03
> +       beq     3b
> +       cmp     w3, #0x04
> +       beq     4b
> +       cmp     w3, #0x05
> +       beq     5b
> +       cmp     w3, #0x06
> +       beq     6b
> +       cmp     w3, #0x07
> +       beq     7b
> +
> +       wfe /* w3=0x000: the most intense situation so to wait 50 us */
> +       b       L(stxr_try)
> +
> +
> +END (pthread_spin_lock)
> +libc_hidden_builtin_def (pthread_spin_lock)
> +weak_alias (pthread_spin_lock, index)
> diff --git a/sysdeps/aarch64/nptl/pthread_spin_unlock.S b/sysdeps/aarch64/nptl/pthread_spin_unlock.S
> new file mode 100644
> index 00000000000..57a6bca86fe
> --- /dev/null
> +++ b/sysdeps/aarch64/nptl/pthread_spin_unlock.S
> @@ -0,0 +1,34 @@
> +/* pthread_spin_unlock -- unlock a spin lock. Generic version.
> +   Copyright (C) 2012-2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64
> + */
> +
> +ENTRY (pthread_spin_unlock)
> +       DELOUSE (0)
> +       stlr    wzr, [x0]
> +       mov     w0, #0x0

Actually the C version of pthread_spin_unlock is implemented this way
already and provides exactly this same assembly.
Why do you need an assembly version?

Thanks,
Andrew Pinski

PS for reference:
./nptl/pthread_spin_unlock.o:     file format elf64-littleaarch64


Disassembly of section .text:

0000000000000000 <pthread_spin_unlock>:
   0:   889ffc1f        stlr    wzr, [x0]
   4:   52800000        mov     w0, #0x0                        // #0
   8:   d65f03c0        ret




> +       ret
> +
> +END (pthread_spin_unlock)
> +libc_hidden_builtin_def (pthread_spin_unlock)
> +weak_alias (pthread_spin_unlock, index)
> --
> 2.14.1.windows.1
>
>
Szabolcs Nagy Oct. 29, 2019, 10:31 a.m. UTC | #4
On 28/10/2019 17:53, Andrew Pinski wrote:
> On Mon, Oct 28, 2019 at 6:39 AM Xuelei Zhang <zhangxuelei4@huawei.com> wrote:
>> +ENTRY (pthread_spin_unlock)
>> +       DELOUSE (0)
>> +       stlr    wzr, [x0]
>> +       mov     w0, #0x0
> 
> Actually the C version of pthread_spin_unlock is implemented this way
> already and provides exactly this same assembly.
> Why do you need an assembly version?

if lock is in asm, then unlock must be in asm too.

we don't want to reason about synchronization
between asm and c code as their memory model is
defined differently and we don't want to rely
on a particular implementation in the compiler.

and the generic c code may change independently
(e.g. spin_unlock used to be barrier + store
but changed at some point, such change can easily
break the synchronization without us noticing)

i'd prefer to improve the generic c code if
possible instead of using asm.
Wilco Dijkstra Oct. 29, 2019, 11:54 a.m. UTC | #5
Hi,

> i'd prefer to improve the generic c code if
> possible instead of using asm.

Agreed, there isn't really anything performance critical that assembler
could speed up. It's more about what strategy to use if there is any
contention.

But we really need representative benchmarks first. Without that it's
impossible to decide whether a new spinlock implementation is faster.

Cheers,
Wilco

Patch
diff mbox series

diff --git a/sysdeps/aarch64/nptl/pthread_spin_lock.S b/sysdeps/aarch64/nptl/pthread_spin_lock.S
new file mode 100644
index 00000000000..662410c5fa3
--- /dev/null
+++ b/sysdeps/aarch64/nptl/pthread_spin_lock.S
@@ -0,0 +1,103 @@ 
+/* pthread_spin_lock -- lock a spin lock. Generic version.
+   Copyright (C) 2012-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+ENTRY (pthread_spin_lock)
+	DELOUSE (0)
+
+	mov	w1, #0x1
+
+L(spin):
+	prfm	pldl1strm, [x0]
+    /* A count to distinguish wating time.
+    The larger the value, the more intense the current lock
+    conflicts, and the longer the waiting time. */
+	add	w3, w3, #0x1
+	ldaxr	w2, [x0]
+	cbnz	w2, L(spin)
+	stxr	w2, w1, [x0]
+	cbnz	w2, 1f
+	b	L(end)
+
+L(end):
+	mov w0, #0x0
+	ret
+
+    /* Set the loop times of L(wait) from 1000 to 7000 cycles,
+    equals waiting 1~2us per 1000 cycles. */
+	.p2align 4
+7:
+	mov	w6, #0x1b58
+	b	L(wait_init)
+6:
+	mov	w6, #0x1770
+	b	L(wait_init)
+5:
+	mov	w6, #0x1388
+	b	L(wait_init)
+4:
+	mov	w6, #0x0fa0
+	b	L(wait_init)
+3:
+	mov	w6, #0x0bb8
+	b	L(wait_init)
+2:
+	mov	w6, #0x07d0
+	b	L(wait_init)
+1:
+	mov	w6, #0x03e8
+	b	L(wait_init)
+
+L(wait_init):
+	mov	w5, #0x0
+L(wait):
+	add	w5, w5, #0x1
+	cmp	w5, w6
+	b.lt	L(wait) /* Wait ends when w5 equals w6. */
+
+L(stxr_try):
+	ldr	w2, [x0]
+	cbz	w2, L(spin)
+	and	w3, w3, #0x07
+    /* 8 kinds of distinguish wating time 
+    based on the lower three bits of w3. */
+	cmp	w3, #0x01
+	beq	1b
+	cmp	w3, #0x02
+	beq	2b
+	cmp	w3, #0x03
+	beq	3b
+	cmp	w3, #0x04
+	beq	4b
+	cmp	w3, #0x05
+	beq	5b
+	cmp	w3, #0x06
+	beq	6b
+	cmp	w3, #0x07
+	beq	7b
+
+	wfe /* w3=0x000: the most intense situation so to wait 50 us */
+	b	L(stxr_try)
+
+
+END (pthread_spin_lock)
+libc_hidden_builtin_def (pthread_spin_lock)
+weak_alias (pthread_spin_lock, index)
diff --git a/sysdeps/aarch64/nptl/pthread_spin_unlock.S b/sysdeps/aarch64/nptl/pthread_spin_unlock.S
new file mode 100644
index 00000000000..57a6bca86fe
--- /dev/null
+++ b/sysdeps/aarch64/nptl/pthread_spin_unlock.S
@@ -0,0 +1,34 @@ 
+/* pthread_spin_unlock -- unlock a spin lock. Generic version.
+   Copyright (C) 2012-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+ENTRY (pthread_spin_unlock)
+	DELOUSE (0)
+	stlr	wzr, [x0]
+	mov	w0, #0x0
+	ret
+
+END (pthread_spin_unlock)
+libc_hidden_builtin_def (pthread_spin_unlock)
+weak_alias (pthread_spin_unlock, index)