Message ID | 20191028133911.8612-1-zhangxuelei4@huawei.com |
---|---|
State | New |
Headers | show |
Series | aarch64: Optimized implementation of pthread_spin_lock and unlock | expand |
On 28/10/2019 13:39, Xuelei Zhang wrote: > An optimized algorithm of spin_lock is implemented to wait for > random time in the case that multi-threads want to store spinlock > at the same time. This way can make the operation of different > threads asynchronous, thereby reducing bus conflicts, and futherly > improving the overall performance, which benefits more on aarch64 > for its multi-core features. > > In addition, the assembly version of spin_unlock is also implemented. this has many issues spinlock benchmarking is non-trivial (workload dependent, uarch dependent and memory system dependent) and you need to demonstrate that the new code is significantly better than the generic c code, otherwise we don't want to maintain asm in the aarch64 port. the generic c code may gain improvements that outperform whatever asm implementation (e.g. numa aware spinlock) and allows compiling the code to different targets (e.g. armv8.1-a+lse). the arm arm has a spin lock implementation in the "barrier litmus tests" section using wfe and sevl, if i had to add asm i'd probably start with that. > +ENTRY (pthread_spin_lock) > + DELOUSE (0) > + > + mov w1, #0x1 > + > +L(spin): > + prfm pldl1strm, [x0] > + /* A count to distinguish wating time. > + The larger the value, the more intense the current lock > + conflicts, and the longer the waiting time. */ > + add w3, w3, #0x1 w3 is not initialized, i would not call that "wait for random time", since the lock contention is most likely between the exact same code path executing in different threads with identical w3 at function entry. > + ldaxr w2, [x0] > + cbnz w2, L(spin) > + stxr w2, w1, [x0] > + cbnz w2, 1f > + b L(end) > + > +L(end): > + mov w0, #0x0 > + ret > + > + /* Set the loop times of L(wait) from 1000 to 7000 cycles, > + equals waiting 1~2us per 1000 cycles. */ > + .p2align 4 > +7: > + mov w6, #0x1b58 > + b L(wait_init) > +6: > + mov w6, #0x1770 > + b L(wait_init) > +5: > + mov w6, #0x1388 > + b L(wait_init) > +4: > + mov w6, #0x0fa0 > + b L(wait_init) > +3: > + mov w6, #0x0bb8 > + b L(wait_init) > +2: > + mov w6, #0x07d0 > + b L(wait_init) > +1: > + mov w6, #0x03e8 > + b L(wait_init) > + > +L(wait_init): > + mov w5, #0x0 > +L(wait): > + add w5, w5, #0x1 > + cmp w5, w6 > + b.lt L(wait) /* Wait ends when w5 equals w6. */ > + > +L(stxr_try): > + ldr w2, [x0] > + cbz w2, L(spin) > + and w3, w3, #0x07 > + /* 8 kinds of distinguish wating time > + based on the lower three bits of w3. */ > + cmp w3, #0x01 > + beq 1b > + cmp w3, #0x02 > + beq 2b > + cmp w3, #0x03 > + beq 3b > + cmp w3, #0x04 > + beq 4b > + cmp w3, #0x05 > + beq 5b > + cmp w3, #0x06 > + beq 6b > + cmp w3, #0x07 > + beq 7b > + > + wfe /* w3=0x000: the most intense situation so to wait 50 us */ > + b L(stxr_try) > + > + > +END (pthread_spin_lock) > +libc_hidden_builtin_def (pthread_spin_lock) > +weak_alias (pthread_spin_lock, index) that weak_alias is wrong. > +ENTRY (pthread_spin_unlock) > + DELOUSE (0) > + stlr wzr, [x0] note: current glibc code would generate dmb ish str wzr, [x0] instead of stlr, which is faster on some cores, but has weaker ordering guarantees. > + mov w0, #0x0 > + ret > + > +END (pthread_spin_unlock) > +libc_hidden_builtin_def (pthread_spin_unlock) > +weak_alias (pthread_spin_unlock, index) this weak_alias is wrong too.
On Mon, Oct 28, 2019 at 6:39 AM Xuelei Zhang <zhangxuelei4@huawei.com> wrote: > > An optimized algorithm of spin_lock is implemented to wait for > random time in the case that multi-threads want to store spinlock > at the same time. This way can make the operation of different > threads asynchronous, thereby reducing bus conflicts, and futherly > improving the overall performance, which benefits more on aarch64 > for its multi-core features. > > In addition, the assembly version of spin_unlock is also implemented. > --- > sysdeps/aarch64/nptl/pthread_spin_lock.S | 103 +++++++++++++++++++++++++++++ > sysdeps/aarch64/nptl/pthread_spin_unlock.S | 34 ++++++++++ > 2 files changed, 137 insertions(+) > create mode 100644 sysdeps/aarch64/nptl/pthread_spin_lock.S > create mode 100644 sysdeps/aarch64/nptl/pthread_spin_unlock.S > > diff --git a/sysdeps/aarch64/nptl/pthread_spin_lock.S b/sysdeps/aarch64/nptl/pthread_spin_lock.S > new file mode 100644 > index 00000000000..662410c5fa3 > --- /dev/null > +++ b/sysdeps/aarch64/nptl/pthread_spin_lock.S > @@ -0,0 +1,103 @@ > +/* pthread_spin_lock -- lock a spin lock. Generic version. > + Copyright (C) 2012-2019 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +/* Assumptions: > + * > + * ARMv8-a, AArch64 > + */ > + > +ENTRY (pthread_spin_lock) > + DELOUSE (0) > + > + mov w1, #0x1 > + > +L(spin): > + prfm pldl1strm, [x0] > + /* A count to distinguish wating time. > + The larger the value, the more intense the current lock > + conflicts, and the longer the waiting time. */ > + add w3, w3, #0x1 > + ldaxr w2, [x0] > + cbnz w2, L(spin) > + stxr w2, w1, [x0] > + cbnz w2, 1f > + b L(end) > + > +L(end): > + mov w0, #0x0 > + ret > + > + /* Set the loop times of L(wait) from 1000 to 7000 cycles, > + equals waiting 1~2us per 1000 cycles. */ > + .p2align 4 > +7: > + mov w6, #0x1b58 > + b L(wait_init) > +6: > + mov w6, #0x1770 > + b L(wait_init) > +5: > + mov w6, #0x1388 > + b L(wait_init) > +4: > + mov w6, #0x0fa0 > + b L(wait_init) > +3: > + mov w6, #0x0bb8 > + b L(wait_init) > +2: > + mov w6, #0x07d0 > + b L(wait_init) > +1: > + mov w6, #0x03e8 > + b L(wait_init) > + > +L(wait_init): > + mov w5, #0x0 > +L(wait): > + add w5, w5, #0x1 > + cmp w5, w6 > + b.lt L(wait) /* Wait ends when w5 equals w6. */ I could see this loop even taking less than one cycle/iteration. With a micro-op loop cache. But really my bet is r3 will 99% be 0 so it won't be that random after all. If this had the ability to become a ticket lock, then you could use the difference between the ticket and the next ticket for the loop time. But since the ABI is set in stone (that is the unlocked value needs to be all zeros), it does not have the ability to become a ticket spin lock. Maybe one way is to use the lower bits of the thread ID. I don't know if that would be fast enough to get though. Maybe you could do popcount on an TLS address; though that takes a few cycles due to using the vector registers (no scalar popcount instruction). Thanks, Andrew Pinski > + > +L(stxr_try): > + ldr w2, [x0] > + cbz w2, L(spin) > + and w3, w3, #0x07 > + /* 8 kinds of distinguish wating time > + based on the lower three bits of w3. */ > + cmp w3, #0x01 > + beq 1b > + cmp w3, #0x02 > + beq 2b > + cmp w3, #0x03 > + beq 3b > + cmp w3, #0x04 > + beq 4b > + cmp w3, #0x05 > + beq 5b > + cmp w3, #0x06 > + beq 6b > + cmp w3, #0x07 > + beq 7b > + > + wfe /* w3=0x000: the most intense situation so to wait 50 us */ > + b L(stxr_try) > + > + > +END (pthread_spin_lock) > +libc_hidden_builtin_def (pthread_spin_lock) > +weak_alias (pthread_spin_lock, index) > diff --git a/sysdeps/aarch64/nptl/pthread_spin_unlock.S b/sysdeps/aarch64/nptl/pthread_spin_unlock.S > new file mode 100644 > index 00000000000..57a6bca86fe > --- /dev/null > +++ b/sysdeps/aarch64/nptl/pthread_spin_unlock.S > @@ -0,0 +1,34 @@ > +/* pthread_spin_unlock -- unlock a spin lock. Generic version. > + Copyright (C) 2012-2019 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +/* Assumptions: > + * > + * ARMv8-a, AArch64 > + */ > + > +ENTRY (pthread_spin_unlock) > + DELOUSE (0) > + stlr wzr, [x0] > + mov w0, #0x0 > + ret > + > +END (pthread_spin_unlock) > +libc_hidden_builtin_def (pthread_spin_unlock) > +weak_alias (pthread_spin_unlock, index) > -- > 2.14.1.windows.1 > >
On Mon, Oct 28, 2019 at 6:39 AM Xuelei Zhang <zhangxuelei4@huawei.com> wrote: > > An optimized algorithm of spin_lock is implemented to wait for > random time in the case that multi-threads want to store spinlock > at the same time. This way can make the operation of different > threads asynchronous, thereby reducing bus conflicts, and futherly > improving the overall performance, which benefits more on aarch64 > for its multi-core features. > > In addition, the assembly version of spin_unlock is also implemented. > --- > sysdeps/aarch64/nptl/pthread_spin_lock.S | 103 +++++++++++++++++++++++++++++ > sysdeps/aarch64/nptl/pthread_spin_unlock.S | 34 ++++++++++ > 2 files changed, 137 insertions(+) > create mode 100644 sysdeps/aarch64/nptl/pthread_spin_lock.S > create mode 100644 sysdeps/aarch64/nptl/pthread_spin_unlock.S > > diff --git a/sysdeps/aarch64/nptl/pthread_spin_lock.S b/sysdeps/aarch64/nptl/pthread_spin_lock.S > new file mode 100644 > index 00000000000..662410c5fa3 > --- /dev/null > +++ b/sysdeps/aarch64/nptl/pthread_spin_lock.S > @@ -0,0 +1,103 @@ > +/* pthread_spin_lock -- lock a spin lock. Generic version. > + Copyright (C) 2012-2019 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +/* Assumptions: > + * > + * ARMv8-a, AArch64 > + */ > + > +ENTRY (pthread_spin_lock) > + DELOUSE (0) > + > + mov w1, #0x1 > + > +L(spin): > + prfm pldl1strm, [x0] > + /* A count to distinguish wating time. > + The larger the value, the more intense the current lock > + conflicts, and the longer the waiting time. */ > + add w3, w3, #0x1 > + ldaxr w2, [x0] > + cbnz w2, L(spin) > + stxr w2, w1, [x0] > + cbnz w2, 1f > + b L(end) > + > +L(end): > + mov w0, #0x0 > + ret > + > + /* Set the loop times of L(wait) from 1000 to 7000 cycles, > + equals waiting 1~2us per 1000 cycles. */ > + .p2align 4 > +7: > + mov w6, #0x1b58 > + b L(wait_init) > +6: > + mov w6, #0x1770 > + b L(wait_init) > +5: > + mov w6, #0x1388 > + b L(wait_init) > +4: > + mov w6, #0x0fa0 > + b L(wait_init) > +3: > + mov w6, #0x0bb8 > + b L(wait_init) > +2: > + mov w6, #0x07d0 > + b L(wait_init) > +1: > + mov w6, #0x03e8 > + b L(wait_init) > + > +L(wait_init): > + mov w5, #0x0 > +L(wait): > + add w5, w5, #0x1 > + cmp w5, w6 > + b.lt L(wait) /* Wait ends when w5 equals w6. */ > + > +L(stxr_try): > + ldr w2, [x0] > + cbz w2, L(spin) > + and w3, w3, #0x07 > + /* 8 kinds of distinguish wating time > + based on the lower three bits of w3. */ > + cmp w3, #0x01 > + beq 1b > + cmp w3, #0x02 > + beq 2b > + cmp w3, #0x03 > + beq 3b > + cmp w3, #0x04 > + beq 4b > + cmp w3, #0x05 > + beq 5b > + cmp w3, #0x06 > + beq 6b > + cmp w3, #0x07 > + beq 7b > + > + wfe /* w3=0x000: the most intense situation so to wait 50 us */ > + b L(stxr_try) > + > + > +END (pthread_spin_lock) > +libc_hidden_builtin_def (pthread_spin_lock) > +weak_alias (pthread_spin_lock, index) > diff --git a/sysdeps/aarch64/nptl/pthread_spin_unlock.S b/sysdeps/aarch64/nptl/pthread_spin_unlock.S > new file mode 100644 > index 00000000000..57a6bca86fe > --- /dev/null > +++ b/sysdeps/aarch64/nptl/pthread_spin_unlock.S > @@ -0,0 +1,34 @@ > +/* pthread_spin_unlock -- unlock a spin lock. Generic version. > + Copyright (C) 2012-2019 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +/* Assumptions: > + * > + * ARMv8-a, AArch64 > + */ > + > +ENTRY (pthread_spin_unlock) > + DELOUSE (0) > + stlr wzr, [x0] > + mov w0, #0x0 Actually the C version of pthread_spin_unlock is implemented this way already and provides exactly this same assembly. Why do you need an assembly version? Thanks, Andrew Pinski PS for reference: ./nptl/pthread_spin_unlock.o: file format elf64-littleaarch64 Disassembly of section .text: 0000000000000000 <pthread_spin_unlock>: 0: 889ffc1f stlr wzr, [x0] 4: 52800000 mov w0, #0x0 // #0 8: d65f03c0 ret > + ret > + > +END (pthread_spin_unlock) > +libc_hidden_builtin_def (pthread_spin_unlock) > +weak_alias (pthread_spin_unlock, index) > -- > 2.14.1.windows.1 > >
On 28/10/2019 17:53, Andrew Pinski wrote: > On Mon, Oct 28, 2019 at 6:39 AM Xuelei Zhang <zhangxuelei4@huawei.com> wrote: >> +ENTRY (pthread_spin_unlock) >> + DELOUSE (0) >> + stlr wzr, [x0] >> + mov w0, #0x0 > > Actually the C version of pthread_spin_unlock is implemented this way > already and provides exactly this same assembly. > Why do you need an assembly version? if lock is in asm, then unlock must be in asm too. we don't want to reason about synchronization between asm and c code as their memory model is defined differently and we don't want to rely on a particular implementation in the compiler. and the generic c code may change independently (e.g. spin_unlock used to be barrier + store but changed at some point, such change can easily break the synchronization without us noticing) i'd prefer to improve the generic c code if possible instead of using asm.
Hi, > i'd prefer to improve the generic c code if > possible instead of using asm. Agreed, there isn't really anything performance critical that assembler could speed up. It's more about what strategy to use if there is any contention. But we really need representative benchmarks first. Without that it's impossible to decide whether a new spinlock implementation is faster. Cheers, Wilco
diff --git a/sysdeps/aarch64/nptl/pthread_spin_lock.S b/sysdeps/aarch64/nptl/pthread_spin_lock.S new file mode 100644 index 00000000000..662410c5fa3 --- /dev/null +++ b/sysdeps/aarch64/nptl/pthread_spin_lock.S @@ -0,0 +1,103 @@ +/* pthread_spin_lock -- lock a spin lock. Generic version. + Copyright (C) 2012-2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + * + * ARMv8-a, AArch64 + */ + +ENTRY (pthread_spin_lock) + DELOUSE (0) + + mov w1, #0x1 + +L(spin): + prfm pldl1strm, [x0] + /* A count to distinguish wating time. + The larger the value, the more intense the current lock + conflicts, and the longer the waiting time. */ + add w3, w3, #0x1 + ldaxr w2, [x0] + cbnz w2, L(spin) + stxr w2, w1, [x0] + cbnz w2, 1f + b L(end) + +L(end): + mov w0, #0x0 + ret + + /* Set the loop times of L(wait) from 1000 to 7000 cycles, + equals waiting 1~2us per 1000 cycles. */ + .p2align 4 +7: + mov w6, #0x1b58 + b L(wait_init) +6: + mov w6, #0x1770 + b L(wait_init) +5: + mov w6, #0x1388 + b L(wait_init) +4: + mov w6, #0x0fa0 + b L(wait_init) +3: + mov w6, #0x0bb8 + b L(wait_init) +2: + mov w6, #0x07d0 + b L(wait_init) +1: + mov w6, #0x03e8 + b L(wait_init) + +L(wait_init): + mov w5, #0x0 +L(wait): + add w5, w5, #0x1 + cmp w5, w6 + b.lt L(wait) /* Wait ends when w5 equals w6. */ + +L(stxr_try): + ldr w2, [x0] + cbz w2, L(spin) + and w3, w3, #0x07 + /* 8 kinds of distinguish wating time + based on the lower three bits of w3. */ + cmp w3, #0x01 + beq 1b + cmp w3, #0x02 + beq 2b + cmp w3, #0x03 + beq 3b + cmp w3, #0x04 + beq 4b + cmp w3, #0x05 + beq 5b + cmp w3, #0x06 + beq 6b + cmp w3, #0x07 + beq 7b + + wfe /* w3=0x000: the most intense situation so to wait 50 us */ + b L(stxr_try) + + +END (pthread_spin_lock) +libc_hidden_builtin_def (pthread_spin_lock) +weak_alias (pthread_spin_lock, index) diff --git a/sysdeps/aarch64/nptl/pthread_spin_unlock.S b/sysdeps/aarch64/nptl/pthread_spin_unlock.S new file mode 100644 index 00000000000..57a6bca86fe --- /dev/null +++ b/sysdeps/aarch64/nptl/pthread_spin_unlock.S @@ -0,0 +1,34 @@ +/* pthread_spin_unlock -- unlock a spin lock. Generic version. + Copyright (C) 2012-2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + * + * ARMv8-a, AArch64 + */ + +ENTRY (pthread_spin_unlock) + DELOUSE (0) + stlr wzr, [x0] + mov w0, #0x0 + ret + +END (pthread_spin_unlock) +libc_hidden_builtin_def (pthread_spin_unlock) +weak_alias (pthread_spin_unlock, index)