Message ID | e6f815d9181bab09df3b350af51149437863e9f9.1632236981.git.christophe.leroy@csgroup.eu (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
Series | [v5,1/3] powerpc/bitops: Use immediate operand when possible | expand |
Related | show |
Hi Michael, Any chance to get this series merged this cycle? Thanks Christophe Le 21/09/2021 à 17:09, Christophe Leroy a écrit : > Today we get the following code generation for bitops like > set or clear bit: > > c0009fe0: 39 40 08 00 li r10,2048 > c0009fe4: 7c e0 40 28 lwarx r7,0,r8 > c0009fe8: 7c e7 53 78 or r7,r7,r10 > c0009fec: 7c e0 41 2d stwcx. r7,0,r8 > > c000d568: 39 00 18 00 li r8,6144 > c000d56c: 7c c0 38 28 lwarx r6,0,r7 > c000d570: 7c c6 40 78 andc r6,r6,r8 > c000d574: 7c c0 39 2d stwcx. r6,0,r7 > > Most set bits are constant on lower 16 bits, so it can easily > be replaced by the "immediate" version of the operation. Allow > GCC to choose between the normal or immediate form. > > For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc' > when all bits to be cleared are consecutive. > > On 64 bits we don't have any equivalent single operation for clearing > single bits or a few bits; we'd need two 'rldicl' so it is not > worth it; the li/andc sequence is doing the same. > > With this patch we get: > > c0009fe0: 7d 00 50 28 lwarx r8,0,r10 > c0009fe4: 61 08 08 00 ori r8,r8,2048 > c0009fe8: 7d 00 51 2d stwcx. r8,0,r10 > > c000d558: 7c e0 40 28 lwarx r7,0,r8 > c000d55c: 54 e7 05 64 rlwinm r7,r7,0,21,18 > c000d560: 7c e0 41 2d stwcx. r7,0,r8 > > On pmac32_defconfig, it reduces the text by approx 10 kbytes. 
> > Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> > Reviewed-by: Segher Boessenkool <segher@kernel.crashing.org> > --- > v5: Fixed the argument of is_rlwinm_mask_valid() in test_and_clear_bits() > > v4: Rebased > > v3: > - Using the mask validation proposed by Segher > > v2: > - Use "n" instead of "i" as constraint for the rlwinm mask > - Improve mask verification to handle more than single bit masks > > Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> > --- > arch/powerpc/include/asm/bitops.h | 89 ++++++++++++++++++++++++++++--- > 1 file changed, 81 insertions(+), 8 deletions(-) > > diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h > index 11847b6a244e..a05d8c62cbea 100644 > --- a/arch/powerpc/include/asm/bitops.h > +++ b/arch/powerpc/include/asm/bitops.h > @@ -71,19 +71,61 @@ static inline void fn(unsigned long mask, \ > __asm__ __volatile__ ( \ > prefix \ > "1:" PPC_LLARX "%0,0,%3,0\n" \ > - stringify_in_c(op) "%0,%0,%2\n" \ > + #op "%I2 %0,%0,%2\n" \ > PPC_STLCX "%0,0,%3\n" \ > "bne- 1b\n" \ > : "=&r" (old), "+m" (*p) \ > - : "r" (mask), "r" (p) \ > + : "rK" (mask), "r" (p) \ > : "cc", "memory"); \ > } > > DEFINE_BITOP(set_bits, or, "") > -DEFINE_BITOP(clear_bits, andc, "") > -DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER) > DEFINE_BITOP(change_bits, xor, "") > > +static __always_inline bool is_rlwinm_mask_valid(unsigned long x) > +{ > + if (!x) > + return false; > + if (x & 1) > + x = ~x; // make the mask non-wrapping > + x += x & -x; // adding the low set bit results in at most one bit set > + > + return !(x & (x - 1)); > +} > + > +#define DEFINE_CLROP(fn, prefix) \ > +static inline void fn(unsigned long mask, volatile unsigned long *_p) \ > +{ \ > + unsigned long old; \ > + unsigned long *p = (unsigned long *)_p; \ > + \ > + if (IS_ENABLED(CONFIG_PPC32) && \ > + __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) {\ > + asm volatile ( \ > + prefix \ > + "1:" "lwarx %0,0,%3\n" \ 
> + "rlwinm %0,%0,0,%2\n" \ > + "stwcx. %0,0,%3\n" \ > + "bne- 1b\n" \ > + : "=&r" (old), "+m" (*p) \ > + : "n" (~mask), "r" (p) \ > + : "cc", "memory"); \ > + } else { \ > + asm volatile ( \ > + prefix \ > + "1:" PPC_LLARX "%0,0,%3,0\n" \ > + "andc %0,%0,%2\n" \ > + PPC_STLCX "%0,0,%3\n" \ > + "bne- 1b\n" \ > + : "=&r" (old), "+m" (*p) \ > + : "r" (mask), "r" (p) \ > + : "cc", "memory"); \ > + } \ > +} > + > +DEFINE_CLROP(clear_bits, "") > +DEFINE_CLROP(clear_bits_unlock, PPC_RELEASE_BARRIER) > + > static inline void arch_set_bit(int nr, volatile unsigned long *addr) > { > set_bits(BIT_MASK(nr), addr + BIT_WORD(nr)); > @@ -116,12 +158,12 @@ static inline unsigned long fn( \ > __asm__ __volatile__ ( \ > prefix \ > "1:" PPC_LLARX "%0,0,%3,%4\n" \ > - stringify_in_c(op) "%1,%0,%2\n" \ > + #op "%I2 %1,%0,%2\n" \ > PPC_STLCX "%1,0,%3\n" \ > "bne- 1b\n" \ > postfix \ > : "=&r" (old), "=&r" (t) \ > - : "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \ > + : "rK" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \ > : "cc", "memory"); \ > return (old & mask); \ > } > @@ -130,11 +172,42 @@ DEFINE_TESTOP(test_and_set_bits, or, PPC_ATOMIC_ENTRY_BARRIER, > PPC_ATOMIC_EXIT_BARRIER, 0) > DEFINE_TESTOP(test_and_set_bits_lock, or, "", > PPC_ACQUIRE_BARRIER, 1) > -DEFINE_TESTOP(test_and_clear_bits, andc, PPC_ATOMIC_ENTRY_BARRIER, > - PPC_ATOMIC_EXIT_BARRIER, 0) > DEFINE_TESTOP(test_and_change_bits, xor, PPC_ATOMIC_ENTRY_BARRIER, > PPC_ATOMIC_EXIT_BARRIER, 0) > > +static inline unsigned long test_and_clear_bits(unsigned long mask, volatile unsigned long *_p) > +{ > + unsigned long old, t; > + unsigned long *p = (unsigned long *)_p; > + > + if (IS_ENABLED(CONFIG_PPC32) && > + __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) { > + asm volatile ( > + PPC_ATOMIC_ENTRY_BARRIER > + "1:" "lwarx %0,0,%3\n" > + "rlwinm %1,%0,0,%2\n" > + "stwcx. 
%1,0,%3\n" > + "bne- 1b\n" > + PPC_ATOMIC_EXIT_BARRIER > + : "=&r" (old), "=&r" (t) > + : "n" (~mask), "r" (p) > + : "cc", "memory"); > + } else { > + asm volatile ( > + PPC_ATOMIC_ENTRY_BARRIER > + "1:" PPC_LLARX "%0,0,%3,0\n" > + "andc %1,%0,%2\n" > + PPC_STLCX "%1,0,%3\n" > + "bne- 1b\n" > + PPC_ATOMIC_EXIT_BARRIER > + : "=&r" (old), "=&r" (t) > + : "r" (mask), "r" (p) > + : "cc", "memory"); > + } > + > + return (old & mask); > +} > + > static inline int arch_test_and_set_bit(unsigned long nr, > volatile unsigned long *addr) > { >
LEROY Christophe <christophe.leroy@csgroup.eu> writes: > Hi Michael, > > Any chance to get this series merged this cycle ? Yeah. I hesitated a bit at the changes which make the atomic asm even more complicated, but I guess it's worth it. cheers > Le 21/09/2021 à 17:09, Christophe Leroy a écrit : >> Today we get the following code generation for bitops like >> set or clear bit: >> >> c0009fe0: 39 40 08 00 li r10,2048 >> c0009fe4: 7c e0 40 28 lwarx r7,0,r8 >> c0009fe8: 7c e7 53 78 or r7,r7,r10 >> c0009fec: 7c e0 41 2d stwcx. r7,0,r8 >> >> c000d568: 39 00 18 00 li r8,6144 >> c000d56c: 7c c0 38 28 lwarx r6,0,r7 >> c000d570: 7c c6 40 78 andc r6,r6,r8 >> c000d574: 7c c0 39 2d stwcx. r6,0,r7 >> >> Most set bits are constant on lower 16 bits, so it can easily >> be replaced by the "immediate" version of the operation. Allow >> GCC to choose between the normal or immediate form. >> >> For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc' for >> when all bits to be cleared are consecutive. >> >> On 64 bits we don't have any equivalent single operation for clearing, >> single bits or a few bits, we'd need two 'rldicl' so it is not >> worth it, the li/andc sequence is doing the same. >> >> With this patch we get: >> >> c0009fe0: 7d 00 50 28 lwarx r8,0,r10 >> c0009fe4: 61 08 08 00 ori r8,r8,2048 >> c0009fe8: 7d 00 51 2d stwcx. r8,0,r10 >> >> c000d558: 7c e0 40 28 lwarx r7,0,r8 >> c000d55c: 54 e7 05 64 rlwinm r7,r7,0,21,18 >> c000d560: 7c e0 41 2d stwcx. r7,0,r8 >> >> On pmac32_defconfig, it reduces the text by approx 10 kbytes. 
>> >> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> >> Reviewed-by: Segher Boessenkool <segher@kernel.crashing.org> >> --- >> v5: Fixed the argument of is_rlwinm_mask_valid() in test_and_clear_bits() >> >> v4: Rebased >> >> v3: >> - Using the mask validation proposed by Segher >> >> v2: >> - Use "n" instead of "i" as constraint for the rlwinm mask >> - Improve mask verification to handle more than single bit masks >> >> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> >> --- >> arch/powerpc/include/asm/bitops.h | 89 ++++++++++++++++++++++++++++--- >> 1 file changed, 81 insertions(+), 8 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h >> index 11847b6a244e..a05d8c62cbea 100644 >> --- a/arch/powerpc/include/asm/bitops.h >> +++ b/arch/powerpc/include/asm/bitops.h >> @@ -71,19 +71,61 @@ static inline void fn(unsigned long mask, \ >> __asm__ __volatile__ ( \ >> prefix \ >> "1:" PPC_LLARX "%0,0,%3,0\n" \ >> - stringify_in_c(op) "%0,%0,%2\n" \ >> + #op "%I2 %0,%0,%2\n" \ >> PPC_STLCX "%0,0,%3\n" \ >> "bne- 1b\n" \ >> : "=&r" (old), "+m" (*p) \ >> - : "r" (mask), "r" (p) \ >> + : "rK" (mask), "r" (p) \ >> : "cc", "memory"); \ >> } >> >> DEFINE_BITOP(set_bits, or, "") >> -DEFINE_BITOP(clear_bits, andc, "") >> -DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER) >> DEFINE_BITOP(change_bits, xor, "") >> >> +static __always_inline bool is_rlwinm_mask_valid(unsigned long x) >> +{ >> + if (!x) >> + return false; >> + if (x & 1) >> + x = ~x; // make the mask non-wrapping >> + x += x & -x; // adding the low set bit results in at most one bit set >> + >> + return !(x & (x - 1)); >> +} >> + >> +#define DEFINE_CLROP(fn, prefix) \ >> +static inline void fn(unsigned long mask, volatile unsigned long *_p) \ >> +{ \ >> + unsigned long old; \ >> + unsigned long *p = (unsigned long *)_p; \ >> + \ >> + if (IS_ENABLED(CONFIG_PPC32) && \ >> + __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) {\ 
>> + asm volatile ( \ >> + prefix \ >> + "1:" "lwarx %0,0,%3\n" \ >> + "rlwinm %0,%0,0,%2\n" \ >> + "stwcx. %0,0,%3\n" \ >> + "bne- 1b\n" \ >> + : "=&r" (old), "+m" (*p) \ >> + : "n" (~mask), "r" (p) \ >> + : "cc", "memory"); \ >> + } else { \ >> + asm volatile ( \ >> + prefix \ >> + "1:" PPC_LLARX "%0,0,%3,0\n" \ >> + "andc %0,%0,%2\n" \ >> + PPC_STLCX "%0,0,%3\n" \ >> + "bne- 1b\n" \ >> + : "=&r" (old), "+m" (*p) \ >> + : "r" (mask), "r" (p) \ >> + : "cc", "memory"); \ >> + } \ >> +} >> + >> +DEFINE_CLROP(clear_bits, "") >> +DEFINE_CLROP(clear_bits_unlock, PPC_RELEASE_BARRIER) >> + >> static inline void arch_set_bit(int nr, volatile unsigned long *addr) >> { >> set_bits(BIT_MASK(nr), addr + BIT_WORD(nr)); >> @@ -116,12 +158,12 @@ static inline unsigned long fn( \ >> __asm__ __volatile__ ( \ >> prefix \ >> "1:" PPC_LLARX "%0,0,%3,%4\n" \ >> - stringify_in_c(op) "%1,%0,%2\n" \ >> + #op "%I2 %1,%0,%2\n" \ >> PPC_STLCX "%1,0,%3\n" \ >> "bne- 1b\n" \ >> postfix \ >> : "=&r" (old), "=&r" (t) \ >> - : "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \ >> + : "rK" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? 
eh : 0) \ >> : "cc", "memory"); \ >> return (old & mask); \ >> } >> @@ -130,11 +172,42 @@ DEFINE_TESTOP(test_and_set_bits, or, PPC_ATOMIC_ENTRY_BARRIER, >> PPC_ATOMIC_EXIT_BARRIER, 0) >> DEFINE_TESTOP(test_and_set_bits_lock, or, "", >> PPC_ACQUIRE_BARRIER, 1) >> -DEFINE_TESTOP(test_and_clear_bits, andc, PPC_ATOMIC_ENTRY_BARRIER, >> - PPC_ATOMIC_EXIT_BARRIER, 0) >> DEFINE_TESTOP(test_and_change_bits, xor, PPC_ATOMIC_ENTRY_BARRIER, >> PPC_ATOMIC_EXIT_BARRIER, 0) >> >> +static inline unsigned long test_and_clear_bits(unsigned long mask, volatile unsigned long *_p) >> +{ >> + unsigned long old, t; >> + unsigned long *p = (unsigned long *)_p; >> + >> + if (IS_ENABLED(CONFIG_PPC32) && >> + __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) { >> + asm volatile ( >> + PPC_ATOMIC_ENTRY_BARRIER >> + "1:" "lwarx %0,0,%3\n" >> + "rlwinm %1,%0,0,%2\n" >> + "stwcx. %1,0,%3\n" >> + "bne- 1b\n" >> + PPC_ATOMIC_EXIT_BARRIER >> + : "=&r" (old), "=&r" (t) >> + : "n" (~mask), "r" (p) >> + : "cc", "memory"); >> + } else { >> + asm volatile ( >> + PPC_ATOMIC_ENTRY_BARRIER >> + "1:" PPC_LLARX "%0,0,%3,0\n" >> + "andc %1,%0,%2\n" >> + PPC_STLCX "%1,0,%3\n" >> + "bne- 1b\n" >> + PPC_ATOMIC_EXIT_BARRIER >> + : "=&r" (old), "=&r" (t) >> + : "r" (mask), "r" (p) >> + : "cc", "memory"); >> + } >> + >> + return (old & mask); >> +} >> + >> static inline int arch_test_and_set_bit(unsigned long nr, >> volatile unsigned long *addr) >> { >>
On Tue, 21 Sep 2021 17:09:47 +0200, Christophe Leroy wrote: > Today we get the following code generation for bitops like > set or clear bit: > > c0009fe0: 39 40 08 00 li r10,2048 > c0009fe4: 7c e0 40 28 lwarx r7,0,r8 > c0009fe8: 7c e7 53 78 or r7,r7,r10 > c0009fec: 7c e0 41 2d stwcx. r7,0,r8 > > [...] Applied to powerpc/next. [1/3] powerpc/bitops: Use immediate operand when possible https://git.kernel.org/powerpc/c/fb350784d8d17952afa93383bb47aaa6b715c459 [2/3] powerpc/atomics: Use immediate operand when possible https://git.kernel.org/powerpc/c/41d65207de9fbff58acd8937a7c3f8940c186a87 [3/3] powerpc/atomics: Remove atomic_inc()/atomic_dec() and friends https://git.kernel.org/powerpc/c/f05cab0034babaa9b3dfaf6003ee6493496a8180 cheers
diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 11847b6a244e..a05d8c62cbea 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -71,19 +71,61 @@ static inline void fn(unsigned long mask, \ __asm__ __volatile__ ( \ prefix \ "1:" PPC_LLARX "%0,0,%3,0\n" \ - stringify_in_c(op) "%0,%0,%2\n" \ + #op "%I2 %0,%0,%2\n" \ PPC_STLCX "%0,0,%3\n" \ "bne- 1b\n" \ : "=&r" (old), "+m" (*p) \ - : "r" (mask), "r" (p) \ + : "rK" (mask), "r" (p) \ : "cc", "memory"); \ } DEFINE_BITOP(set_bits, or, "") -DEFINE_BITOP(clear_bits, andc, "") -DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER) DEFINE_BITOP(change_bits, xor, "") +static __always_inline bool is_rlwinm_mask_valid(unsigned long x) +{ + if (!x) + return false; + if (x & 1) + x = ~x; // make the mask non-wrapping + x += x & -x; // adding the low set bit results in at most one bit set + + return !(x & (x - 1)); +} + +#define DEFINE_CLROP(fn, prefix) \ +static inline void fn(unsigned long mask, volatile unsigned long *_p) \ +{ \ + unsigned long old; \ + unsigned long *p = (unsigned long *)_p; \ + \ + if (IS_ENABLED(CONFIG_PPC32) && \ + __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) {\ + asm volatile ( \ + prefix \ + "1:" "lwarx %0,0,%3\n" \ + "rlwinm %0,%0,0,%2\n" \ + "stwcx. 
%0,0,%3\n" \ + "bne- 1b\n" \ + : "=&r" (old), "+m" (*p) \ + : "n" (~mask), "r" (p) \ + : "cc", "memory"); \ + } else { \ + asm volatile ( \ + prefix \ + "1:" PPC_LLARX "%0,0,%3,0\n" \ + "andc %0,%0,%2\n" \ + PPC_STLCX "%0,0,%3\n" \ + "bne- 1b\n" \ + : "=&r" (old), "+m" (*p) \ + : "r" (mask), "r" (p) \ + : "cc", "memory"); \ + } \ +} + +DEFINE_CLROP(clear_bits, "") +DEFINE_CLROP(clear_bits_unlock, PPC_RELEASE_BARRIER) + static inline void arch_set_bit(int nr, volatile unsigned long *addr) { set_bits(BIT_MASK(nr), addr + BIT_WORD(nr)); @@ -116,12 +158,12 @@ static inline unsigned long fn( \ __asm__ __volatile__ ( \ prefix \ "1:" PPC_LLARX "%0,0,%3,%4\n" \ - stringify_in_c(op) "%1,%0,%2\n" \ + #op "%I2 %1,%0,%2\n" \ PPC_STLCX "%1,0,%3\n" \ "bne- 1b\n" \ postfix \ : "=&r" (old), "=&r" (t) \ - : "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \ + : "rK" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \ : "cc", "memory"); \ return (old & mask); \ } @@ -130,11 +172,42 @@ DEFINE_TESTOP(test_and_set_bits, or, PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, 0) DEFINE_TESTOP(test_and_set_bits_lock, or, "", PPC_ACQUIRE_BARRIER, 1) -DEFINE_TESTOP(test_and_clear_bits, andc, PPC_ATOMIC_ENTRY_BARRIER, - PPC_ATOMIC_EXIT_BARRIER, 0) DEFINE_TESTOP(test_and_change_bits, xor, PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, 0) +static inline unsigned long test_and_clear_bits(unsigned long mask, volatile unsigned long *_p) +{ + unsigned long old, t; + unsigned long *p = (unsigned long *)_p; + + if (IS_ENABLED(CONFIG_PPC32) && + __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) { + asm volatile ( + PPC_ATOMIC_ENTRY_BARRIER + "1:" "lwarx %0,0,%3\n" + "rlwinm %1,%0,0,%2\n" + "stwcx. 
%1,0,%3\n" + "bne- 1b\n" + PPC_ATOMIC_EXIT_BARRIER + : "=&r" (old), "=&r" (t) + : "n" (~mask), "r" (p) + : "cc", "memory"); + } else { + asm volatile ( + PPC_ATOMIC_ENTRY_BARRIER + "1:" PPC_LLARX "%0,0,%3,0\n" + "andc %1,%0,%2\n" + PPC_STLCX "%1,0,%3\n" + "bne- 1b\n" + PPC_ATOMIC_EXIT_BARRIER + : "=&r" (old), "=&r" (t) + : "r" (mask), "r" (p) + : "cc", "memory"); + } + + return (old & mask); +} + static inline int arch_test_and_set_bit(unsigned long nr, volatile unsigned long *addr) {