
[v1,1/2] powerpc/bitops: Use immediate operand when possible

Message ID 09da6fec57792d6559d1ea64e00be9870b02dab4.1617896018.git.christophe.leroy@csgroup.eu (mailing list archive)
State Changes Requested
Series [v1,1/2] powerpc/bitops: Use immediate operand when possible

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (f2b8ef18c8e0634e176be99dcf242e515cfdb1d3)
snowpatch_ozlabs/checkpatch warning total: 9 errors, 3 warnings, 10 checks, 111 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Christophe Leroy April 8, 2021, 3:33 p.m. UTC
Today we get the following code generation for bitops like
set or clear bit:

	c0009fe0:	39 40 08 00 	li      r10,2048
	c0009fe4:	7c e0 40 28 	lwarx   r7,0,r8
	c0009fe8:	7c e7 53 78 	or      r7,r7,r10
	c0009fec:	7c e0 41 2d 	stwcx.  r7,0,r8

	c000c044:	39 40 20 00 	li      r10,8192
	c000c048:	7c e0 40 28 	lwarx   r7,0,r8
	c000c04c:	7c e7 50 78 	andc    r7,r7,r10
	c000c050:	7c e0 41 2d 	stwcx.  r7,0,r8

Most set_bits masks are constants that fit in the lower 16 bits, so
the operation can easily be replaced by its "immediate" form. Allow
GCC to choose between the normal and immediate forms.
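
A sketch of that idiom (simplified; the helper name is illustrative, the
patch below does this inside its asm macros):

static inline unsigned long or_maybe_immediate(unsigned long a, unsigned long b)
{
	unsigned long res;

	/*
	 * With the "rK" constraint and the %I operand modifier, GCC emits
	 * either 'or' with a register operand or 'ori' with a 16-bit
	 * immediate from this single template, whichever fits the operand.
	 */
	asm("or%I2 %0,%1,%2" : "=r" (res) : "r" (a), "rK" (b));

	return res;
}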

For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc' when
all the bits to be cleared are consecutive. For the time being, only
handle the single bit case, which we detect by checking whether the
mask is a power of two. We can't use the is_power_of_2() function
because it is not included yet, but the check is easy to open-code as
(mask & (mask - 1)), and even the 0 case, which is not a power of two,
is acceptable for us.

On 64 bits we don't have any equivalent single operation: we'd need
two 'rldicl' instructions, so it is not worth it.
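
A minimal sketch of that test (the helper name is illustrative, not part
of the patch):

/*
 * True when 'mask' has at most one bit set, so its complement is a mask
 * that a single rlwinm can apply; 0 also passes, which is acceptable
 * here as explained above.
 */
static inline bool mask_is_single_bit_or_zero(unsigned long mask)
{
	return !(mask & (mask - 1));
}

In the patch this test is combined with IS_ENABLED(CONFIG_PPC32) and
__builtin_constant_p(mask) so that only build-time constant masks on
32 bits take the rlwinm path.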

With this patch we get:

	c0009fe0:	7d 00 50 28 	lwarx   r8,0,r10
	c0009fe4:	61 08 08 00 	ori     r8,r8,2048
	c0009fe8:	7d 00 51 2d 	stwcx.  r8,0,r10

	c000c034:	7d 00 50 28 	lwarx   r8,0,r10
	c000c038:	55 08 04 e2 	rlwinm  r8,r8,0,19,17
	c000c03c:	7d 00 51 2d 	stwcx.  r8,0,r10

On pmac32_defconfig, it reduces the text by approx 10 kbytes.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/bitops.h | 77 +++++++++++++++++++++++++++----
 1 file changed, 69 insertions(+), 8 deletions(-)

Comments

Segher Boessenkool April 12, 2021, 9:54 p.m. UTC | #1
Hi!

On Thu, Apr 08, 2021 at 03:33:44PM +0000, Christophe Leroy wrote:
> For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc'
> when all the bits to be cleared are consecutive.

Also on 64-bits, as long as both the top and bottom bits are in the low
32-bit half (for 32 bit mode, it can wrap as well).

> For the time being only
> handle the single bit case, which we detect by checking whether the
> mask is a power of two.

You could look at rs6000_is_valid_mask in GCC:
  <https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=gcc/config/rs6000/rs6000.c;h=48b8efd732b251c059628096314848305deb0c0b;hb=HEAD#l11148>
used by rs6000_is_valid_and_mask immediately after it.  You probably
want to allow only rlwinm in your case, and please note this checks if
something is a valid mask, not the inverse of a valid mask (as you
want here).

So yes this is pretty involved :-)

Your patch looks good btw.  But please use "n", not "i", as constraint?


Segher
Christophe Leroy April 13, 2021, 4:33 p.m. UTC | #2
On 12/04/2021 at 23:54, Segher Boessenkool wrote:
> Hi!
> 
> On Thu, Apr 08, 2021 at 03:33:44PM +0000, Christophe Leroy wrote:
>> For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc'
>> when all the bits to be cleared are consecutive.
> 
> Also on 64-bits, as long as both the top and bottom bits are in the low
> 32-bit half (for 32 bit mode, it can wrap as well).

Yes. But here we are talking about clearing a few bits while all other ones must remain unchanged.
An rlwinm on PPC64 will always clear the upper part, which is unlikely to be what we want.

> 
>> For the time being only
>> handle the single bit case, which we detect by checking whether the
>> mask is a power of two.
> 
> You could look at rs6000_is_valid_mask in GCC:
>    <https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=gcc/config/rs6000/rs6000.c;h=48b8efd732b251c059628096314848305deb0c0b;hb=HEAD#l11148>
> used by rs6000_is_valid_and_mask immediately after it.  You probably
> want to allow only rlwinm in your case, and please note this checks if
> something is a valid mask, not the inverse of a valid mask (as you
> want here).

This check looks more complex than what I need. It is used for both rlw... and rld..., and it
calculates the operands. The only thing I need is to validate the mask.
I found a way: by ANDing the mask with the complement of itself rotated left by one bit, we
identify the transitions from 0 to 1. If the result is a power of 2, it means there's only one
transition, so the mask is as expected.

So I did that in v2.
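
A sketch of that check as described here, using the kernel's rol32()
helper (the actual v2 code may differ):

static inline bool mask_is_single_run(u32 mask)
{
	u32 transitions = mask & ~rol32(mask, 1);

	/*
	 * One 0->1 transition means the set bits form a single, possibly
	 * wrapping, run.  All-zero and all-ones masks are rejected here.
	 */
	return transitions && !(transitions & (transitions - 1));
}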


> 
> So yes this is pretty involved :-)
> 
> Your patch looks good btw.  But please use "n", not "i", as constraint?

Done.

Christophe
Segher Boessenkool April 13, 2021, 9:58 p.m. UTC | #3
On Tue, Apr 13, 2021 at 06:33:19PM +0200, Christophe Leroy wrote:
> On 12/04/2021 at 23:54, Segher Boessenkool wrote:
> >On Thu, Apr 08, 2021 at 03:33:44PM +0000, Christophe Leroy wrote:
> >>For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc'
> >>when all the bits to be cleared are consecutive.
> >
> >Also on 64-bits, as long as both the top and bottom bits are in the low
> >32-bit half (for 32 bit mode, it can wrap as well).
> 
> Yes. But here we are talking about clearing a few bits while all other ones
> must remain unchanged. An rlwinm on PPC64 will always clear the upper part,
> which is unlikely to be what we want.

No, it does not.  It takes the low 32 bits of the source reg, duplicated
to the top half as well, then rotated, then ANDed with the mask (which
can wrap around).  This isn't very often very useful, but :-)

(One useful operation is splatting 32 bits to both halves of a 64-bit
register, which is just rlwinm d,s,0,1,0).

If you only look at the low 32 bits, it does exactly the same as on
32-bit implementations.
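
For reference, a rough C model of those semantics as described here (a
sketch using IBM bit numbering for MB/ME, not kernel code):

/* Ones from IBM bit mb through me, wrapping around when mb > me. */
static u64 rlwinm_mask(unsigned int mb, unsigned int me)
{
	u64 hi = ~0ULL >> mb;			/* IBM bits mb..63 */
	u64 lo = ~0ULL << (63 - me);		/* IBM bits 0..me  */

	return mb <= me ? hi & lo : hi | lo;
}

/*
 * rlwinm ra,rs,sh,mb,me on a 64-bit implementation: the low word is
 * duplicated into the high half, rotated left, then ANDed with the mask
 * built from MB+32..ME+32.  rlwinm d,s,0,1,0 therefore splats the low word.
 */
static u64 rlwinm64(u64 rs, unsigned int sh, unsigned int mb, unsigned int me)
{
	u64 dup = ((rs & 0xffffffffULL) << 32) | (rs & 0xffffffffULL);
	u64 rot = sh ? (dup << sh) | (dup >> (64 - sh)) : dup;

	return rot & rlwinm_mask(mb + 32, me + 32);
}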

> >>For the time being only
> >>handle the single bit case, which we detect by checking whether the
> >>mask is a power of two.
> >
> >You could look at rs6000_is_valid_mask in GCC:
> >   <https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=gcc/config/rs6000/rs6000.c;h=48b8efd732b251c059628096314848305deb0c0b;hb=HEAD#l11148>
> >used by rs6000_is_valid_and_mask immediately after it.  You probably
> >want to allow only rlwinm in your case, and please note this checks if
> >something is a valid mask, not the inverse of a valid mask (as you
> >want here).
> 
> This check looks more complex than what I need. It is used for both rlw...
> and rld..., and it calculates the operands.  The only thing I need is to
> validate the mask.

It has to do exactly the same thing for rlwinm as for all 64-bit
variants (rldicl, rldicr, rldic).

One side effect of calculating the bit positions with exact_log2 is that
it returns a negative value if the argument is not a power of two.

Here is a simpler way, that handles all cases:  input in "u32 val":

	if (!val)
		return nonono;
	if (val & 1)
		val = ~val;	// make the mask non-wrapping
	val += val & -val;	// adding the low set bit should result in
				// at most one bit set
	if (!(val & (val - 1)))
		return okidoki_all_good;
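
Wrapped up as a self-contained helper (the name is illustrative, not
from this thread):

static inline bool is_valid_rlwinm_mask(u32 val)
{
	if (!val)
		return false;
	if (val & 1)
		val = ~val;		/* make the mask non-wrapping */
	val += val & -val;		/* at most one bit should remain set */
	return !(val & (val - 1));
}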

> I found a way: by ANDing the mask with the complement of itself rotated left
> by one bit, we identify the transitions from 0 to 1. If the result is a
> power of 2, it means there's only one transition, so the mask is as expected.

That does not handle all cases (it misses all bits set at least).  Which
isn't all that interesting of course, but is a valid mask (but won't
clear any bits, so not too interesting for your specific case :-) )


Segher
Nicholas Piggin April 14, 2021, 2:01 a.m. UTC | #4
Excerpts from Segher Boessenkool's message of April 14, 2021 7:58 am:
> On Tue, Apr 13, 2021 at 06:33:19PM +0200, Christophe Leroy wrote:
>> On 12/04/2021 at 23:54, Segher Boessenkool wrote:
>> >On Thu, Apr 08, 2021 at 03:33:44PM +0000, Christophe Leroy wrote:
>> >>For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc'
>> >>when all the bits to be cleared are consecutive.
>> >
>> >Also on 64-bits, as long as both the top and bottom bits are in the low
>> >32-bit half (for 32 bit mode, it can wrap as well).
>> 
>> Yes. But here we are talking about clearing a few bits while all other ones
>> must remain unchanged. An rlwinm on PPC64 will always clear the upper part,
>> which is unlikely to be what we want.
> 
> No, it does not.  It takes the low 32 bits of the source reg, duplicated
> to the top half as well, then rotated, then ANDed with the mask (which
> can wrap around).  This isn't very often very useful, but :-)
> 
> (One useful operation is splatting 32 bits to both halves of a 64-bit
> register, which is just rlwinm d,s,0,1,0).
> 
> If you only look at the low 32 bits, it does exactly the same as on
> 32-bit implementations.
> 
>> >>For the time being only
>> >>handle the single bit case, which we detect by checking whether the
>> >>mask is a power of two.
>> >
>> >You could look at rs6000_is_valid_mask in GCC:
>> >   <https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=gcc/config/rs6000/rs6000.c;h=48b8efd732b251c059628096314848305deb0c0b;hb=HEAD#l11148>
>> >used by rs6000_is_valid_and_mask immediately after it.  You probably
>> >want to allow only rlwinm in your case, and please note this checks if
>> >something is a valid mask, not the inverse of a valid mask (as you
>> >want here).
>> 
>> This check looks more complex than what I need. It is used for both rlw...
>> and rld..., and it calculates the operands.  The only thing I need is to
>> validate the mask.
> 
> It has to do exactly the same thing for rlwinm as for all 64-bit
> variants (rldicl, rldicr, rldic).
> 
> One side effect of calculating the bit positions with exact_log2 is that
> it returns a negative value if the argument is not a power of two.
> 
> Here is a simpler way, that handles all cases:  input in "u32 val":
> 
> 	if (!val)
> 		return nonono;
> 	if (val & 1)
> 		val = ~val;	// make the mask non-wrapping
> 	val += val & -val;	// adding the low set bit should result in
> 				// at most one bit set
> 	if (!(val & (val - 1)))
> 		return okidoki_all_good;
> 
>> I found a way: by ANDing the mask with the complement of itself rotated left
>> by one bit, we identify the transitions from 0 to 1. If the result is a
>> power of 2, it means there's only one transition, so the mask is as expected.
> 
> That does not handle all cases (it misses all bits set at least).  Which
> isn't all that interesting of course, but is a valid mask (but won't
> clear any bits, so not too interesting for your specific case :-) )

Would be nice if we could let the compiler deal with it all...

static inline unsigned long lr(unsigned long *mem)
{
        unsigned long val;

        /*
         * This doesn't clobber memory but want to avoid memory operations
         * moving ahead of it
         */
        asm volatile("ldarx     %0, %y1" : "=r"(val) : "Z"(*mem) : "memory");

        return val;
}

static inline bool stc(unsigned long *mem, unsigned long val)
{
        /*
         * This doesn't really clobber memory but same as above, also can't
         * specify output in asm goto.
         */
        asm volatile goto(
                "stdcx. %0, %y1 \n\t"
                "bne-   %l[fail]        \n\t"
                : : "r"(val), "Z"(*mem) : "cr0", "memory" : fail);

        return true;
fail: __attribute__((cold))
        return false;
}

static inline void atomic_add(unsigned long *mem, unsigned long val)
{
        unsigned long old, new;

        do {
                old = lr(mem);
                new = old + val;
        } while (unlikely(!stc(mem, new)));
}
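
A clear_bits along the same lines would look like this (a sketch, not
part of the proposal above):

static inline void atomic_clear_bits(unsigned long *mem, unsigned long mask)
{
        unsigned long old;

        do {
                old = lr(mem);
        } while (unlikely(!stc(mem, old & ~mask)));
}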
Segher Boessenkool April 14, 2021, 12:24 p.m. UTC | #5
On Wed, Apr 14, 2021 at 12:01:21PM +1000, Nicholas Piggin wrote:
> Would be nice if we could let the compiler deal with it all...
> 
> static inline unsigned long lr(unsigned long *mem)
> {
>         unsigned long val;
> 
>         /*
>          * This doesn't clobber memory but want to avoid memory operations
>          * moving ahead of it
>          */
>         asm volatile("ldarx     %0, %y1" : "=r"(val) : "Z"(*mem) : "memory");
> 
>         return val;
> }

(etc.)

That can not work reliably: the compiler can put random instructions
between the larx and stcx. this way, and you then do not have guaranteed
forward progress anymore.  It can put the two in different routines
(after inlining and other interprocedural optimisations), duplicate
them, make a different number of copies of them, etc.

Nothing of that is okay if you want to guarantee forward progress on all
implementations, and also not if you want to have good performance
everywhere (or anywhere even).  Unfortunately you have to write all
larx/stcx. loops as one block of assembler, so that you know exactly
what instructions will end up in your binary.

If you don't, it will fail mysteriously after random recompilations, or
have performance degradations, etc.  You don't want to go there :-)


Segher
Christophe Leroy April 14, 2021, 12:42 p.m. UTC | #6
On 14/04/2021 at 14:24, Segher Boessenkool wrote:
> On Wed, Apr 14, 2021 at 12:01:21PM +1000, Nicholas Piggin wrote:
>> Would be nice if we could let the compiler deal with it all...
>>
>> static inline unsigned long lr(unsigned long *mem)
>> {
>>          unsigned long val;
>>
>>          /*
>>           * This doesn't clobber memory but want to avoid memory operations
>>           * moving ahead of it
>>           */
>>          asm volatile("ldarx     %0, %y1" : "=r"(val) : "Z"(*mem) : "memory");
>>
>>          return val;
>> }
> 
> (etc.)
> 
> That can not work reliably: the compiler can put random instructions
> between the larx and stcx. this way, and you then do not have guaranteed
> forward progress anymore.  It can put the two in different routines
> (after inlining and other interprocedural optimisations), duplicate
> them, make a different number of copies of them, etc.
> 
> Nothing of that is okay if you want to guarantee forward progress on all
> implementations, and also not if you want to have good performance
> everywhere (or anywhere even).  Unfortunately you have to write all
> larx/stcx. loops as one block of assembler, so that you know exactly
> what instructions will end up in your binary.
> 
> If you don't, it will fail mysteriously after random recompilations, or
> have performance degradations, etc.  You don't want to go there :-)
> 

Could the kernel use GCC builtin atomic functions instead ?

https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
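
For illustration, set/clear bits via those builtins might look like this
(a sketch, not existing kernel code):

static inline void builtin_set_bits(unsigned long mask, unsigned long *p)
{
        __atomic_fetch_or(p, mask, __ATOMIC_RELAXED);
}

static inline void builtin_clear_bits(unsigned long mask, unsigned long *p)
{
        __atomic_fetch_and(p, ~mask, __ATOMIC_RELAXED);
}

The ordering variants (e.g. clear_bits_unlock) would map onto the memory
order argument, e.g. __ATOMIC_RELEASE.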
Segher Boessenkool April 14, 2021, 3:19 p.m. UTC | #7
On Wed, Apr 14, 2021 at 02:42:51PM +0200, Christophe Leroy wrote:
> On 14/04/2021 at 14:24, Segher Boessenkool wrote:
> >On Wed, Apr 14, 2021 at 12:01:21PM +1000, Nicholas Piggin wrote:
> >>Would be nice if we could let the compiler deal with it all...
> >>
> >>static inline unsigned long lr(unsigned long *mem)
> >>{
> >>         unsigned long val;
> >>
> >>         /*
> >>          * This doesn't clobber memory but want to avoid memory 
> >>          operations
> >>          * moving ahead of it
> >>          */
> >>         asm volatile("ldarx     %0, %y1" : "=r"(val) : "Z"(*mem) : 
> >>         "memory");
> >>
> >>         return val;
> >>}
> >
> >(etc.)
> >
> >That can not work reliably: the compiler can put random instructions
> >between the larx and stcx. this way, and you then do not have guaranteed
> >forward progress anymore.  It can put the two in different routines
> >(after inlining and other interprocedural optimisations), duplicate
> >them, make a different number of copies of them, etc.
> >
> >Nothing of that is okay if you want to guarantee forward progress on all
> >implementations, and also not if you want to have good performance
> >everywhere (or anywhere even).  Unfortunately you have to write all
> >larx/stcx. loops as one block of assembler, so that you know exactly
> >what instructions will end up in your binary.
> >
> >If you don't, it will fail mysteriously after random recompilations, or
> >have performance degradations, etc.  You don't want to go there :-)
> >
> 
> Could the kernel use GCC builtin atomic functions instead ?
> 
> https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html

Certainly that should work fine for the simpler cases that the atomic
operations are meant to provide.  But esp. for not-so-simple cases the
kernel may require some behaviour provided by the existing assembler
implementation, and not by the atomic builtins.

I'm not saying this cannot work, just that some serious testing will be
needed.  If it works it should be the best of all worlds, so then it is
a really good idea yes :-)


Segher
David Laight April 14, 2021, 3:32 p.m. UTC | #8
From: Segher Boessenkool
> Sent: 14 April 2021 16:19
...
> > Could the kernel use GCC builtin atomic functions instead ?
> >
> > https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
> 
> Certainly that should work fine for the simpler cases that the atomic
> operations are meant to provide.  But esp. for not-so-simple cases the
> kernel may require some behaviour provided by the existing assembler
> implementation, and not by the atomic builtins.
> 
> I'm not saying this cannot work, just that some serious testing will be
> needed.  If it works it should be the best of all worlds, so then it is
> a really good idea yes :-)

I suspect they just add an extra layer of abstraction that makes it
even more difficult to verify and could easily get broken by a compiler
update (etc).

The other issue is that the code needs to be correct when compiled
with (for example) -O0.
That could very easily break anything except the asm implementation
if additional memory accesses and/or increased code size cause grief.

	David

Segher Boessenkool April 14, 2021, 5:20 p.m. UTC | #9
On Wed, Apr 14, 2021 at 03:32:04PM +0000, David Laight wrote:
> From: Segher Boessenkool
> > Sent: 14 April 2021 16:19
> ...
> > > Could the kernel use GCC builtin atomic functions instead ?
> > >
> > > https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
> > 
> > Certainly that should work fine for the simpler cases that the atomic
> > operations are meant to provide.  But esp. for not-so-simple cases the
> > kernel may require some behaviour provided by the existing assembler
> > implementation, and not by the atomic builtins.
> > 
> > I'm not saying this cannot work, just that some serious testing will be
> > needed.  If it works it should be the best of all worlds, so then it is
> > a really good idea yes :-)
> 
> I suspect they just add an extra layer of abstraction that makes it
> even more difficult to verify and could easily get broken by a compiler
> update (etc).

I would say it uses an existing facility, instead of creating a kernel-
specific one.

> The other issue is that the code needs to be correct when compiled
> with (for example) -O0.
> That could very easily break anything except the asm implementation
> if additional memory accesses and/or increased code size cause grief.

The compiler generates correct code.  New versions of the compiler or
old, -O0 or not, under any phase of the moon.

Of course sometimes the compiler is broken, but there are pre-existing
ways of dealing with that, and there is no reason at all to think this
would break more often than random other code.


Segher

Patch

diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 299ab33505a6..0b0c6bdd9be9 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -71,19 +71,49 @@  static inline void fn(unsigned long mask,	\
 	__asm__ __volatile__ (			\
 	prefix					\
 "1:"	PPC_LLARX(%0,0,%3,0) "\n"		\
-	stringify_in_c(op) "%0,%0,%2\n"		\
+	#op "%I2 %0,%0,%2\n"			\
 	PPC_STLCX "%0,0,%3\n"			\
 	"bne- 1b\n"				\
 	: "=&r" (old), "+m" (*p)		\
-	: "r" (mask), "r" (p)			\
+	: "rK" (mask), "r" (p)			\
 	: "cc", "memory");			\
 }
 
 DEFINE_BITOP(set_bits, or, "")
-DEFINE_BITOP(clear_bits, andc, "")
-DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER)
 DEFINE_BITOP(change_bits, xor, "")
 
+#define DEFINE_CLROP(fn, prefix)					\
+static inline void fn(unsigned long mask, volatile unsigned long *_p)	\
+{									\
+	unsigned long old;						\
+	unsigned long *p = (unsigned long *)_p;				\
+	if (IS_ENABLED(CONFIG_PPC32) &&					\
+	    __builtin_constant_p(mask) && !(mask & (mask - 1))) {	\
+		asm volatile (						\
+			prefix						\
+		"1:"	"lwarx	%0,0,%3\n"				\
+			"rlwinm	%0,%0,0,%2\n"				\
+			"stwcx.	%0,0,%3\n"				\
+			"bne- 1b\n"					\
+			: "=&r" (old), "+m" (*p)			\
+			: "i" (~mask), "r" (p)				\
+			: "cc", "memory");				\
+	} else {							\
+		asm volatile (						\
+			prefix						\
+		"1:"	PPC_LLARX(%0,0,%3,0) "\n"			\
+			"andc %0,%0,%2\n"				\
+			PPC_STLCX "%0,0,%3\n"				\
+			"bne- 1b\n"					\
+			: "=&r" (old), "+m" (*p)			\
+			: "r" (mask), "r" (p)				\
+			: "cc", "memory");				\
+	}								\
+}
+
+DEFINE_CLROP(clear_bits, "")
+DEFINE_CLROP(clear_bits_unlock, PPC_RELEASE_BARRIER)
+
 static inline void arch_set_bit(int nr, volatile unsigned long *addr)
 {
 	set_bits(BIT_MASK(nr), addr + BIT_WORD(nr));
@@ -116,12 +146,12 @@  static inline unsigned long fn(			\
 	__asm__ __volatile__ (				\
 	prefix						\
 "1:"	PPC_LLARX(%0,0,%3,eh) "\n"			\
-	stringify_in_c(op) "%1,%0,%2\n"			\
+	#op "%I2 %1,%0,%2\n"				\
 	PPC_STLCX "%1,0,%3\n"				\
 	"bne- 1b\n"					\
 	postfix						\
 	: "=&r" (old), "=&r" (t)			\
-	: "r" (mask), "r" (p)				\
+	: "rK" (mask), "r" (p)				\
 	: "cc", "memory");				\
 	return (old & mask);				\
 }
@@ -130,11 +160,42 @@  DEFINE_TESTOP(test_and_set_bits, or, PPC_ATOMIC_ENTRY_BARRIER,
 	      PPC_ATOMIC_EXIT_BARRIER, 0)
 DEFINE_TESTOP(test_and_set_bits_lock, or, "",
 	      PPC_ACQUIRE_BARRIER, 1)
-DEFINE_TESTOP(test_and_clear_bits, andc, PPC_ATOMIC_ENTRY_BARRIER,
-	      PPC_ATOMIC_EXIT_BARRIER, 0)
 DEFINE_TESTOP(test_and_change_bits, xor, PPC_ATOMIC_ENTRY_BARRIER,
 	      PPC_ATOMIC_EXIT_BARRIER, 0)
 
+static inline unsigned long test_and_clear_bits(unsigned long mask, volatile unsigned long *_p)
+{
+	unsigned long old, t;
+	unsigned long *p = (unsigned long *)_p;
+
+	if (IS_ENABLED(CONFIG_PPC32) &&
+	    __builtin_constant_p(mask) && !(mask & (mask - 1))) {
+		asm volatile (
+			PPC_ATOMIC_ENTRY_BARRIER
+		"1:"	PPC_LLARX(%0,0,%3,0) "\n"
+			"rlwinm	%1,%0,0,%2\n"
+			PPC_STLCX "%1,0,%3\n"
+			"bne- 1b\n"
+			PPC_ATOMIC_EXIT_BARRIER
+			: "=&r" (old), "=&r" (t)
+			: "i" (~mask), "r" (p)
+			: "cc", "memory");
+	} else {
+		asm volatile (
+			PPC_ATOMIC_ENTRY_BARRIER
+		"1:"	PPC_LLARX(%0,0,%3,0) "\n"
+			"andc	%1,%0,%2\n"
+			PPC_STLCX "%1,0,%3\n"
+			"bne- 1b\n"
+			PPC_ATOMIC_EXIT_BARRIER
+			: "=&r" (old), "=&r" (t)
+			: "r" (mask), "r" (p)
+			: "cc", "memory");
+	}
+
+	return (old & mask);
+}
+
 static inline int arch_test_and_set_bit(unsigned long nr,
 					volatile unsigned long *addr)
 {