Patchwork RFC: powerpc: expose the multi-bit ops that underlie single-bit ops.

login
register
mail settings
Submitter Geoff Thorpe
Date May 26, 2009, 6:19 p.m.
Message ID <1243361946-6771-1-git-send-email-Geoff.Thorpe@freescale.com>
Download mbox | patch
Permalink /patch/27668/
State Changes Requested, archived
Headers show

Comments

Geoff Thorpe - May 26, 2009, 6:19 p.m.
NOT FOR COMMIT, THIS IS A REQUEST FOR FEEDBACK.

The bitops.h functions that operate on a single bit in a bitfield are
implemented by operating on the corresponding word location. In all cases
the inner logic appears to be valid if the mask being applied has more
than one bit set, so this patch exposes those inner operations. Indeed,
set_bits() was already available, but it duplicated code from set_bit()
(rather than making the latter a wrapper) - it was also missing the
PPC405_ERR77() workaround and the "volatile" address qualifier present in
other APIs. This corrects that, and exposes the other multi-bit
equivalents.

One advantage of these multi-bit forms is that they allow word-sized
variables to essentially be their own spinlocks.

NB, the same factoring is possible in asm-generic/bitops/[non-]atomic.h.
I would be happy to provide corresponding patches if this approach is
deemed appropriate. Feedback would be most welcome.

Signed-off-by: Geoff Thorpe <Geoff.Thorpe@freescale.com>
---
 arch/powerpc/include/asm/bitops.h |  111 +++++++++++++++++++++++--------------
 1 files changed, 69 insertions(+), 42 deletions(-)
Benjamin Herrenschmidt - June 16, 2009, 3:53 a.m.
On Tue, 2009-05-26 at 14:19 -0400, Geoff Thorpe wrote:
> NOT FOR COMMIT, THIS IS A REQUEST FOR FEEDBACK.
> 
> The bitops.h functions that operate on a single bit in a bitfield are
> implemented by operating on the corresponding word location. In all cases
> the inner logic appears to be valid if the mask being applied has more
> than one bit set, so this patch exposes those inner operations. Indeed,
> set_bits() was already available, but it duplicated code from set_bit()
> (rather than making the latter a wrapper) - it was also missing the
> PPC405_ERR77() workaround and the "volatile" address qualifier present in
> other APIs. This corrects that, and exposes the other multi-bit
> equivalents.
> 
> One advantage of these multi-bit forms is that they allow word-sized
> variables to essentially be their own spinlocks.

Hi ! Sorry for the delay, that was on my "have a look one of these days,
low priority" list for a bit too long :-)

> NB, the same factoring is possible in asm-generic/bitops/[non-]atomic.h.
> I would be happy to provide corresponding patches if this approach is
> deemed appropriate. Feedback would be most welcome.
> 
> Signed-off-by: Geoff Thorpe <Geoff.Thorpe@freescale.com>
> ---
>  arch/powerpc/include/asm/bitops.h |  111 +++++++++++++++++++++++--------------
>  1 files changed, 69 insertions(+), 42 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
> index 897eade..72de28c 100644
> --- a/arch/powerpc/include/asm/bitops.h
> +++ b/arch/powerpc/include/asm/bitops.h
> @@ -56,11 +56,10 @@
>  #define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
>  #define BITOP_LE_SWIZZLE	((BITS_PER_LONG-1) & ~0x7)
>  
> -static __inline__ void set_bit(int nr, volatile unsigned long *addr)
> +static __inline__ void set_bits(unsigned long mask, volatile unsigned long *_p)
>  {
>  	unsigned long old;
> -	unsigned long mask = BITOP_MASK(nr);
> -	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
> +	unsigned long *p = (unsigned long *)_p;
>  
>  	__asm__ __volatile__(
>  "1:"	PPC_LLARX "%0,0,%3	# set_bit\n"
> @@ -73,11 +72,16 @@ static __inline__ void set_bit(int nr, volatile unsigned long *addr)
>  	: "cc" );
>  }
>  
> -static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
> +static __inline__ void set_bit(int nr, volatile unsigned long *addr)
> +{
> +	set_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr));
> +}

No objection with the above.

> +static __inline__ void clear_bits(unsigned long mask,
> +				volatile unsigned long *_p)
>  {
>  	unsigned long old;
> -	unsigned long mask = BITOP_MASK(nr);
> -	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
> +	unsigned long *p = (unsigned long *)_p;
>  
>  	__asm__ __volatile__(
>  "1:"	PPC_LLARX "%0,0,%3	# clear_bit\n"
> @@ -90,11 +94,16 @@ static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
>  	: "cc" );
>  }
>  
> -static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr)
> +static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
> +{
> +	clear_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr));
> +}

Looks good too.

> +static __inline__ void clear_bits_unlock(unsigned long mask,
> +					volatile unsigned long *_p)
>  {
>  	unsigned long old;
> -	unsigned long mask = BITOP_MASK(nr);
> -	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
> +	unsigned long *p = (unsigned long *)_p;
>  
>  	__asm__ __volatile__(
>  	LWSYNC_ON_SMP
> @@ -108,11 +117,16 @@ static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr)
>  	: "cc", "memory");
>  }
>  
> -static __inline__ void change_bit(int nr, volatile unsigned long *addr)
> +static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr)
> +{
> +	clear_bits_unlock(BITOP_MASK(nr), addr + BITOP_WORD(nr));
> +}

I'm not sure it's useful to provide a multi-bit variant of the
"lock" and "unlock" primitives. Do other archs do ?

> +static __inline__ void change_bits(unsigned long mask,
> +				volatile unsigned long *_p)
>  {
>  	unsigned long old;
> -	unsigned long mask = BITOP_MASK(nr);
> -	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
> +	unsigned long *p = (unsigned long *)_p;
>  
>  	__asm__ __volatile__(
>  "1:"	PPC_LLARX "%0,0,%3	# change_bit\n"
> @@ -125,12 +139,16 @@ static __inline__ void change_bit(int nr, volatile unsigned long *addr)
>  	: "cc" );
>  }
>  
> -static __inline__ int test_and_set_bit(unsigned long nr,
> -				       volatile unsigned long *addr)
> +static __inline__ void change_bit(int nr, volatile unsigned long *addr)
> +{
> +	change_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr));
> +}

Ah, patch is getting confused between change_bit and
test_and_set_bit :-)

Now, you know what I'm thinking is ... Those are all the same except
for:

 - Barriers for lock and unlock variants
 - Barriers for "return" (aka test_and_set) variants
 - Actual op done on the mask

Maybe we can shrink that file significantly (and avoid the risk for
typos etc...) by generating them all from a macro.

Something like (typed directly into the mailer :-)

#define DEFINE_BITOP(op, prefix, postfix) \
	asm volatile (			  \
	prefix				  \
"1:"    PPC_LLARX "%0,0,%3\n"		  \
	__stringify(op) "%1,%0,%2\n"	  \
	PPC405_ERR77(0,%3)		  \
	PPC_STLCX "%1,0,%3\n"		  \
	"bne- 1b\n"			  \
	postfix				  \
	 : "=&r" (old), "=&r" (t)
	 : "r" (mask), "r" (p)
	 : "cc", "memory")

and so:

static inline void set_bits(unsigned long mask, volatile unsigned long *addr)
{
	unsigned long old, t;

	DEFINE_BITOP(or, "", "");
}

static inline void test_and_set_bits(unsigned long mask, volatile unsigned long *addr)
{
	unsigned long old, t;

	DEFINE_BITOP(or, LWSYNC_ON_SMP, ISYNC_ON_SMP);

	return (old & mask) != 0;
}

etc...

Cheers,
Ben.
Geoff Thorpe - June 16, 2009, 2:28 p.m.
Thanks for taking the time to look at this Ben, comments inline.

Benjamin Herrenschmidt wrote:
> On Tue, 2009-05-26 at 14:19 -0400, Geoff Thorpe wrote:
>> NOT FOR COMMIT, THIS IS A REQUEST FOR FEEDBACK.
>>
>> The bitops.h functions that operate on a single bit in a bitfield are
>> implemented by operating on the corresponding word location. In all cases
>> the inner logic appears to be valid if the mask being applied has more
>> than one bit set, so this patch exposes those inner operations. Indeed,
>> set_bits() was already available, but it duplicated code from set_bit()
>> (rather than making the latter a wrapper) - it was also missing the
>> PPC405_ERR77() workaround and the "volatile" address qualifier present in
>> other APIs. This corrects that, and exposes the other multi-bit
>> equivalents.
>>
>> One advantage of these multi-bit forms is that they allow word-sized
>> variables to essentially be their own spinlocks.
> 
> Hi ! Sorry for the delay, that was on my "have a look one of these days,
> low priority" list for a bit too long :-)

NP, optimal throughput often requires a compromise in latency :-)

> 
>> NB, the same factoring is possible in asm-generic/bitops/[non-]atomic.h.
>> I would be happy to provide corresponding patches if this approach is
>> deemed appropriate. Feedback would be most welcome.
>>
>> Signed-off-by: Geoff Thorpe <Geoff.Thorpe@freescale.com>
>> ---
>>  arch/powerpc/include/asm/bitops.h |  111 +++++++++++++++++++++++--------------
>>  1 files changed, 69 insertions(+), 42 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
>> index 897eade..72de28c 100644
>> --- a/arch/powerpc/include/asm/bitops.h
>> +++ b/arch/powerpc/include/asm/bitops.h
>> @@ -56,11 +56,10 @@
>>  #define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
>>  #define BITOP_LE_SWIZZLE	((BITS_PER_LONG-1) & ~0x7)
>>  
>> -static __inline__ void set_bit(int nr, volatile unsigned long *addr)
>> +static __inline__ void set_bits(unsigned long mask, volatile unsigned long *_p)
>>  {
>>  	unsigned long old;
>> -	unsigned long mask = BITOP_MASK(nr);
>> -	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
>> +	unsigned long *p = (unsigned long *)_p;
>>  
>>  	__asm__ __volatile__(
>>  "1:"	PPC_LLARX "%0,0,%3	# set_bit\n"
>> @@ -73,11 +72,16 @@ static __inline__ void set_bit(int nr, volatile unsigned long *addr)
>>  	: "cc" );
>>  }
>>  
>> -static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
>> +static __inline__ void set_bit(int nr, volatile unsigned long *addr)
>> +{
>> +	set_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr));
>> +}
> 
> No objection with the above.
> 
>> +static __inline__ void clear_bits(unsigned long mask,
>> +				volatile unsigned long *_p)
>>  {
>>  	unsigned long old;
>> -	unsigned long mask = BITOP_MASK(nr);
>> -	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
>> +	unsigned long *p = (unsigned long *)_p;
>>  
>>  	__asm__ __volatile__(
>>  "1:"	PPC_LLARX "%0,0,%3	# clear_bit\n"
>> @@ -90,11 +94,16 @@ static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
>>  	: "cc" );
>>  }
>>  
>> -static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr)
>> +static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
>> +{
>> +	clear_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr));
>> +}
> 
> Looks good too.
> 
>> +static __inline__ void clear_bits_unlock(unsigned long mask,
>> +					volatile unsigned long *_p)
>>  {
>>  	unsigned long old;
>> -	unsigned long mask = BITOP_MASK(nr);
>> -	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
>> +	unsigned long *p = (unsigned long *)_p;
>>  
>>  	__asm__ __volatile__(
>>  	LWSYNC_ON_SMP
>> @@ -108,11 +117,16 @@ static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr)
>>  	: "cc", "memory");
>>  }
>>  
>> -static __inline__ void change_bit(int nr, volatile unsigned long *addr)
>> +static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr)
>> +{
>> +	clear_bits_unlock(BITOP_MASK(nr), addr + BITOP_WORD(nr));
>> +}
> 
> I'm not sure it's useful to provide a multi-bit variant of the
> "lock" and "unlock" primitives. Do other archs do ?

For clear_bit_unlock(), no they don't appear to. There is a fallback in
include/asm-generic though, and I notice that it's used in a few places,
eg. drivers/rtc/rtc-dev. OTOH some other archs appear to provide their
own test_and_set_bit_lock(), and there's a fallback in
includes/asm-generic for that too.

Do you see a reason to isolate either of these and not factor out the
inner word-based logic?

> 
>> +static __inline__ void change_bits(unsigned long mask,
>> +				volatile unsigned long *_p)
>>  {
>>  	unsigned long old;
>> -	unsigned long mask = BITOP_MASK(nr);
>> -	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
>> +	unsigned long *p = (unsigned long *)_p;
>>  
>>  	__asm__ __volatile__(
>>  "1:"	PPC_LLARX "%0,0,%3	# change_bit\n"
>> @@ -125,12 +139,16 @@ static __inline__ void change_bit(int nr, volatile unsigned long *addr)
>>  	: "cc" );
>>  }
>>  
>> -static __inline__ int test_and_set_bit(unsigned long nr,
>> -				       volatile unsigned long *addr)
>> +static __inline__ void change_bit(int nr, volatile unsigned long *addr)
>> +{
>> +	change_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr));
>> +}
> 
> Ah, patch is getting confused between change_bit and
> test_and_set_bit :-)

The diff machinery off-by-one'd the entire file, which makes the patch a
little more difficult to review. But I didn't feel like (further)
massacring the code in order to improve the diff. :-)

> 
> Now, you know what I'm thinking is ... Those are all the same except
> for:
> 
>  - Barriers for lock and unlock variants
>  - Barriers for "return" (aka test_and_set) variants
>  - Actual op done on the mask

Yup, believe it or not, I saw this coming but didn't have the guts to
try proposing something like it up-front (in particular, I was wary of
botching some subtleties in the assembly).

> 
> Maybe we can shrink that file significantly (and avoid the risk for
> typos etc...) by generating them all from a macro.
> 
> Something like (typed directly into the mailer :-)
> 
> #define DEFINE_BITOP(op, prefix, postfix) \
> 	asm volatile (			  \
> 	prefix				  \
> "1:"    PPC_LLARX "%0,0,%3\n"		  \
> 	__stringify(op) "%1,%0,%2\n"	  \
> 	PPC405_ERR77(0,%3)		  \
> 	PPC_STLCX "%1,0,%3\n"		  \
> 	"bne- 1b\n"			  \
> 	postfix				  \
> 	 : "=&r" (old), "=&r" (t)
> 	 : "r" (mask), "r" (p)
> 	 : "cc", "memory")
> 
> and so:
> 
> static inline void set_bits(unsigned long mask, volatile unsigned long *addr)
> {
> 	unsigned long old, t;
> 
> 	DEFINE_BITOP(or, "", "");
> }
> 
> static inline void test_and_set_bits(unsigned long mask, volatile unsigned long *addr)
> {
> 	unsigned long old, t;
> 
> 	DEFINE_BITOP(or, LWSYNC_ON_SMP, ISYNC_ON_SMP);
> 
> 	return (old & mask) != 0;
> }
> 
> etc...


Sounds good, I'll try working this up and I'll send a new patch shortly.

So can I assume implicitly that changing the set_bits() function to add
the 'volatile' qualifier to the prototype (and the missing
PPC405_ERR77() workaround) was OK?

Also - any opinion on whether the same re-factoring of the asm-generic
versions should be undertaken? I'm not looking to bite off more than I
can chew, but I don't know if it's frowned upon to make powerpc-only
extensions to the API. And if you think an asm-generic patch makes
sense, could that be taken via linuxppc-dev too or does it need to go
elsewhere?

Thanks again,
Geoff
Benjamin Herrenschmidt - June 16, 2009, 9:33 p.m.
On Tue, 2009-06-16 at 10:28 -0400, Geoff Thorpe wrote:

> > Hi ! Sorry for the delay, that was on my "have a look one of these days,
> > low priority" list for a bit too long :-)
> 
> NP, optimal throughput often requires a compromise in latency :-)

Hehehe, so true :-)
 
> > I'm not sure it's useful to provide a multi-bit variant of the
> > "lock" and "unlock" primitives. Do other archs do ?
> 
> For clear_bit_unlock(), no they don't appear to. There is a fallback in
> include/asm-generic though, and I notice that it's used in a few places,
> eg. drivers/rtc/rtc-dev. OTOH some other archs appear to provide their
> own test_and_set_bit_lock(), and there's a fallback in
> includes/asm-generic for that too.
> 
> Do you see a reason to isolate either of these and not factor out the
> inner word-based logic?

Don't bother, especially if we are using a macro to generate them, we
may as well make them all look the same.

 .../...

> Yup, believe it or not, I saw this coming but didn't have the guts to
> try proposing something like it up-front (in particular, I was wary of
> botching some subtleties in the assembly).

Fair enough :-)

> > Maybe we can shrink that file significantly (and avoid the risk for
> > typos etc...) by generating them all from a macro.
> > 
> > Something like (typed directly into the mailer :-)
> > 
> > #define DEFINE_BITOP(op, prefix, postfix) \
> > 	asm volatile (			  \
> > 	prefix				  \
> > "1:"    PPC_LLARX "%0,0,%3\n"		  \
> > 	__stringify(op) "%1,%0,%2\n"	  \
> > 	PPC405_ERR77(0,%3)		  \
> > 	PPC_STLCX "%1,0,%3\n"		  \
> > 	"bne- 1b\n"			  \
> > 	postfix				  \
> > 	 : "=&r" (old), "=&r" (t)
> > 	 : "r" (mask), "r" (p)
> > 	 : "cc", "memory")
> > 
> > and so:
> > 
> > static inline void set_bits(unsigned long mask, volatile unsigned long *addr)
> > {
> > 	unsigned long old, t;
> > 
> > 	DEFINE_BITOP(or, "", "");
> > }
> > 
> > static inline void test_and_set_bits(unsigned long mask, volatile unsigned long *addr)
> > {
> > 	unsigned long old, t;
> > 
> > 	DEFINE_BITOP(or, LWSYNC_ON_SMP, ISYNC_ON_SMP);
> > 
> > 	return (old & mask) != 0;
> > }
> > 
> > etc...
> 
> 
> Sounds good, I'll try working this up and I'll send a new patch shortly.

You can also go totally mad and generate the whole function (both -s and
non -s variants) from one macro but I wouldn't go that far :-)

> So can I assume implicitly that changing the set_bits() function to add
> the 'volatile' qualifier to the prototype (and the missing
> PPC405_ERR77() workaround) was OK?

The PPC405_ERR77 workaround is definitely needed. The volatile, well, I
suspect it's useless, but it will remove warnings when callers call
these on something that is declared as volatile in the first place.

Do x86 use volatile there ? If not, then don't do it on powerpc neither,
it could well be an historical remain. It's not functionally useful, the
"memory" clobber in the asm takes care of telling the compiler not to
mess around I believe.

> Also - any opinion on whether the same re-factoring of the asm-generic
> versions should be undertaken? I'm not looking to bite off more than I
> can chew, but I don't know if it's frowned upon to make powerpc-only
> extensions to the API. And if you think an asm-generic patch makes
> sense, could that be taken via linuxppc-dev too or does it need to go
> elsewhere?

I'm not people care -that- much :-) You can always try and post it to
lkml (maybe linux-arch too) and see what comes back... but let's finish
the powerpc variant first :-)

Cheers,
Ben.
Stephen Rothwell - June 17, 2009, 1:07 a.m.
On Wed, 17 Jun 2009 07:33:46 +1000 Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
>
> You can also go totally mad and generate the whole function (both -s and
> non -s variants) from one macro but I wouldn't go that far :-)

Please don't (unless you keep the function names intact), it makes
finding the function with grep almost impossible ...

> > Also - any opinion on whether the same re-factoring of the asm-generic
> > versions should be undertaken? I'm not looking to bite off more than I
> > can chew, but I don't know if it's frowned upon to make powerpc-only
> > extensions to the API. And if you think an asm-generic patch makes
> > sense, could that be taken via linuxppc-dev too or does it need to go
> > elsewhere?
> 
> I'm not people care -that- much :-) You can always try and post it to
> lkml (maybe linux-arch too) and see what comes back... but let's finish
> the powerpc variant first :-)

Also, we have an asm-generic maintainer now ...
Geoff Thorpe - June 18, 2009, 8:30 p.m.
Hi Ben et al,

Benjamin Herrenschmidt wrote:
> On Tue, 2009-06-16 at 10:28 -0400, Geoff Thorpe wrote:

[snip]

>>> Maybe we can shrink that file significantly (and avoid the risk for
>>> typos etc...) by generating them all from a macro.
>>>
>>> Something like (typed directly into the mailer :-)
>>>
>>> #define DEFINE_BITOP(op, prefix, postfix) \
>>> 	asm volatile (			  \
>>> 	prefix				  \
>>> "1:"    PPC_LLARX "%0,0,%3\n"		  \
>>> 	__stringify(op) "%1,%0,%2\n"	  \
>>> 	PPC405_ERR77(0,%3)		  \
>>> 	PPC_STLCX "%1,0,%3\n"		  \
>>> 	"bne- 1b\n"			  \
>>> 	postfix				  \
>>> 	 : "=&r" (old), "=&r" (t)
>>> 	 : "r" (mask), "r" (p)
>>> 	 : "cc", "memory")
>>>
>>> and so:
>>>
>>> static inline void set_bits(unsigned long mask, volatile unsigned long *addr)
>>> {
>>> 	unsigned long old, t;
>>>
>>> 	DEFINE_BITOP(or, "", "");
>>> }
>>>
>>> static inline void test_and_set_bits(unsigned long mask, volatile unsigned long *addr)
>>> {
>>> 	unsigned long old, t;
>>>
>>> 	DEFINE_BITOP(or, LWSYNC_ON_SMP, ISYNC_ON_SMP);
>>>
>>> 	return (old & mask) != 0;
>>> }
>>>
>>> etc...
>>
>> Sounds good, I'll try working this up and I'll send a new patch shortly.
> 
> You can also go totally mad and generate the whole function (both -s and
> non -s variants) from one macro but I wouldn't go that far :-)

I've prepared a new patch, will send it in a moment. It uses two macros
rather than one - as the test_and_***() APIs have a fundamentally
different asm because of the arguments to 'op' as well as the output
operands. However, this split made it possible to generate the entire
"inner" (single-word) function using the macro, rather than just the
inline asm part.

> 
>> So can I assume implicitly that changing the set_bits() function to add
>> the 'volatile' qualifier to the prototype (and the missing
>> PPC405_ERR77() workaround) was OK?
> 
> The PPC405_ERR77 workaround is definitely needed. The volatile, well, I
> suspect it's useless, but it will remove warnings when callers call
> these on something that is declared as volatile in the first place.
> 
> Do x86 use volatile there ? If not, then don't do it on powerpc neither,
> it could well be an historical remain. It's not functionally useful, the
> "memory" clobber in the asm takes care of telling the compiler not to
> mess around I believe.

I've left the volatile qualifier in the generated API because I didn't
feel so comfortable changing APIs, but I also added the "memory" clobber
for all cases - whereas it seems the existing set_bits(), clear_bits(),
[...] functions didn't declare this... Do you see any issue with having
the 'volatile' in the prototype as well as the clobber in the asm?

Actually, might as well just respond to the new patch instead... :-) Thx.

Cheers,
Geoff
Benjamin Herrenschmidt - June 18, 2009, 10:22 p.m.
On Thu, 2009-06-18 at 16:30 -0400, Geoff Thorpe wrote:
> I've left the volatile qualifier in the generated API because I didn't
> feel so comfortable changing APIs, but I also added the "memory" clobber
> for all cases - whereas it seems the existing set_bits(), clear_bits(),
> [...] functions didn't declare this... Do you see any issue with having
> the 'volatile' in the prototype as well as the clobber in the asm?
> 
> Actually, might as well just respond to the new patch instead... :-) Thx.

I think the story with the memory clobber is that it depends whether
we consider the functions as ordering accesses or not (ie, can
potentially be used with lock/unlock semantics).

The general rule is that those who don't return anything don't need
to have those semantics, and thus could only be advertised as clobbering
p[word] -but- there are issues there. For example, despite the
(relatively new) official _lock/_unlock variants, there's still code
that abuses constructs like test_and_set_bit/clear_bit as locks and in
that case, clear bits needs a clobber.

So I would say at this stage better safe than having to track down
incredibly hard to find bugs, and let's make them all take that clobber.

Cheers,
Ben.
Geoff Thorpe - June 19, 2009, 3:59 a.m.
Benjamin Herrenschmidt wrote:
> On Thu, 2009-06-18 at 16:30 -0400, Geoff Thorpe wrote:
>> I've left the volatile qualifier in the generated API because I didn't
>> feel so comfortable changing APIs, but I also added the "memory" clobber
>> for all cases - whereas it seems the existing set_bits(), clear_bits(),
>> [...] functions didn't declare this... Do you see any issue with having
>> the 'volatile' in the prototype as well as the clobber in the asm?
>>
>> Actually, might as well just respond to the new patch instead... :-) Thx.
> 
> I think the story with the memory clobber is that it depends whether
> we consider the functions as ordering accesses or not (ie, can
> potentially be used with lock/unlock semantics).
> 
> The general rule is that those who don't return anything don't need
> to have those semantics, and thus could only be advertised as clobbering
> p[word] -but- there are issues there. For example, despite the
> (relatively new) official _lock/_unlock variants, there's still code
> that abuses constructs like test_and_set_bit/clear_bit as locks and in
> that case, clear bits needs a clobber.
> 
> So I would say at this stage better safe than having to track down
> incredibly hard to find bugs, and let's make them all take that clobber.

Well I'm tempted agree because I'm abusing these constructs in  exactly
the manner you describe. :-) However I didn't know that this was abuse
until you mentioned it. Some time ago I noticed that the bitops code was
very similar to spinlocks, and so I presumed that a bitops word could
act as its own spinlock (ie. rather than spinlocking access to a u32).
Now that I look at spinlocks again, I see the presence of those
CLEAR_IO_SYNC definitions in the function preambles - is that the
distinction I'm abusing? CLEAR_IO_SYNC appears to be undefined except on
64-bit, in which case it's "(get_paca()->io_sync = 0)".

W.r.t the _lock/_unlock variants on the bitops side, the "lock"
particulars appear to depend on LWSYNC_ON_SMP and ISYNC_ON_SMP, which
are "isync" and "lwsync" for all platforms IIUC. So it seems the locking
intentions here are different from that of spinlocks? Is there something
I can look at that describes what semantics these primitives (are
supposed to) guarantee? I may be assuming other things that I shouldn't
be ...

Cheers,
Geoff

Patch

diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 897eade..72de28c 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -56,11 +56,10 @@ 
 #define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
 #define BITOP_LE_SWIZZLE	((BITS_PER_LONG-1) & ~0x7)
 
-static __inline__ void set_bit(int nr, volatile unsigned long *addr)
+static __inline__ void set_bits(unsigned long mask, volatile unsigned long *_p)
 {
 	unsigned long old;
-	unsigned long mask = BITOP_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+	unsigned long *p = (unsigned long *)_p;
 
 	__asm__ __volatile__(
 "1:"	PPC_LLARX "%0,0,%3	# set_bit\n"
@@ -73,11 +72,16 @@  static __inline__ void set_bit(int nr, volatile unsigned long *addr)
 	: "cc" );
 }
 
-static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
+static __inline__ void set_bit(int nr, volatile unsigned long *addr)
+{
+	set_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr));
+}
+
+static __inline__ void clear_bits(unsigned long mask,
+				volatile unsigned long *_p)
 {
 	unsigned long old;
-	unsigned long mask = BITOP_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+	unsigned long *p = (unsigned long *)_p;
 
 	__asm__ __volatile__(
 "1:"	PPC_LLARX "%0,0,%3	# clear_bit\n"
@@ -90,11 +94,16 @@  static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
 	: "cc" );
 }
 
-static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr)
+static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
+{
+	clear_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr));
+}
+
+static __inline__ void clear_bits_unlock(unsigned long mask,
+					volatile unsigned long *_p)
 {
 	unsigned long old;
-	unsigned long mask = BITOP_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+	unsigned long *p = (unsigned long *)_p;
 
 	__asm__ __volatile__(
 	LWSYNC_ON_SMP
@@ -108,11 +117,16 @@  static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr)
 	: "cc", "memory");
 }
 
-static __inline__ void change_bit(int nr, volatile unsigned long *addr)
+static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr)
+{
+	clear_bits_unlock(BITOP_MASK(nr), addr + BITOP_WORD(nr));
+}
+
+static __inline__ void change_bits(unsigned long mask,
+				volatile unsigned long *_p)
 {
 	unsigned long old;
-	unsigned long mask = BITOP_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+	unsigned long *p = (unsigned long *)_p;
 
 	__asm__ __volatile__(
 "1:"	PPC_LLARX "%0,0,%3	# change_bit\n"
@@ -125,12 +139,16 @@  static __inline__ void change_bit(int nr, volatile unsigned long *addr)
 	: "cc" );
 }
 
-static __inline__ int test_and_set_bit(unsigned long nr,
-				       volatile unsigned long *addr)
+static __inline__ void change_bit(int nr, volatile unsigned long *addr)
+{
+	change_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr));
+}
+
+static __inline__ unsigned long test_and_set_bits(unsigned long mask,
+						volatile unsigned long *_p)
 {
 	unsigned long old, t;
-	unsigned long mask = BITOP_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+	unsigned long *p = (unsigned long *)_p;
 
 	__asm__ __volatile__(
 	LWSYNC_ON_SMP
@@ -144,15 +162,21 @@  static __inline__ int test_and_set_bit(unsigned long nr,
 	: "r" (mask), "r" (p)
 	: "cc", "memory");
 
-	return (old & mask) != 0;
+	return (old & mask);
 }
 
-static __inline__ int test_and_set_bit_lock(unsigned long nr,
+static __inline__ int test_and_set_bit(unsigned long nr,
 				       volatile unsigned long *addr)
 {
+	return test_and_set_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)) != 0;
+}
+
+static __inline__ unsigned long test_and_set_bits_lock(
+					unsigned long mask,
+					volatile unsigned long *_p)
+{
 	unsigned long old, t;
-	unsigned long mask = BITOP_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+	unsigned long *p = (unsigned long *)_p;
 
 	__asm__ __volatile__(
 "1:"	PPC_LLARX "%0,0,%3		# test_and_set_bit_lock\n"
@@ -165,15 +189,21 @@  static __inline__ int test_and_set_bit_lock(unsigned long nr,
 	: "r" (mask), "r" (p)
 	: "cc", "memory");
 
-	return (old & mask) != 0;
+	return (old & mask);
 }
 
-static __inline__ int test_and_clear_bit(unsigned long nr,
-					 volatile unsigned long *addr)
+static __inline__ int test_and_set_bit_lock(unsigned long nr,
+				       volatile unsigned long *addr)
+{
+	return test_and_set_bits_lock(BITOP_MASK(nr),
+				addr + BITOP_WORD(nr)) != 0;
+}
+
+static __inline__ unsigned long test_and_clear_bits(unsigned long mask,
+					volatile unsigned long *_p)
 {
 	unsigned long old, t;
-	unsigned long mask = BITOP_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+	unsigned long *p = (unsigned long *)_p;
 
 	__asm__ __volatile__(
 	LWSYNC_ON_SMP
@@ -187,15 +217,20 @@  static __inline__ int test_and_clear_bit(unsigned long nr,
 	: "r" (mask), "r" (p)
 	: "cc", "memory");
 
-	return (old & mask) != 0;
+	return (old & mask);
 }
 
-static __inline__ int test_and_change_bit(unsigned long nr,
-					  volatile unsigned long *addr)
+static __inline__ int test_and_clear_bit(unsigned long nr,
+					 volatile unsigned long *addr)
+{
+	return test_and_clear_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)) != 0;
+}
+
+static __inline__ unsigned long test_and_change_bits(unsigned long mask,
+						volatile unsigned long *_p)
 {
 	unsigned long old, t;
-	unsigned long mask = BITOP_MASK(nr);
-	unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
+	unsigned long *p = (unsigned long *)_p;
 
 	__asm__ __volatile__(
 	LWSYNC_ON_SMP
@@ -209,21 +244,13 @@  static __inline__ int test_and_change_bit(unsigned long nr,
 	: "r" (mask), "r" (p)
 	: "cc", "memory");
 
-	return (old & mask) != 0;
+	return (old & mask);
 }
 
-static __inline__ void set_bits(unsigned long mask, unsigned long *addr)
+static __inline__ int test_and_change_bit(unsigned long nr,
+					  volatile unsigned long *addr)
 {
-        unsigned long old;
-
-	__asm__ __volatile__(
-"1:"	PPC_LLARX "%0,0,%3         # set_bits\n"
-	"or	%0,%0,%2\n"
-	PPC_STLCX "%0,0,%3\n"
-	"bne-	1b"
-	: "=&r" (old), "+m" (*addr)
-	: "r" (mask), "r" (addr)
-	: "cc");
+	return test_and_change_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)) != 0;
 }
 
 #include <asm-generic/bitops/non-atomic.h>