[V4] powerpc: Implement {cmp}xchg for u8 and u16

Message ID 5720837D.6050807@linux.vnet.ibm.com (mailing list archive)
State Accepted

Commit Message

xinhui April 27, 2016, 9:16 a.m. UTC
From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

It works on all ppc.

Also remove the volatile qualifier from the first parameter of __cmpxchg_local and __cmpxchg.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
---
change from v3:
	rewrite in asm for the LL/SC.
	remove volatile in __cmpxchg_local and __cmpxchg.
change from v2:
	in the do{}while(), we save one load and use corresponding cmpxchg suffix.
	Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN 
change from V1:
	rework totally.
---
 arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 106 insertions(+), 3 deletions(-)
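
The idea throughout is to widen a 1- or 2-byte access to its aligned
32-bit container word, shift the operands to the byte's position inside
that word, and let the lwarx/stwcx. loop rewrite only the bits under the
mask. BITOFF_CAL computes that shift: e.g. on big-endian, a u8 at byte
offset 1 of its word sits at bit offset (4 - 1 - 1) * 8 = 16. The same
technique in plain C, as a rough sketch built on GCC's __atomic builtins
rather than the patch's inline asm (names here are illustrative only,
not from the patch):

#include <stdint.h>
#include <stdbool.h>

/* Sketch only: emulate cmpxchg on a u8 with a compare-and-swap on the
 * aligned u32 word that contains it. */
static uint8_t cmpxchg_u8_sketch(uint8_t *p, uint8_t old, uint8_t new)
{
	uintptr_t off = (uintptr_t)p % sizeof(uint32_t);
	uint32_t *word = (uint32_t *)((uintptr_t)p - off);
#ifdef __BIG_ENDIAN__		/* predefined by GCC on BE targets */
	unsigned int bitoff = (sizeof(uint32_t) - sizeof(uint8_t) - off) * 8;
#else
	unsigned int bitoff = off * 8;
#endif
	uint32_t mask = 0xffu << bitoff;
	uint32_t w = __atomic_load_n(word, __ATOMIC_RELAXED);
	uint32_t n;

	do {
		if ((w & mask) != ((uint32_t)old << bitoff))
			break;		/* byte already differs: fail */
		n = (w & ~mask) | ((uint32_t)new << bitoff);
		/* on CAS failure, w is refreshed with the current word */
	} while (!__atomic_compare_exchange_n(word, &w, n, true,
					      __ATOMIC_SEQ_CST,
					      __ATOMIC_RELAXED));

	return (w & mask) >> bitoff;	/* previous byte value */
}

As with the lwarx/stwcx. version, a concurrent store to a *different*
byte of the same word merely forces a retry: the reservation (here, the
CAS) is word-granular.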

Comments

Boqun Feng April 27, 2016, 1:58 p.m. UTC | #1
On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> 
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
> 
> It works on all ppc.
> 
> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
> 
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> ---
> change from v3:
> 	rewrite in asm for the LL/SC.
> 	remove volatile in __cmpxchg_local and __cmpxchg.
> change from v2:
> 	in the do{}while(), we save one load and use corresponding cmpxchg suffix.
> 	Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN 
> change from V1:
> 	rework totally.
> ---
>  arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++-
>  1 file changed, 106 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
> index 44efe73..8a3735f 100644
> --- a/arch/powerpc/include/asm/cmpxchg.h
> +++ b/arch/powerpc/include/asm/cmpxchg.h
> @@ -7,6 +7,71 @@
>  #include <asm/asm-compat.h>
>  #include <linux/bug.h>
>  
> +#ifdef __BIG_ENDIAN
> +#define BITOFF_CAL(size, off)	((sizeof(u32) - size - off) * BITS_PER_BYTE)
> +#else
> +#define BITOFF_CAL(size, off)	(off * BITS_PER_BYTE)
> +#endif
> +
> +#define XCHG_GEN(type, sfx, cl)				\
> +static inline u32 __xchg_##type##sfx(void *p, u32 val)		\
> +{								\
> +	unsigned int prev, prev_mask, tmp, bitoff, off;		\
> +								\
> +	off = (unsigned long)p % sizeof(u32);			\
> +	bitoff = BITOFF_CAL(sizeof(type), off);			\
> +	p -= off;						\
> +	val <<= bitoff;						\
> +	prev_mask = (u32)(type)-1 << bitoff;			\
> +								\
> +	__asm__ __volatile__(					\
> +"1:	lwarx   %0,0,%3\n"					\
> +"	andc	%1,%0,%5\n"					\
> +"	or	%1,%1,%4\n"					\
> +	PPC405_ERR77(0,%3)					\
> +"	stwcx.	%1,0,%3\n"					\
> +"	bne-	1b\n"						\
> +	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)		\

I think we can save the "tmp" here by:

	__asm__ __volatile__(
"1:	lwarx	%0,0,%2\n"
"	andc	%0,%0,%4\n"
"	or	%0,%0,%3\n"
	PPC405_ERR77(0,%2)
"	stwcx.	%0,0,%2\n"
"	bne-	1b\n"
	: "=&r" (prev), "+m" (*(u32*)p)
	: "r" (p), "r" (val), "r" (prev_mask)
	: "cc", cl);

right?

> +	: "r" (p), "r" (val), "r" (prev_mask)			\
> +	: "cc", cl);						\
> +								\
> +	return prev >> bitoff;					\
> +}
> +
> +#define CMPXCHG_GEN(type, sfx, br, br2, cl)			\
> +static inline							\
> +u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new)		\
> +{								\
> +	unsigned int prev, prev_mask, tmp, bitoff, off;		\
> +								\
> +	off = (unsigned long)p % sizeof(u32);			\
> +	bitoff = BITOFF_CAL(sizeof(type), off);			\
> +	p -= off;						\
> +	old <<= bitoff;						\
> +	new <<= bitoff;						\
> +	prev_mask = (u32)(type)-1 << bitoff;			\
> +								\
> +	__asm__ __volatile__(					\
> +	br							\
> +"1:	lwarx   %0,0,%3\n"					\
> +"	and	%1,%0,%6\n"					\
> +"	cmpw	0,%1,%4\n"					\
> +"	bne-	2f\n"						\
> +"	andc	%1,%0,%6\n"					\
> +"	or	%1,%1,%5\n"					\
> +	PPC405_ERR77(0,%3)					\
> +"	stwcx.  %1,0,%3\n"					\
> +"	bne-    1b\n"						\
> +	br2							\
> +	"\n"							\
> +"2:"								\
> +	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)		\

And "tmp" here could also be saved by:

"1:	lwarx   %0,0,%2\n"					\
"	xor	%3,%0,%3\n"					\
"	and.	%3,%3,%5\n"					\
"	bne-	2f\n"						\
"	andc	%0,%0,%5\n"					\
"	or	%0,%0,%4\n"					\
	PPC405_ERR77(0,%2)					\
"	stwcx.  %0,0,%2\n"					\
"	bne-    1b\n"						\
	br2							\
	"\n"							\
"2:"								\
	: "=&r" (prev), "+m" (*(u32*)p)		\
	: "r" (p), "r" (old), "r" (new), "r" (prev_mask)	\
	: "cc", cl);						\

right?

IIUC, dropping the local variable "tmp" frees up a general-purpose
register for the compiler to use for other variables.

So thoughts?

Regards,
Boqun

> +	: "r" (p), "r" (old), "r" (new), "r" (prev_mask)	\
> +	: "cc", cl);						\
> +								\
> +	return prev >> bitoff;					\
> +}
> +
>  /*
>   * Atomic exchange
>   *
> @@ -14,6 +79,11 @@
>   * the previous value stored there.
>   */
>  
> +XCHG_GEN(u8, _local, "memory");
> +XCHG_GEN(u8, _relaxed, "cc");
> +XCHG_GEN(u16, _local, "memory");
> +XCHG_GEN(u16, _relaxed, "cc");
> +
>  static __always_inline unsigned long
>  __xchg_u32_local(volatile void *p, unsigned long val)
>  {
> @@ -85,9 +155,13 @@ __xchg_u64_relaxed(u64 *p, unsigned long val)
>  #endif
>  
>  static __always_inline unsigned long
> -__xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
> +__xchg_local(void *ptr, unsigned long x, unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __xchg_u8_local(ptr, x);
> +	case 2:
> +		return __xchg_u16_local(ptr, x);
>  	case 4:
>  		return __xchg_u32_local(ptr, x);
>  #ifdef CONFIG_PPC64
> @@ -103,6 +177,10 @@ static __always_inline unsigned long
>  __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __xchg_u8_relaxed(ptr, x);
> +	case 2:
> +		return __xchg_u16_relaxed(ptr, x);
>  	case 4:
>  		return __xchg_u32_relaxed(ptr, x);
>  #ifdef CONFIG_PPC64
> @@ -131,6 +209,15 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
>   * and return the old value of *p.
>   */
>  
> +CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
> +CMPXCHG_GEN(u8, _local, , , "memory");
> +CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
> +CMPXCHG_GEN(u8, _relaxed, , , "cc");
> +CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
> +CMPXCHG_GEN(u16, _local, , , "memory");
> +CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
> +CMPXCHG_GEN(u16, _relaxed, , , "cc");
> +
>  static __always_inline unsigned long
>  __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
>  {
> @@ -312,10 +399,14 @@ __cmpxchg_u64_acquire(u64 *p, unsigned long old, unsigned long new)
>  #endif
>  
>  static __always_inline unsigned long
> -__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
> +__cmpxchg(void *ptr, unsigned long old, unsigned long new,
>  	  unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __cmpxchg_u8(ptr, old, new);
> +	case 2:
> +		return __cmpxchg_u16(ptr, old, new);
>  	case 4:
>  		return __cmpxchg_u32(ptr, old, new);
>  #ifdef CONFIG_PPC64
> @@ -328,10 +419,14 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
>  }
>  
>  static __always_inline unsigned long
> -__cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
> +__cmpxchg_local(void *ptr, unsigned long old, unsigned long new,
>  	  unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __cmpxchg_u8_local(ptr, old, new);
> +	case 2:
> +		return __cmpxchg_u16_local(ptr, old, new);
>  	case 4:
>  		return __cmpxchg_u32_local(ptr, old, new);
>  #ifdef CONFIG_PPC64
> @@ -348,6 +443,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
>  		  unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __cmpxchg_u8_relaxed(ptr, old, new);
> +	case 2:
> +		return __cmpxchg_u16_relaxed(ptr, old, new);
>  	case 4:
>  		return __cmpxchg_u32_relaxed(ptr, old, new);
>  #ifdef CONFIG_PPC64
> @@ -364,6 +463,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
>  		  unsigned int size)
>  {
>  	switch (size) {
> +	case 1:
> +		return __cmpxchg_u8_acquire(ptr, old, new);
> +	case 2:
> +		return __cmpxchg_u16_acquire(ptr, old, new);
>  	case 4:
>  		return __cmpxchg_u32_acquire(ptr, old, new);
>  #ifdef CONFIG_PPC64
> -- 
> 2.4.3
>
Boqun Feng April 27, 2016, 2:16 p.m. UTC | #2
On Wed, Apr 27, 2016 at 09:58:17PM +0800, Boqun Feng wrote:
> [...]
> 
> And "tmp" here could also be saved by:
> 
> "1:	lwarx   %0,0,%2\n"					\
> "	xor	%3,%0,%3\n"					\
> "	and.	%3,%3,%5\n"					\
> "	bne-	2f\n"						\
> "	andc	%0,%0,%5\n"					\
> "	or	%0,%0,%4\n"					\
> 	PPC405_ERR77(0,%2)					\
> "	stwcx.  %0,0,%2\n"					\
> "	bne-    1b\n"						\
> 	br2							\
> 	"\n"							\
> "2:"								\
> 	: "=&r" (prev), "+m" (*(u32*)p)		\
> 	: "r" (p), "r" (old), "r" (new), "r" (prev_mask)	\
> 	: "cc", cl);						\
> 

Oops, this should be:

"1:	lwarx   %0,0,%3\n"					\
"	xor	%2,%0,%2\n"					\
"	and.	%2,%2,%5\n"					\
"	bne-	2f\n"						\
"	andc	%0,%0,%5\n"					\
"	or	%0,%0,%4\n"					\
	PPC405_ERR77(0,%3)					\
"	stwcx.  %0,0,%3\n"					\
"	bne-    1b\n"						\
	br2							\
	"\n"							\
"2:"								\
	: "=&r" (prev), "+m" (*(u32*)p), "+&r" (old)		\
	: "r" (p), "r" (new), "r" (prev_mask)			\
	: "cc", cl);						\

Regards,
Boqun
Boqun Feng April 27, 2016, 2:50 p.m. UTC | #3
On Wed, Apr 27, 2016 at 09:58:17PM +0800, Boqun Feng wrote:
> [...]
> 
> I think we can save the "tmp" here by:
> 
> 	__asm__ __volatile__(
> "1:	lwarx	%0,0,%2\n"
> "	andc	%0,%0,%4\n"
> "	or	%0,%0,%3\n"
> 	PPC405_ERR77(0,%2)
> "	stwcx.	%0,0,%2\n"
> "	bne-	1b\n"
> 	: "=&r" (prev), "+m" (*(u32*)p)
> 	: "r" (p), "r" (val), "r" (prev_mask)
> 	: "cc", cl);
> 
> right?
> 
> [...]
> 
> And "tmp" here could also be saved by:
> 
> "1:	lwarx   %0,0,%2\n"					\
> "	xor	%3,%0,%3\n"					\
> "	and.	%3,%3,%5\n"					\
> "	bne-	2f\n"						\
> "	andc	%0,%0,%5\n"					\
> "	or	%0,%0,%4\n"					\
> 	PPC405_ERR77(0,%2)					\
> "	stwcx.  %0,0,%2\n"					\
> "	bne-    1b\n"						\
> 	br2							\
> 	"\n"							\
> "2:"								\
> 	: "=&r" (prev), "+m" (*(u32*)p)		\
> 	: "r" (p), "r" (old), "r" (new), "r" (prev_mask)	\
> 	: "cc", cl);						\
> 
> right?
> 

Sorry, my bad, we can't implement cmpxchg like this.. please ignore
this, I should really go to bed soon...

But still, we can save the "tmp" for xchg() I think.

Regards,
Boqun

Boqun Feng April 27, 2016, 2:59 p.m. UTC | #4
On Wed, Apr 27, 2016 at 10:50:34PM +0800, Boqun Feng wrote:
> 
> Sorry, my bad, we can't implement cmpxchg like this.. please ignore
> this, I should really go to bed soon...
> 
> But still, we can save the "tmp" for xchg() I think.
> 

No.. we can't. Sorry for all the noise.

This patch looks good to me.

FWIW, you can add

Acked-by: Boqun Feng <boqun.feng@gmail.com>

Regards,
Boqun
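
A note on why neither suggested variant can work: both reuse %0 ("prev")
as the scratch register, so after the loop %0 holds the newly composed
word and "return prev >> bitoff" would return the new value instead of
the old one. The cmpxchg variant has a second problem: "xor %2,%0,%2"
destroys "old" on the first iteration, so if the stwcx. fails, the retry
compares against a corrupted value. The extra "tmp" register is
precisely what keeps "prev" (and "old") intact across the loop.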
Peter Zijlstra April 28, 2016, 7:59 a.m. UTC | #5
On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> 
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
> 
> It works on all ppc.
> 
> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
> 
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>

Generally has the right shape; and I trust others to double check the
ppc-asm minutia.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
xinhui April 28, 2016, 10:21 a.m. UTC | #6
On 2016-04-27 22:59, Boqun Feng wrote:
> On Wed, Apr 27, 2016 at 10:50:34PM +0800, Boqun Feng wrote:
>>
>> Sorry, my bad, we can't implement cmpxchg like this.. please ignore
>> this, I should really go to bed soon...
>>
>> But still, we can save the "tmp" for xchg() I think.
>>
> 
> No.. we can't. Sorry for all the noise.
> 
> This patch looks good to me.
> 
> FWIW, you can add
> 
> Acked-by: Boqun Feng <boqun.feng@gmail.com>
> 
thanks!

> Regards,
> Boqun
>
xinhui April 28, 2016, 10:21 a.m. UTC | #7
On 2016-04-28 15:59, Peter Zijlstra wrote:
> On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
>> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
>>
>> Implement xchg{u8,u16}{local,relaxed}, and
>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>
>> It works on all ppc.
>>
>> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
>>
>> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> 
> Generally has the right shape; and I trust others to double check the
> ppc-asm minutia.
> 
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> 
thanks!
Michael Ellerman Nov. 25, 2016, 12:04 a.m. UTC | #8
On Wed, 2016-04-27 at 09:16:45 UTC, xinhui wrote:
> From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> 
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
> 
> It works on all ppc.
> 
> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
> 
> Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
> Acked-by: Boqun Feng <boqun.feng@gmail.com>
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/d0563a1297e234ed37f6b51c2e9321

cheers
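
With the generated helpers wired into the size switches, the generic
cmpxchg()/xchg() macros now accept 1- and 2-byte operands on powerpc.
A hypothetical caller (the struct and function below are made up for
illustration, not taken from the thread):

#include <linux/types.h>	/* u8, bool */
#include <linux/atomic.h>	/* pulls in asm/cmpxchg.h */

struct foo {
	u8 state;		/* 0 = free, 1 = claimed */
};

static bool foo_try_claim(struct foo *f)
{
	/* Atomically transition 0 -> 1; cmpxchg() returns the value
	 * previously stored, so the claim succeeded iff that was 0.
	 * Before this patch, a 1-byte operand fell through to the
	 * unsupported-size path of __cmpxchg() on powerpc. */
	return cmpxchg(&f->state, 0, 1) == 0;
}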
Patch

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..8a3735f 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -7,6 +7,71 @@ 
 #include <asm/asm-compat.h>
 #include <linux/bug.h>
 
+#ifdef __BIG_ENDIAN
+#define BITOFF_CAL(size, off)	((sizeof(u32) - size - off) * BITS_PER_BYTE)
+#else
+#define BITOFF_CAL(size, off)	(off * BITS_PER_BYTE)
+#endif
+
+#define XCHG_GEN(type, sfx, cl)				\
+static inline u32 __xchg_##type##sfx(void *p, u32 val)		\
+{								\
+	unsigned int prev, prev_mask, tmp, bitoff, off;		\
+								\
+	off = (unsigned long)p % sizeof(u32);			\
+	bitoff = BITOFF_CAL(sizeof(type), off);			\
+	p -= off;						\
+	val <<= bitoff;						\
+	prev_mask = (u32)(type)-1 << bitoff;			\
+								\
+	__asm__ __volatile__(					\
+"1:	lwarx   %0,0,%3\n"					\
+"	andc	%1,%0,%5\n"					\
+"	or	%1,%1,%4\n"					\
+	PPC405_ERR77(0,%3)					\
+"	stwcx.	%1,0,%3\n"					\
+"	bne-	1b\n"						\
+	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)		\
+	: "r" (p), "r" (val), "r" (prev_mask)			\
+	: "cc", cl);						\
+								\
+	return prev >> bitoff;					\
+}
+
+#define CMPXCHG_GEN(type, sfx, br, br2, cl)			\
+static inline							\
+u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new)		\
+{								\
+	unsigned int prev, prev_mask, tmp, bitoff, off;		\
+								\
+	off = (unsigned long)p % sizeof(u32);			\
+	bitoff = BITOFF_CAL(sizeof(type), off);			\
+	p -= off;						\
+	old <<= bitoff;						\
+	new <<= bitoff;						\
+	prev_mask = (u32)(type)-1 << bitoff;			\
+								\
+	__asm__ __volatile__(					\
+	br							\
+"1:	lwarx   %0,0,%3\n"					\
+"	and	%1,%0,%6\n"					\
+"	cmpw	0,%1,%4\n"					\
+"	bne-	2f\n"						\
+"	andc	%1,%0,%6\n"					\
+"	or	%1,%1,%5\n"					\
+	PPC405_ERR77(0,%3)					\
+"	stwcx.  %1,0,%3\n"					\
+"	bne-    1b\n"						\
+	br2							\
+	"\n"							\
+"2:"								\
+	: "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)		\
+	: "r" (p), "r" (old), "r" (new), "r" (prev_mask)	\
+	: "cc", cl);						\
+								\
+	return prev >> bitoff;					\
+}
+
 /*
  * Atomic exchange
  *
@@ -14,6 +79,11 @@ 
  * the previous value stored there.
  */
 
+XCHG_GEN(u8, _local, "memory");
+XCHG_GEN(u8, _relaxed, "cc");
+XCHG_GEN(u16, _local, "memory");
+XCHG_GEN(u16, _relaxed, "cc");
+
 static __always_inline unsigned long
 __xchg_u32_local(volatile void *p, unsigned long val)
 {
@@ -85,9 +155,13 @@  __xchg_u64_relaxed(u64 *p, unsigned long val)
 #endif
 
 static __always_inline unsigned long
-__xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
+__xchg_local(void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __xchg_u8_local(ptr, x);
+	case 2:
+		return __xchg_u16_local(ptr, x);
 	case 4:
 		return __xchg_u32_local(ptr, x);
 #ifdef CONFIG_PPC64
@@ -103,6 +177,10 @@  static __always_inline unsigned long
 __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __xchg_u8_relaxed(ptr, x);
+	case 2:
+		return __xchg_u16_relaxed(ptr, x);
 	case 4:
 		return __xchg_u32_relaxed(ptr, x);
 #ifdef CONFIG_PPC64
@@ -131,6 +209,15 @@  __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
  * and return the old value of *p.
  */
 
+CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
+CMPXCHG_GEN(u8, _local, , , "memory");
+CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
+CMPXCHG_GEN(u8, _relaxed, , , "cc");
+CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
+CMPXCHG_GEN(u16, _local, , , "memory");
+CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
+CMPXCHG_GEN(u16, _relaxed, , , "cc");
+
 static __always_inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
 {
@@ -312,10 +399,14 @@  __cmpxchg_u64_acquire(u64 *p, unsigned long old, unsigned long new)
 #endif
 
 static __always_inline unsigned long
-__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
+__cmpxchg(void *ptr, unsigned long old, unsigned long new,
 	  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -328,10 +419,14 @@  __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
 }
 
 static __always_inline unsigned long
-__cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
+__cmpxchg_local(void *ptr, unsigned long old, unsigned long new,
 	  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_local(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_local(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_local(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -348,6 +443,10 @@  __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
 		  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_relaxed(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_relaxed(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_relaxed(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -364,6 +463,10 @@  __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
 		  unsigned int size)
 {
 	switch (size) {
+	case 1:
+		return __cmpxchg_u8_acquire(ptr, old, new);
+	case 2:
+		return __cmpxchg_u16_acquire(ptr, old, new);
 	case 4:
 		return __cmpxchg_u32_acquire(ptr, old, new);
 #ifdef CONFIG_PPC64