[13/13] powerpc: rewrite local_t using soft_irq

Message ID 1473944523-624-14-git-send-email-maddy@linux.vnet.ibm.com (mailing list archive)
State Superseded
Headers show

Commit Message

maddy Sept. 15, 2016, 1:02 p.m. UTC
Local atomic operations are fast and highly reentrant per-CPU counters,
used for per-CPU variable updates. Local atomic operations only guarantee
variable modification atomicity with respect to the CPU which owns the
data, and they need to be executed in a preemption-safe way.
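
For illustration (not part of this patch), a typical local_t user is a
per-CPU counter that may be updated from any context on the owning CPU;
the names below are made up:

#include <linux/percpu.h>
#include <asm/local.h>

/* Illustrative per-CPU event counter */
static DEFINE_PER_CPU(local_t, nr_samples);

/* May be called from process, softirq, hardirq or PMI context */
static void record_sample(void)
{
	local_inc(this_cpu_ptr(&nr_samples));
}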

Here is the design of this patch. Since local_* operations only need to
be atomic with respect to interrupts (IIUC), we have two options: either
replay the "op" if interrupted, or replay the interrupt after the "op".
The initial patchset posted was based on implementing the local_*
operations on top of CR5, which replays the "op". That patchset had
issues rewinding the address pointer when the operand came from an
array, which made the slow path really slow. And since the CR5-based
implementation proposed using __ex_table to find the rewind address, it
raised concerns about the size of __ex_table and vmlinux.

https://lists.ozlabs.org/pipermail/linuxppc-dev/2014-December/123115.html

This patch instead uses local_irq_pmu_save() to soft-disable interrupts
(including PMIs). After finishing the "op", local_irq_pmu_restore() is
called, and any interrupts that occurred while masked are replayed.

The patch rewrites the current local_* functions on top of
arch_local_irq_disable(). The base flow for each function is:

{
	local_irq_pmu_save(flags)
	load
	..
	store
	local_irq_pmu_restore(flags)
}
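
local_irq_pmu_save()/local_irq_pmu_restore() are introduced earlier in
this series. Roughly, they set and restore the paca soft-mask so that
PMIs are masked too. A sketch of the shape (illustrative only;
soft_enabled_set_return() and the IRQ_DISABLE_MASK_* values stand in
for the helpers and constants added by earlier patches in the series,
and the real definitions in asm/hw_irq.h may differ in detail):

/*
 * Sketch only.  arch_local_irq_restore() is the existing routine that
 * unmasks and replays any interrupt (including a PMI) that arrived
 * while the mask was set.
 */
#define local_irq_pmu_save(flags)					\
	do {								\
		(flags) = soft_enabled_set_return(			\
				IRQ_DISABLE_MASK_LINUX |		\
				IRQ_DISABLE_MASK_PMU);			\
	} while (0)

#define local_irq_pmu_restore(flags)					\
	do {								\
		arch_local_irq_restore(flags);				\
	} while (0)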

The reason for this approach is that the l[w/d]arx/st[w/d]cx.
instruction pair currently used for the local_* operations is heavy
on cycle count, and the instructions do not have a local variant. To
see whether the new implementation helps, a modified version of
Rusty's local_t benchmark code was used.

https://lkml.org/lkml/2008/12/16/450

Modifications to Rusty's benchmark code:
- Executed only the local_t test (the timed loop shape is sketched below)
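
The timed loop is essentially of this shape (a paraphrase of the test,
not the exact benchmark code; 64-bit-only sketch):

#include <linux/ktime.h>
#include <linux/printk.h>
#include <asm/local.h>

static local_t bench_var;

static void bench_local_inc(unsigned long iters)
{
	unsigned long i;
	u64 start, end;

	start = ktime_get_ns();
	for (i = 0; i < iters; i++)
		local_inc(&bench_var);
	end = ktime_get_ns();

	pr_info("local_inc: %llu ns/iter\n", (end - start) / iters);
}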

Here are the measured values with and without the patch.

Time in ns per iteration:

local_t op       Without patch   With patch

_inc                  28              8
_add                  28              8
_read                  3              3
_add_return           28              7

Currently only asm/local.h has been rewritten, and the change has been
tested only on a PPC64 pseries guest and a PPC64 (LE) host.

TODO:
	- local_cmpxchg and local_xchg need modification (a possible
	  shape is sketched below).
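
For local_xchg the same masking pattern should apply; a possible shape
(sketch only, not yet part of this patch):

static __inline__ long local_xchg(local_t *l, long n)
{
	long t;
	unsigned long flags;

	local_irq_pmu_save(flags);
	t = l->a.counter;
	l->a.counter = n;
	local_irq_pmu_restore(flags);

	return t;
}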

Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/local.h | 94 ++++++++++++++++++++++++++++------------
 1 file changed, 66 insertions(+), 28 deletions(-)

Comments

kernel test robot Sept. 15, 2016, 4:55 p.m. UTC | #1
Hi Madhavan,

[auto build test ERROR on v4.8-rc6]
[cannot apply to powerpc/next kvm-ppc/kvm-ppc-next mpe/next next-20160915]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
[Suggest to use git(>=2.9.0) format-patch --base=<commit> (or --base=auto for convenience) to record what (public, well-known) commit your patch series was built on]
[Check https://git-scm.com/docs/git-format-patch for more information]

url:    https://github.com/0day-ci/linux/commits/Madhavan-Srinivasan/powerpc-paca-soft_enabled-based-local-atomic-operation-implementation/20160915-215652
config: powerpc-allnoconfig (attached as .config)
compiler: powerpc-linux-gnu-gcc (Debian 5.4.0-6) 5.4.0 20160609
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

   In file included from include/linux/perf_event.h:57:0,
                    from include/linux/trace_events.h:9,
                    from include/trace/syscall.h:6,
                    from include/linux/syscalls.h:81,
                    from init/main.c:18:
   arch/powerpc/include/asm/local.h: In function 'local_add':
>> arch/powerpc/include/asm/local.h:25:2: error: implicit declaration of function 'local_irq_pmu_save' [-Werror=implicit-function-declaration]
     local_irq_pmu_save(flags);
     ^
>> arch/powerpc/include/asm/local.h:32:2: error: implicit declaration of function 'local_irq_pmu_restore' [-Werror=implicit-function-declaration]
     local_irq_pmu_restore(flags);
     ^
   cc1: some warnings being treated as errors

vim +/local_irq_pmu_save +25 arch/powerpc/include/asm/local.h

    19	
    20	static __inline__ void local_add(long i, local_t *l)
    21	{
    22		long t;
    23		unsigned long flags;
    24	
  > 25		local_irq_pmu_save(flags);
    26		__asm__ __volatile__(
    27		PPC_LL" %0,0(%2)\n\
    28		add     %0,%1,%0\n"
    29		PPC_STL" %0,0(%2)\n"
    30		: "=&r" (t)
    31		: "r" (i), "r" (&(l->a.counter)));
  > 32		local_irq_pmu_restore(flags);
    33	}
    34	
    35	static __inline__ void local_sub(long i, local_t *l)

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
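
The failure above is from powerpc-allnoconfig, i.e. a 32-bit build
where the series' soft-masking helpers do not exist. A plausible
fallback for such configs (hypothetical, not part of this posting) is
the ordinary hard-disable pair:

/* Hypothetical !PPC64 fallback: no soft-masked PMIs there, so simply
 * hard-disable interrupts around the op.
 */
#ifndef CONFIG_PPC64
#define local_irq_pmu_save(flags)	local_irq_save(flags)
#define local_irq_pmu_restore(flags)	local_irq_restore(flags)
#endif
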
Nicholas Piggin Sept. 16, 2016, 11:01 a.m. UTC | #2
On Thu, 15 Sep 2016 18:32:03 +0530
Madhavan Srinivasan <maddy@linux.vnet.ibm.com> wrote:

> [snip commit message and earlier hunks, quoted in full above]
> 
>  static __inline__ long local_add_return(long a, local_t *l)
>  {
>  	long t;
> +	unsigned long flags;
>  
> +	local_irq_pmu_save(flags);
>  	__asm__ __volatile__(
> -"1:"	PPC_LLARX(%0,0,%2,0) "			# local_add_return\n\
> +	PPC_LL" %0,0(%2)\n\
>  	add	%0,%1,%0\n"
> -	PPC405_ERR77(0,%2)
> -	PPC_STLCX	"%0,0,%2 \n\
> -	bne-	1b"
> +	PPC_STL	"%0,0(%2)\n"
>  	: "=&r" (t)
>  	: "r" (a), "r" (&(l->a.counter))
>  	: "cc", "memory");
> +	local_irq_pmu_restore(flags);

Are all your clobbers correct? You might not be clobbering "cc" here
anymore, for example. Could you double check those? Otherwise, awesome
patch!

Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
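
To illustrate the clobber point: with the larx/stcx. loop gone, the
add/sub bodies execute only a load, an add/subf and a store, none of
which touches CR, so "cc" can likely be dropped there (while the
addic-based inc/dec paths still need "xer" for the carry bit). A sketch
of local_add_return along those lines (illustrative, not the posted
code):

static __inline__ long local_add_return(long a, local_t *l)
{
	long t;
	unsigned long flags;

	local_irq_pmu_save(flags);
	__asm__ __volatile__(
	PPC_LL" %0,0(%2)\n\
	add	%0,%1,%0\n"
	PPC_STL" %0,0(%2)\n"
	: "=&r" (t)
	: "r" (a), "r" (&(l->a.counter))
	: "memory");	/* no "cc": plain add does not set CR0 */
	local_irq_pmu_restore(flags);

	return t;
}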

Patch

diff --git a/arch/powerpc/include/asm/local.h b/arch/powerpc/include/asm/local.h
index b8da91363864..fb5728abb4e9 100644
--- a/arch/powerpc/include/asm/local.h
+++ b/arch/powerpc/include/asm/local.h
@@ -3,6 +3,9 @@ 
 
 #include <linux/percpu.h>
 #include <linux/atomic.h>
+#include <linux/irqflags.h>
+
+#include <asm/hw_irq.h>
 
 typedef struct
 {
@@ -14,24 +17,50 @@  typedef struct
 #define local_read(l)	atomic_long_read(&(l)->a)
 #define local_set(l,i)	atomic_long_set(&(l)->a, (i))
 
-#define local_add(i,l)	atomic_long_add((i),(&(l)->a))
-#define local_sub(i,l)	atomic_long_sub((i),(&(l)->a))
-#define local_inc(l)	atomic_long_inc(&(l)->a)
-#define local_dec(l)	atomic_long_dec(&(l)->a)
+static __inline__ void local_add(long i, local_t *l)
+{
+	long t;
+	unsigned long flags;
+
+	local_irq_pmu_save(flags);
+	__asm__ __volatile__(
+	PPC_LL" %0,0(%2)\n\
+	add     %0,%1,%0\n"
+	PPC_STL" %0,0(%2)\n"
+	: "=&r" (t)
+	: "r" (i), "r" (&(l->a.counter)));
+	local_irq_pmu_restore(flags);
+}
+
+static __inline__ void local_sub(long i, local_t *l)
+{
+	long t;
+	unsigned long flags;
+
+	local_irq_pmu_save(flags);
+	__asm__ __volatile__(
+	PPC_LL" %0,0(%2)\n\
+	subf    %0,%1,%0\n"
+	PPC_STL" %0,0(%2)\n"
+	: "=&r" (t)
+	: "r" (i), "r" (&(l->a.counter)));
+	local_irq_pmu_restore(flags);
+}
 
 static __inline__ long local_add_return(long a, local_t *l)
 {
 	long t;
+	unsigned long flags;
 
+	local_irq_pmu_save(flags);
 	__asm__ __volatile__(
-"1:"	PPC_LLARX(%0,0,%2,0) "			# local_add_return\n\
+	PPC_LL" %0,0(%2)\n\
 	add	%0,%1,%0\n"
-	PPC405_ERR77(0,%2)
-	PPC_STLCX	"%0,0,%2 \n\
-	bne-	1b"
+	PPC_STL	"%0,0(%2)\n"
 	: "=&r" (t)
 	: "r" (a), "r" (&(l->a.counter))
 	: "cc", "memory");
+	local_irq_pmu_restore(flags);
 
 	return t;
 }
@@ -41,16 +70,18 @@  static __inline__ long local_add_return(long a, local_t *l)
 static __inline__ long local_sub_return(long a, local_t *l)
 {
 	long t;
+	unsigned long flags;
+
+	local_irq_pmu_save(flags);
 
 	__asm__ __volatile__(
-"1:"	PPC_LLARX(%0,0,%2,0) "			# local_sub_return\n\
+"1:"	PPC_LL" %0,0(%2)\n\
 	subf	%0,%1,%0\n"
-	PPC405_ERR77(0,%2)
-	PPC_STLCX	"%0,0,%2 \n\
-	bne-	1b"
+	PPC_STL	"%0,0(%2)\n"
 	: "=&r" (t)
 	: "r" (a), "r" (&(l->a.counter))
 	: "cc", "memory");
+	local_irq_pmu_restore(flags);
 
 	return t;
 }
@@ -58,16 +89,17 @@  static __inline__ long local_sub_return(long a, local_t *l)
 static __inline__ long local_inc_return(local_t *l)
 {
 	long t;
+	unsigned long flags;
 
+	local_irq_pmu_save(flags);
 	__asm__ __volatile__(
-"1:"	PPC_LLARX(%0,0,%1,0) "			# local_inc_return\n\
+"1:"	PPC_LL" %0,0(%1)\n\
 	addic	%0,%0,1\n"
-	PPC405_ERR77(0,%1)
-	PPC_STLCX	"%0,0,%1 \n\
-	bne-	1b"
+	PPC_STL "%0,0(%1)\n"
 	: "=&r" (t)
 	: "r" (&(l->a.counter))
 	: "cc", "xer", "memory");
+	local_irq_pmu_restore(flags);
 
 	return t;
 }
@@ -85,20 +117,24 @@  static __inline__ long local_inc_return(local_t *l)
 static __inline__ long local_dec_return(local_t *l)
 {
 	long t;
+	unsigned long flags;
 
+	local_irq_pmu_save(flags);
 	__asm__ __volatile__(
-"1:"	PPC_LLARX(%0,0,%1,0) "			# local_dec_return\n\
+	PPC_LL" %0,0(%1)\n\
 	addic	%0,%0,-1\n"
-	PPC405_ERR77(0,%1)
-	PPC_STLCX	"%0,0,%1\n\
-	bne-	1b"
+	PPC_STL "%0,0(%1)\n"
 	: "=&r" (t)
 	: "r" (&(l->a.counter))
 	: "cc", "xer", "memory");
+	local_irq_pmu_restore(flags);
 
 	return t;
 }
 
+#define local_inc(l)	local_inc_return(l)
+#define local_dec(l)	local_dec_return(l)
+
 #define local_cmpxchg(l, o, n) \
 	(cmpxchg_local(&((l)->a.counter), (o), (n)))
 #define local_xchg(l, n) (xchg_local(&((l)->a.counter), (n)))
@@ -115,20 +151,21 @@  static __inline__ long local_dec_return(local_t *l)
 static __inline__ int local_add_unless(local_t *l, long a, long u)
 {
 	long t;
+	unsigned long flags;
 
+	local_irq_pmu_save(flags);
 	__asm__ __volatile__ (
-"1:"	PPC_LLARX(%0,0,%1,0) "			# local_add_unless\n\
+	PPC_LL" %0,0(%1)\n\
 	cmpw	0,%0,%3 \n\
 	beq-	2f \n\
 	add	%0,%2,%0 \n"
-	PPC405_ERR77(0,%2)
-	PPC_STLCX	"%0,0,%1 \n\
-	bne-	1b \n"
+	PPC_STL" %0,0(%1) \n"
 "	subf	%0,%2,%0 \n\
 2:"
 	: "=&r" (t)
 	: "r" (&(l->a.counter)), "r" (a), "r" (u)
 	: "cc", "memory");
+	local_irq_pmu_restore(flags);
 
 	return t != u;
 }
@@ -145,19 +182,20 @@  static __inline__ int local_add_unless(local_t *l, long a, long u)
 static __inline__ long local_dec_if_positive(local_t *l)
 {
 	long t;
+	unsigned long flags;
 
+	local_irq_pmu_save(flags);
 	__asm__ __volatile__(
-"1:"	PPC_LLARX(%0,0,%1,0) "			# local_dec_if_positive\n\
+	PPC_LL" %0,0(%1)\n\
 	cmpwi	%0,1\n\
 	addi	%0,%0,-1\n\
 	blt-	2f\n"
-	PPC405_ERR77(0,%1)
-	PPC_STLCX	"%0,0,%1\n\
-	bne-	1b"
+	PPC_STL "%0,0(%1)\n"
 	"\n\
 2:"	: "=&b" (t)
 	: "r" (&(l->a.counter))
 	: "cc", "memory");
+	local_irq_pmu_restore(flags);
 
 	return t;
 }