Patchwork [v2,2/3] powerpc: use the jump label for cpu_has_feature

login
register
mail settings
Submitter Kevin Hao
Date Sept. 2, 2013, 5:45 a.m.
Message ID <1378100726-32545-3-git-send-email-haokexin@gmail.com>
Download mbox | patch
Permalink /patch/271674/
State Under Review
Delegated to: Benjamin Herrenschmidt
Headers show

Comments

Kevin Hao - Sept. 2, 2013, 5:45 a.m.
The cpu features are fixed once the probe of cpu features are done.
And the function cpu_has_feature() does be used in some hot path.
The checking of the cpu features for each time of invoking of
cpu_has_feature() seems suboptimal. This tries to reduce this
overhead of this check by using jump label. But we can only use
the jump label for this check only after the execution of
jump_label_init(), so we introduce another jump label to
still do the feature check by default before all the cpu
feature jump labels are initialized.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
---
v2: Include the jump_label.h instead of jump_label_base.h.

 arch/powerpc/include/asm/cpufeatures.h | 27 +++++++++++++++++++++++++++
 arch/powerpc/kernel/cputable.c         | 23 +++++++++++++++++++++++
 2 files changed, 50 insertions(+)
Benjamin Herrenschmidt - Nov. 27, 2013, 2:45 a.m.
On Mon, 2013-09-02 at 13:45 +0800, Kevin Hao wrote:
> The cpu features are fixed once the probe of cpu features are done.
> And the function cpu_has_feature() does be used in some hot path.
> The checking of the cpu features for each time of invoking of
> cpu_has_feature() seems suboptimal. This tries to reduce this
> overhead of this check by using jump label. But we can only use
> the jump label for this check only after the execution of
> jump_label_init(), so we introduce another jump label to
> still do the feature check by default before all the cpu
> feature jump labels are initialized.

So I was looking at these and ...

> +static inline int cpu_has_feature(unsigned long feature)
> +{
> +	if (CPU_FTRS_ALWAYS & feature)
> +		return 1;
> +
> +	if (!(CPU_FTRS_POSSIBLE | feature))
> +		return 0;
> +
> +	if (static_key_false(&cpu_feat_keys_enabled)) {
> +		int i = __builtin_ctzl(feature);
> +
> +		return static_key_false(&cpu_feat_keys[i]);
> +	} else
> +		return !!(cur_cpu_spec->cpu_features & feature);
> +}

This is gross :-)

Have you checked the generated code ? I'm worried that we end up hitting
at least 2 branches every time, which might be enough to defeat the
purposes even if they are unconditional in term of performance and
code size...

Cheers,
Ben.

> +#else
>  static inline int cpu_has_feature(unsigned long feature)
>  {
>  	return (CPU_FTRS_ALWAYS & feature) ||
> @@ -10,5 +36,6 @@ static inline int cpu_has_feature(unsigned long feature)
>  		& cur_cpu_spec->cpu_features
>  		& feature);
>  }
> +#endif
>  
>  #endif /* __ASM_POWERPC_CPUFEATURE_H */
> diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
> index 597d954..50bd216 100644
> --- a/arch/powerpc/kernel/cputable.c
> +++ b/arch/powerpc/kernel/cputable.c
> @@ -21,6 +21,7 @@
>  #include <asm/prom.h>		/* for PTRRELOC on ARCH=ppc */
>  #include <asm/mmu.h>
>  #include <asm/setup.h>
> +#include <asm/cpufeatures.h>
>  
>  struct cpu_spec* cur_cpu_spec = NULL;
>  EXPORT_SYMBOL(cur_cpu_spec);
> @@ -2258,3 +2259,25 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
>  
>  	return NULL;
>  }
> +
> +#ifdef CONFIG_JUMP_LABEL
> +struct static_key cpu_feat_keys[MAX_CPU_FEATURES];
> +struct static_key cpu_feat_keys_enabled;
> +
> +static __init int cpu_feat_keys_init(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < MAX_CPU_FEATURES; i++) {
> +		unsigned long f = 1 << i;
> +
> +		if (cur_cpu_spec->cpu_features & f)
> +			static_key_slow_inc(&cpu_feat_keys[i]);
> +	}
> +
> +	static_key_slow_inc(&cpu_feat_keys_enabled);
> +
> +	return 0;
> +}
> +early_initcall(cpu_feat_keys_init);
> +#endif
Kevin Hao - Dec. 25, 2013, 5:58 a.m.
On Wed, Nov 27, 2013 at 01:45:41PM +1100, Benjamin Herrenschmidt wrote:
> On Mon, 2013-09-02 at 13:45 +0800, Kevin Hao wrote:
> > The cpu features are fixed once the probe of cpu features are done.
> > And the function cpu_has_feature() does be used in some hot path.
> > The checking of the cpu features for each time of invoking of
> > cpu_has_feature() seems suboptimal. This tries to reduce this
> > overhead of this check by using jump label. But we can only use
> > the jump label for this check only after the execution of
> > jump_label_init(), so we introduce another jump label to
> > still do the feature check by default before all the cpu
> > feature jump labels are initialized.
> 
> So I was looking at these and ...

Sorry for the delayed response.

> 
> > +static inline int cpu_has_feature(unsigned long feature)
> > +{
> > +	if (CPU_FTRS_ALWAYS & feature)
> > +		return 1;
> > +
> > +	if (!(CPU_FTRS_POSSIBLE | feature))
> > +		return 0;
> > +
> > +	if (static_key_false(&cpu_feat_keys_enabled)) {
> > +		int i = __builtin_ctzl(feature);
> > +
> > +		return static_key_false(&cpu_feat_keys[i]);
> > +	} else
> > +		return !!(cur_cpu_spec->cpu_features & feature);
> > +}
> 
> This is gross :-)
> 
> Have you checked the generated code ? I'm worried that we end up hitting
> at least 2 branches every time,

No, we would get 2 unconditional branches at worst. The following is the
disassemble of part code in switch_mm() when jump label is enabled.

   68         /* We must stop all altivec streams before changing the HW
   69          * context
   70          */
   71 #ifdef CONFIG_ALTIVEC
   72         if (cpu_has_feature(CPU_FTR_ALTIVEC))
   73                 asm volatile ("dssall");
   74 #endif /* CONFIG_ALTIVEC */
  
  c0000000005c42f4:       60 00 00 00     nop
  c0000000005c42f8:       3d 02 00 01     addis   r8,r2,1
  c0000000005c42fc:       39 28 f6 b8     addi    r9,r8,-2376
  c0000000005c4300:       e9 29 00 00     ld      r9,0(r9)
  c0000000005c4304:       e9 29 00 10     ld      r9,16(r9)
  c0000000005c4308:       79 2a ef e3     rldicl. r10,r9,61,63
  c0000000005c430c:       41 82 00 08     beq     c0000000005c4314 <.__schedule+0x27c>
  c0000000005c4310:       7e 00 06 6c     dssall
  c0000000005c4314:       7f 43 d3 78     mr      r3,r26
  c0000000005c4318:       7f a4 eb 78     mr      r4,r29
  c0000000005c431c:       4b a5 ff 71     bl      c00000000002428c <.switch_mmu_context>
  c0000000005c4320:       60 00 00 00     nop
  ....
  c0000000005c4400:       60 00 00 00     nop
  c0000000005c4404:       7f 43 d3 78     mr      r3,r26
  c0000000005c4408:       7f a4 eb 78     mr      r4,r29
  c0000000005c440c:       4b a5 fe 81     bl      c00000000002428c <.switch_mmu_context>
  c0000000005c4410:       60 00 00 00     nop


On a p5020 board which doesn't support altivec, the code would change
to the following after jump label init.
  c0000000005c42f4: b c0000000005c4400

The final instruction sequence should be just a branch.

On a t4240 board which does have altivec support, the code would change to:

  c0000000005c42f4: b c0000000005c4400
  ...
  c0000000005c4400: b c0000000005c4310

The final instruction sequence should be two unconditional branches.


The following is the disassemble code when jump label is disabled.
  c0000000005c26fc:       60 00 00 00     nop
  c0000000005c2700:       3d 02 00 01     addis   r8,r2,1
  c0000000005c2704:       39 28 24 b8     addi    r9,r8,9400
  c0000000005c2708:       e9 29 00 00     ld      r9,0(r9)
  c0000000005c270c:       e9 29 00 10     ld      r9,16(r9)
  c0000000005c2710:       79 2a ef e3     rldicl. r10,r9,61,63
  c0000000005c2714:       40 82 00 d4     bne     c0000000005c27e8 <.__schedule+0x360>
  c0000000005c2718:       7f 43 d3 78     mr      r3,r26
  c0000000005c271c:       7f a4 eb 78     mr      r4,r29
  c0000000005c2720:       4b a6 0a 75     bl      c000000000023194 <.switch_mmu_context>
  c0000000005c2724:       60 00 00 00     nop
  ....
  c0000000005c27e8:       7e 00 06 6c     dssall
  c0000000005c27ec:       4b ff ff 2c     b       c0000000005c2718 <.__schedule+0x290>
  c0000000005c27f0:       e9 3e 00 00     ld      r9,0(r30)
  c0000000005c27f4:       71 2a 00 81     andi.   r10,r9,129
  c0000000005c27f8:       40 82 00 94     bne     c0000000005c288c <.__schedule+0x404>


The final instruction sequence is following:
  addis   r8,r2,1
  addi    r9,r8,9400
  ld      r9,0(r9)
  ld      r9,16(r9)
  rldicl. r10,r9,61,63
  bne     c0000000005c27e8


> which might be enough to defeat the
> purposes even if they are unconditional in term of performance and
> code size...

It does result in an increase in the code size due to the enable of jump label.

before:
  .text		.data	 
  005c4ff4	0005ede8

after:
  .text		.data
  005c6c04	0005fe68	

Thanks,
Kevin
> 
> Cheers,
> Ben.
> 
> > +#else
> >  static inline int cpu_has_feature(unsigned long feature)
> >  {
> >  	return (CPU_FTRS_ALWAYS & feature) ||
> > @@ -10,5 +36,6 @@ static inline int cpu_has_feature(unsigned long feature)
> >  		& cur_cpu_spec->cpu_features
> >  		& feature);
> >  }
> > +#endif
> >  
> >  #endif /* __ASM_POWERPC_CPUFEATURE_H */
> > diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
> > index 597d954..50bd216 100644
> > --- a/arch/powerpc/kernel/cputable.c
> > +++ b/arch/powerpc/kernel/cputable.c
> > @@ -21,6 +21,7 @@
> >  #include <asm/prom.h>		/* for PTRRELOC on ARCH=ppc */
> >  #include <asm/mmu.h>
> >  #include <asm/setup.h>
> > +#include <asm/cpufeatures.h>
> >  
> >  struct cpu_spec* cur_cpu_spec = NULL;
> >  EXPORT_SYMBOL(cur_cpu_spec);
> > @@ -2258,3 +2259,25 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
> >  
> >  	return NULL;
> >  }
> > +
> > +#ifdef CONFIG_JUMP_LABEL
> > +struct static_key cpu_feat_keys[MAX_CPU_FEATURES];
> > +struct static_key cpu_feat_keys_enabled;
> > +
> > +static __init int cpu_eat_keys_init(void)
> > +{
> > +	int i;
> > +
> > +	for (i = 0; i < MAX_CPU_FEATURES; i++) {
> > +		unsigned long f = 1 << i;
> > +
> > +		if (cur_cpu_spec->cpu_features & f)
> > +			static_key_slow_inc(&cpu_feat_keys[i]);
> > +	}
> > +
> > +	static_key_slow_inc(&cpu_feat_keys_enabled);
> > +
> > +	return 0;
> > +}
> > +early_initcall(cpu_feat_keys_init);
> > +#endif
> 
>

Patch

diff --git a/arch/powerpc/include/asm/cpufeatures.h b/arch/powerpc/include/asm/cpufeatures.h
index 37650db..2631287 100644
--- a/arch/powerpc/include/asm/cpufeatures.h
+++ b/arch/powerpc/include/asm/cpufeatures.h
@@ -2,7 +2,33 @@ 
 #define __ASM_POWERPC_CPUFEATURES_H
 
 #include <asm/cputable.h>
+#ifdef CONFIG_JUMP_LABEL
+#include <linux/jump_label.h>
 
+#ifdef __powerpc64__
+#define MAX_CPU_FEATURES	64
+#else
+#define MAX_CPU_FEATURES	32
+#endif
+extern struct static_key cpu_feat_keys[MAX_CPU_FEATURES];
+extern struct static_key cpu_feat_keys_enabled;
+
+static inline int cpu_has_feature(unsigned long feature)
+{
+	if (CPU_FTRS_ALWAYS & feature)
+		return 1;
+
+	if (!(CPU_FTRS_POSSIBLE | feature))
+		return 0;
+
+	if (static_key_false(&cpu_feat_keys_enabled)) {
+		int i = __builtin_ctzl(feature);
+
+		return static_key_false(&cpu_feat_keys[i]);
+	} else
+		return !!(cur_cpu_spec->cpu_features & feature);
+}
+#else
 static inline int cpu_has_feature(unsigned long feature)
 {
 	return (CPU_FTRS_ALWAYS & feature) ||
@@ -10,5 +36,6 @@  static inline int cpu_has_feature(unsigned long feature)
 		& cur_cpu_spec->cpu_features
 		& feature);
 }
+#endif
 
 #endif /* __ASM_POWERPC_CPUFEATURE_H */
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 597d954..50bd216 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -21,6 +21,7 @@ 
 #include <asm/prom.h>		/* for PTRRELOC on ARCH=ppc */
 #include <asm/mmu.h>
 #include <asm/setup.h>
+#include <asm/cpufeatures.h>
 
 struct cpu_spec* cur_cpu_spec = NULL;
 EXPORT_SYMBOL(cur_cpu_spec);
@@ -2258,3 +2259,25 @@  struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr)
 
 	return NULL;
 }
+
+#ifdef CONFIG_JUMP_LABEL
+struct static_key cpu_feat_keys[MAX_CPU_FEATURES];
+struct static_key cpu_feat_keys_enabled;
+
+static __init int cpu_feat_keys_init(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_CPU_FEATURES; i++) {
+		unsigned long f = 1 << i;
+
+		if (cur_cpu_spec->cpu_features & f)
+			static_key_slow_inc(&cpu_feat_keys[i]);
+	}
+
+	static_key_slow_inc(&cpu_feat_keys_enabled);
+
+	return 0;
+}
+early_initcall(cpu_feat_keys_init);
+#endif