[v2,2/2] powerpc/32be: use stmw/lmw for registers save/restore in asm

Message ID: a94e38e1dbe0c38ce5b363fac29ab6e65fd9985a.1523984745.git.christophe.leroy@c-s.fr (mailing list archive)
State: Accepted
Commit: a1f3ae3fe8a1883c339f1bc89d1c941b3809e084
Series: [v2,1/2] powerpc: avoid an unnecessary test and branch in longjmp()

Commit Message

Christophe Leroy April 17, 2018, 5:08 p.m. UTC
arch/powerpc/Makefile activates -mmultiple on BE PPC32 configs
in order to use multiple-word instructions in function entry/exit.

The patch does the same for the asm parts, for consistency.

On processors like the 8xx, on which insn fetching is pretty slow,
this speeds up register save/restore.
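
As an illustration (not part of the patch): stmw rN stores rN through r31
to consecutive words, so a single instruction can replace a whole run of
stw. For example, assuming r3 points at the save area:

	/* three single stores... */
	stw	r29, 0(r3)
	stw	r30, 4(r3)
	stw	r31, 8(r3)

	/* ...or the equivalent single store-multiple */
	stmw	r29, 0(r3)

For the 19 non-volatile GPRs (r13-r31) that is 76 bytes of code versus 4.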

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 v2: Swapped both patches in the series to reduce the number of impacted
     lines, and added the same modification in ppc_save_regs()

 arch/powerpc/include/asm/ppc_asm.h  |  5 +++++
 arch/powerpc/kernel/misc.S          | 10 ++++++++++
 arch/powerpc/kernel/ppc_save_regs.S |  4 ++++
 3 files changed, 19 insertions(+)

Comments

Michael Ellerman May 17, 2018, 12:10 p.m. UTC | #1
Christophe Leroy <christophe.leroy@c-s.fr> writes:
> arch/powerpc/Makefile activates -mmultiple on BE PPC32 configs
> in order to use multiple-word instructions in function entry/exit.

True, though that could be a lot simpler because the MULTIPLEWORD value
is only used for PPC32, which is always big endian. I'll send a patch
for that.
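
For reference, the logic being referred to is roughly the following (a
paraphrase of arch/powerpc/Makefile, not an exact quote):

	ifdef CONFIG_CPU_LITTLE_ENDIAN
	MULTIPLEWORD	:= -mno-multiple
	else
	MULTIPLEWORD	:= -mmultiple
	endif

	# MULTIPLEWORD is only consumed by the 32-bit (always BE) flags
	CFLAGS-$(CONFIG_PPC32)	+= $(MULTIPLEWORD)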

> The patch does the same for the asm parts, for consistency.
>
> On processors like the 8xx, on which insn fetching is pretty slow,
> this speeds up register save/restore.

OK. I've always heard that they should be avoided, but that's coming
from 64-bit land.

I guess we've been enabling this for all 32-bit targets for ever so it
must be a reasonable option.

> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> ---
>  v2: Swapped both patches in the series to reduce the number of impacted
>      lines, and added the same modification in ppc_save_regs()
>
>  arch/powerpc/include/asm/ppc_asm.h  |  5 +++++
>  arch/powerpc/kernel/misc.S          | 10 ++++++++++
>  arch/powerpc/kernel/ppc_save_regs.S |  4 ++++
>  3 files changed, 19 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
> index 13f7f4c0e1ea..4bb765d0b758 100644
> --- a/arch/powerpc/include/asm/ppc_asm.h
> +++ b/arch/powerpc/include/asm/ppc_asm.h
> @@ -80,11 +80,16 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
>  #else
>  #define SAVE_GPR(n, base)	stw	n,GPR0+4*(n)(base)
>  #define REST_GPR(n, base)	lwz	n,GPR0+4*(n)(base)
> +#ifdef CONFIG_CPU_BIG_ENDIAN
> +#define SAVE_NVGPRS(base)	stmw	13, GPR0+4*13(base)
> +#define REST_NVGPRS(base)	lmw	13, GPR0+4*13(base)
> +#else
>  #define SAVE_NVGPRS(base)	SAVE_GPR(13, base); SAVE_8GPRS(14, base); \
>  				SAVE_10GPRS(22, base)
>  #define REST_NVGPRS(base)	REST_GPR(13, base); REST_8GPRS(14, base); \
>  				REST_10GPRS(22, base)

There is no 32-bit little endian, so this is basically dead code now.

Maybe there'll be a 32-bit LE port one day, but if so we can put the
code back then.

So I'll just drop the else case.
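
The 32-bit block would then reduce to just (a sketch of that follow-up
cleanup, not the committed result):

 #define SAVE_GPR(n, base)	stw	n,GPR0+4*(n)(base)
 #define REST_GPR(n, base)	lwz	n,GPR0+4*(n)(base)
 #define SAVE_NVGPRS(base)	stmw	13, GPR0+4*13(base)
 #define REST_NVGPRS(base)	lmw	13, GPR0+4*13(base)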

>  #endif
> +#endif
>  
>  #define SAVE_2GPRS(n, base)	SAVE_GPR(n, base); SAVE_GPR(n+1, base)
>  #define SAVE_4GPRS(n, base)	SAVE_2GPRS(n, base); SAVE_2GPRS(n+2, base)
> diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
> index 746ee0320ad4..a316d90a5c26 100644
> --- a/arch/powerpc/kernel/misc.S
> +++ b/arch/powerpc/kernel/misc.S
> @@ -49,6 +49,10 @@ _GLOBAL(setjmp)
>  	PPC_STL	r0,0(r3)
>  	PPC_STL	r1,SZL(r3)
>  	PPC_STL	r2,2*SZL(r3)
> +#if defined(CONFIG_PPC32) && defined(CONFIG_CPU_BIG_ENDIAN)

And this could just be:

#ifdef CONFIG_PPC32

> +	mfcr	r12
> +	stmw	r12, 3*SZL(r3)
> +#else
>  	mfcr	r0
>  	PPC_STL	r0,3*SZL(r3)
>  	PPC_STL	r13,4*SZL(r3)
> @@ -70,10 +74,15 @@ _GLOBAL(setjmp)
>  	PPC_STL	r29,20*SZL(r3)
>  	PPC_STL	r30,21*SZL(r3)
>  	PPC_STL	r31,22*SZL(r3)
> +#endif

It's a pity to end up with this basically split in half by ifdefs for
32/64-bit, but maybe we can clean that up later.

cheers
Segher Boessenkool May 17, 2018, 1:15 p.m. UTC | #2
On Thu, May 17, 2018 at 10:10:21PM +1000, Michael Ellerman wrote:
> Christophe Leroy <christophe.leroy@c-s.fr> writes:
> > arch/powerpc/Makefile activates -mmultiple on BE PPC32 configs
> > in order to use multiple-word instructions in function entry/exit.
> 
> True, though that could be a lot simpler because the MULTIPLEWORD value
> is only used for PPC32, which is always big endian. I'll send a patch
> for that.

Do you mean in the kernel?  Many 32-bit processors can do LE, and many
do not implement multiple or string insns in LE mode.

> > The patch does the same for the asm parts, for consistency.
> >
> > On processors like the 8xx, on which insn fetching is pretty slow,
> > this speeds up register save/restore.
> 
> OK. I've always heard that they should be avoided, but that's coming
> from 64-bit land.
> 
> I guess we've been enabling this for all 32-bit targets for ever so it
> must be a reasonable option.

On 603, load multiple (and string) are one cycle slower than doing all the
loads separately, and store is essentially the same as separate stores.
On 7xx and 7xxx both loads and stores are one cycle slower as multiple
than as separate insns.

Load/store multiple are nice for saving/restoring registers.


Segher
Christophe Leroy May 17, 2018, 1:27 p.m. UTC | #3
On 17/05/2018 at 15:15, Segher Boessenkool wrote:
> On Thu, May 17, 2018 at 10:10:21PM +1000, Michael Ellerman wrote:
>> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>>> arch/powerpc/Makefile activates -mmultiple on BE PPC32 configs
>>> in order to use multiple-word instructions in function entry/exit.
>>
>> True, though that could be a lot simpler because the MULTIPLEWORD value
>> is only used for PPC32, which is always big endian. I'll send a patch
>> for that.
> 
> Do you mean in the kernel?  Many 32-bit processors can do LE, and many
> do not implement multiple or string insns in LE mode.
> 
>>> The patch does the same for the asm parts, for consistency.
>>>
>>> On processors like the 8xx, on which insn fetching is pretty slow,
>>> this speeds up register save/restore.
>>
>> OK. I've always heard that they should be avoided, but that's coming
>> from 64-bit land.
>>
>> I guess we've been enabling this for all 32-bit targets for ever so it
>> must be a reasonable option.
> 
> On 603, load multiple (and string) are one cycle slower than doing all the
> loads separately, and store is essentially the same as separate stores.
> On 7xx and 7xxx both loads and stores are one cycle slower as multiple
> than as separate insns.

That's in theory when the instructions are already in the cache.

But loading several instructions into the cache takes time.
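
Concretely: saving r13-r31 takes 19 stw (76 bytes of instruction fetch)
against one 4-byte stmw; that is several instruction cache lines versus
part of one, assuming the 16-byte line size of 8xx-class cores (a figure
from the CPU manuals, not from this thread).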

Christophe

> 
> Load/store multiple are nice for saving/restoring registers.
> 
> 
> Segher
>
Benjamin Herrenschmidt May 17, 2018, 1:39 p.m. UTC | #4
On Thu, 2018-05-17 at 22:10 +1000, Michael Ellerman wrote:
> Christophe Leroy <christophe.leroy@c-s.fr> writes:
> > arch/powerpc/Makefile activates -mmultiple on BE PPC32 configs
> > in order to use multiple-word instructions in function entry/exit.
> 
> True, though that could be a lot simpler because the MULTIPLEWORD value
> is only used for PPC32, which is always big endian. I'll send a patch
> for that.

There have been known cases of 4xx LE ports, though none ever made it
upstream ...

> > The patch does the same for the asm parts, for consistency.
> > 
> > On processors like the 8xx, on which insn fetching is pretty slow,
> > this speeds up register save/restore.
> 
> OK. I've always heard that they should be avoided, but that's coming
> from 64-bit land.
> 
> I guess we've been enabling this for all 32-bit targets for ever so it
> must be a reasonable option.
> 
> > Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> > ---
> >  v2: Swapped both patches in the series to reduce the number of impacted
> >      lines, and added the same modification in ppc_save_regs()
> > 
> >  arch/powerpc/include/asm/ppc_asm.h  |  5 +++++
> >  arch/powerpc/kernel/misc.S          | 10 ++++++++++
> >  arch/powerpc/kernel/ppc_save_regs.S |  4 ++++
> >  3 files changed, 19 insertions(+)
> > 
> > diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
> > index 13f7f4c0e1ea..4bb765d0b758 100644
> > --- a/arch/powerpc/include/asm/ppc_asm.h
> > +++ b/arch/powerpc/include/asm/ppc_asm.h
> > @@ -80,11 +80,16 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
> >  #else
> >  #define SAVE_GPR(n, base)	stw	n,GPR0+4*(n)(base)
> >  #define REST_GPR(n, base)	lwz	n,GPR0+4*(n)(base)
> > +#ifdef CONFIG_CPU_BIG_ENDIAN
> > +#define SAVE_NVGPRS(base)	stmw	13, GPR0+4*13(base)
> > +#define REST_NVGPRS(base)	lmw	13, GPR0+4*13(base)
> > +#else
> >  #define SAVE_NVGPRS(base)	SAVE_GPR(13, base); SAVE_8GPRS(14, base); \
> >  				SAVE_10GPRS(22, base)
> >  #define REST_NVGPRS(base)	REST_GPR(13, base); REST_8GPRS(14, base); \
> >  				REST_10GPRS(22, base)
> 
> There is no 32-bit little endian, so this is basically dead code now.
> 
> Maybe there'll be a 32-bit LE port one day, but if so we can put the
> code back then.
> 
> So I'll just drop the else case.
> 
> >  #endif
> > +#endif
> >  
> >  #define SAVE_2GPRS(n, base)	SAVE_GPR(n, base); SAVE_GPR(n+1, base)
> >  #define SAVE_4GPRS(n, base)	SAVE_2GPRS(n, base); SAVE_2GPRS(n+2, base)
> > diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
> > index 746ee0320ad4..a316d90a5c26 100644
> > --- a/arch/powerpc/kernel/misc.S
> > +++ b/arch/powerpc/kernel/misc.S
> > @@ -49,6 +49,10 @@ _GLOBAL(setjmp)
> >  	PPC_STL	r0,0(r3)
> >  	PPC_STL	r1,SZL(r3)
> >  	PPC_STL	r2,2*SZL(r3)
> > +#if defined(CONFIG_PPC32) && defined(CONFIG_CPU_BIG_ENDIAN)
> 
> And this could just be:
> 
> #ifdef CONFIG_PPC32
> 
> > +	mfcr	r12
> > +	stmw	r12, 3*SZL(r3)
> > +#else
> >  	mfcr	r0
> >  	PPC_STL	r0,3*SZL(r3)
> >  	PPC_STL	r13,4*SZL(r3)
> > @@ -70,10 +74,15 @@ _GLOBAL(setjmp)
> >  	PPC_STL	r29,20*SZL(r3)
> >  	PPC_STL	r30,21*SZL(r3)
> >  	PPC_STL	r31,22*SZL(r3)
> > +#endif
> 
> It's a pity to end up with this basically split in half by ifdefs for
> 32/64-bit, but maybe we can clean that up later.
> 
> cheers
Segher Boessenkool May 17, 2018, 2:27 p.m. UTC | #5
On Thu, May 17, 2018 at 03:27:37PM +0200, Christophe LEROY wrote:
> On 17/05/2018 at 15:15, Segher Boessenkool wrote:
> >>I guess we've been enabling this for all 32-bit targets for ever so it
> >>must be a reasonable option.
> >
> >On 603, load multiple (and string) are one cycle slower than doing all the
> >loads separately, and store is essentially the same as separate stores.
> >On 7xx and 7xxx both loads and stores are one cycle slower as multiple
> >than as separate insns.
> 
> That's in theory when the instructions are already in the cache.
> 
> But loading several instructions into the cache takes time.

Yes, of course, that's why I wrote:

> >Load/store multiple are nice for saving/restoring registers.

:-)


Segher

Patch

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 13f7f4c0e1ea..4bb765d0b758 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -80,11 +80,16 @@  END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 #else
 #define SAVE_GPR(n, base)	stw	n,GPR0+4*(n)(base)
 #define REST_GPR(n, base)	lwz	n,GPR0+4*(n)(base)
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define SAVE_NVGPRS(base)	stmw	13, GPR0+4*13(base)
+#define REST_NVGPRS(base)	lmw	13, GPR0+4*13(base)
+#else
 #define SAVE_NVGPRS(base)	SAVE_GPR(13, base); SAVE_8GPRS(14, base); \
 				SAVE_10GPRS(22, base)
 #define REST_NVGPRS(base)	REST_GPR(13, base); REST_8GPRS(14, base); \
 				REST_10GPRS(22, base)
 #endif
+#endif
 
 #define SAVE_2GPRS(n, base)	SAVE_GPR(n, base); SAVE_GPR(n+1, base)
 #define SAVE_4GPRS(n, base)	SAVE_2GPRS(n, base); SAVE_2GPRS(n+2, base)
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index 746ee0320ad4..a316d90a5c26 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -49,6 +49,10 @@  _GLOBAL(setjmp)
 	PPC_STL	r0,0(r3)
 	PPC_STL	r1,SZL(r3)
 	PPC_STL	r2,2*SZL(r3)
+#if defined(CONFIG_PPC32) && defined(CONFIG_CPU_BIG_ENDIAN)
+	mfcr	r12
+	stmw	r12, 3*SZL(r3)
+#else
 	mfcr	r0
 	PPC_STL	r0,3*SZL(r3)
 	PPC_STL	r13,4*SZL(r3)
@@ -70,10 +74,15 @@  _GLOBAL(setjmp)
 	PPC_STL	r29,20*SZL(r3)
 	PPC_STL	r30,21*SZL(r3)
 	PPC_STL	r31,22*SZL(r3)
+#endif
 	li	r3,0
 	blr
 
 _GLOBAL(longjmp)
+#if defined(CONFIG_PPC32) && defined(CONFIG_CPU_BIG_ENDIAN)
+	lmw	r12, 3*SZL(r3)
+	mtcrf	0x38, r12
+#else
 	PPC_LL	r13,4*SZL(r3)
 	PPC_LL	r14,5*SZL(r3)
 	PPC_LL	r15,6*SZL(r3)
@@ -95,6 +104,7 @@  _GLOBAL(longjmp)
 	PPC_LL	r31,22*SZL(r3)
 	PPC_LL	r0,3*SZL(r3)
 	mtcrf	0x38,r0
+#endif
 	PPC_LL	r0,0(r3)
 	PPC_LL	r1,SZL(r3)
 	PPC_LL	r2,2*SZL(r3)
diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S
index 1b1787d52896..d60316e70514 100644
--- a/arch/powerpc/kernel/ppc_save_regs.S
+++ b/arch/powerpc/kernel/ppc_save_regs.S
@@ -25,6 +25,9 @@ 
  */
 _GLOBAL(ppc_save_regs)
 	PPC_STL	r0,0*SZL(r3)
+#if defined(CONFIG_PPC32) && defined(CONFIG_CPU_BIG_ENDIAN)
+	stmw	r2, 2*SZL(r3)
+#else
 	PPC_STL	r2,2*SZL(r3)
 	PPC_STL	r3,3*SZL(r3)
 	PPC_STL	r4,4*SZL(r3)
@@ -55,6 +58,7 @@  _GLOBAL(ppc_save_regs)
 	PPC_STL	r29,29*SZL(r3)
 	PPC_STL	r30,30*SZL(r3)
 	PPC_STL	r31,31*SZL(r3)
+#endif
 	/* go up one stack frame for SP */
 	PPC_LL	r4,0(r1)
 	PPC_STL	r4,1*SZL(r3)
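
A note on the trick in the setjmp/longjmp hunks above: the jmp_buf layout
already places CR at slot 3 and r13-r31 at slots 4-22, directly after it,
so moving CR through r12 (the register just below r13) instead of r0 lets
one stmw/lmw cover CR and all the non-volatile GPRs at once. Annotated
(comments added here, not present in the patch):

	mfcr	r12		/* CR into r12, the slot just below r13 */
	stmw	r12, 3*SZL(r3)	/* store r12-r31: CR, then r13-r31 */

	lmw	r12, 3*SZL(r3)	/* reload r12-r31 in one go */
	mtcrf	0x38, r12	/* 0x38 selects CR2-CR4, the non-volatile fields */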