diff mbox

[3/7,RFC] add support for BlueGene/P FPU

Message ID 1305753895-24845-3-git-send-email-ericvh@gmail.com (mailing list archive)
State Changes Requested
Headers show

Commit Message

Eric Van Hensbergen May 18, 2011, 9:24 p.m. UTC
This patch adds save/restore register support for the BlueGene/P
double hummer FPU.

Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 arch/powerpc/include/asm/ppc_asm.h |   39 ++++++++++++++++++++++++-----------
 arch/powerpc/kernel/fpu.S          |    8 +++---
 arch/powerpc/platforms/44x/Kconfig |    9 ++++++++
 3 files changed, 40 insertions(+), 16 deletions(-)

Comments

Michael Neuling May 19, 2011, 5:58 a.m. UTC | #1
Eric,

> This patch adds save/restore register support for the BlueGene/P
> double hummer FPU.

What does this mean?  Needs more details here.

> Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
> ---
>  arch/powerpc/include/asm/ppc_asm.h |   39 ++++++++++++++++++++++++----------
-
>  arch/powerpc/kernel/fpu.S          |    8 +++---
>  arch/powerpc/platforms/44x/Kconfig |    9 ++++++++
>  3 files changed, 40 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/pp
c_asm.h
> index 9821006..daa22bb 100644
> --- a/arch/powerpc/include/asm/ppc_asm.h
> +++ b/arch/powerpc/include/asm/ppc_asm.h
> @@ -88,6 +88,13 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
>  				REST_10GPRS(22, base)
>  #endif
>  
> +#ifdef CONFIG_BGP
> +#define LFPDX(frt, ra, rb)	.long (31<<26)|((frt)<<21)|((ra)<<16)| \
> +							((rb)<<11)|(462<<1)
> +#define STFPDX(frt, ra, rb)	.long (31<<26)|((frt)<<21)|((ra)<<16)| \
> +							((rb)<<11)|(974<<1)
> +#endif /* CONFIG_BGP */

Put these in arch/powerpc/include/asm/ppc-opcode.h and reformat to fit
whats there already.  

Also, don't need to put these defines inside a #ifdef.

> +
>  #define SAVE_2GPRS(n, base)	SAVE_GPR(n, base); SAVE_GPR(n+1, base)
>  #define SAVE_4GPRS(n, base)	SAVE_2GPRS(n, base); SAVE_2GPRS(n+2, base)
>  #define SAVE_8GPRS(n, base)	SAVE_4GPRS(n, base); SAVE_4GPRS(n+4, base)
> @@ -97,18 +104,26 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
>  #define REST_8GPRS(n, base)	REST_4GPRS(n, base); REST_4GPRS(n+4, base)
>  #define REST_10GPRS(n, base)	REST_8GPRS(n, base); REST_2GPRS(n+8, base)
>  
> -#define SAVE_FPR(n, base)	stfd	n,THREAD_FPR0+8*TS_FPRWIDTH*(n)(base)
> -#define SAVE_2FPRS(n, base)	SAVE_FPR(n, base); SAVE_FPR(n+1, base)
> -#define SAVE_4FPRS(n, base)	SAVE_2FPRS(n, base); SAVE_2FPRS(n+2, base)
> -#define SAVE_8FPRS(n, base)	SAVE_4FPRS(n, base); SAVE_4FPRS(n+4, base)
> -#define SAVE_16FPRS(n, base)	SAVE_8FPRS(n, base); SAVE_8FPRS(n+8, base)
> -#define SAVE_32FPRS(n, base)	SAVE_16FPRS(n, base); SAVE_16FPRS(n+16, base)
> -#define REST_FPR(n, base)	lfd	n,THREAD_FPR0+8*TS_FPRWIDTH*(n)(base)
> -#define REST_2FPRS(n, base)	REST_FPR(n, base); REST_FPR(n+1, base)
> -#define REST_4FPRS(n, base)	REST_2FPRS(n, base); REST_2FPRS(n+2, base)
> -#define REST_8FPRS(n, base)	REST_4FPRS(n, base); REST_4FPRS(n+4, base)
> -#define REST_16FPRS(n, base)	REST_8FPRS(n, base); REST_8FPRS(n+8, base)
> -#define REST_32FPRS(n, base)	REST_16FPRS(n, base); REST_16FPRS(n+16, base)
> +#ifdef CONFIG_BGP
> +#define SAVE_FPR(n, b, base)	li b, THREAD_FPR0+(16*(n)); STFPDX(n, base, b)
> +#define REST_FPR(n, b, base)	li b, THREAD_FPR0+(16*(n)); LFPDX(n, base, b)

16*?  Are these FP regs 64 or 128 bits wide?  If 128 you are doing to
have to play with TS_WIDTH to get the size of the FPs correct in the
thread_struct.

I think there's a bug here.

> +#else /* CONFIG_BGP */
> +#define SAVE_FPR(n, b, base)	(stfd	n, THREAD_FPR0+8*TS_FPRWIDTH*(n)(base))
> +#define REST_FPR(n, b, base)	(lfd	n, THREAD_FPR0+8*TS_FPRWIDTH*(n)(base))
> +#endif /* CONFIG_BGP */
> +
> +#define SAVE_2FPRS(n, b, base)	SAVE_FPR(n, b, base); SAVE_FPR(n+1, b, 
base)
> +#define SAVE_4FPRS(n, b, base)	SAVE_2FPRS(n, b, base); SAVE_2FPRS(n+2,
 b, base)
> +#define SAVE_8FPRS(n, b, base)	SAVE_4FPRS(n, b, base); SAVE_4FPRS(n+4,
 b, base)
> +#define SAVE_16FPRS(n, b, base)	SAVE_8FPRS(n, b, base); SAVE_8FPRS(n+8,
 b, base)
> +#define SAVE_32FPRS(n, b, base)	SAVE_16FPRS(n, b, base); \
> +				SAVE_16FPRS(n+16, b, base)
> +#define REST_2FPRS(n, b, base)	REST_FPR(n, b, base); REST_FPR(n+1, b, 
base)
> +#define REST_4FPRS(n, b, base)	REST_2FPRS(n, b, base); REST_2FPRS(n+2,
 b, base)
> +#define REST_8FPRS(n, b, base)	REST_4FPRS(n, b, base); REST_4FPRS(n+4,
 b, base)
> +#define REST_16FPRS(n, b, base)	REST_8FPRS(n, b, base); REST_8FPRS(n+8,
 b, base)
> +#define REST_32FPRS(n, b, base)	REST_16FPRS(n, b, base); \
> +				REST_16FPRS(n+16, b, base)
>  
>  #define SAVE_VR(n,b,base)	li b,THREAD_VR0+(16*(n));  stvx n,base,b
>  #define SAVE_2VRS(n,b,base)	SAVE_VR(n,b,base); SAVE_VR(n+1,b,base)
> diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
> index de36955..9f11c66 100644
> --- a/arch/powerpc/kernel/fpu.S
> +++ b/arch/powerpc/kernel/fpu.S
> @@ -30,7 +30,7 @@
>  BEGIN_FTR_SECTION							\
>  	b	2f;							\
>  END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
> -	REST_32FPRS(n,base);						\
> +	REST_32FPRS(n,c,base);						\
>  	b	3f;							\
>  2:	REST_32VSRS(n,c,base);						\
>  3:
> @@ -39,13 +39,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX);		
			\
>  BEGIN_FTR_SECTION							\
>  	b	2f;							\
>  END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
> -	SAVE_32FPRS(n,base);						\
> +	SAVE_32FPRS(n,c,base);						\
>  	b	3f;							\
>  2:	SAVE_32VSRS(n,c,base);						\
>  3:
>  #else
> -#define REST_32FPVSRS(n,b,base)	REST_32FPRS(n, base)
> -#define SAVE_32FPVSRS(n,b,base)	SAVE_32FPRS(n, base)
> +#define REST_32FPVSRS(n,b,base)	REST_32FPRS(n,b,base)
> +#define SAVE_32FPVSRS(n,b,base)	SAVE_32FPRS(n,b,base)
>  #endif
>  
>  /*
> diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/
Kconfig
> index f485fc5f..24a515e 100644
> --- a/arch/powerpc/platforms/44x/Kconfig
> +++ b/arch/powerpc/platforms/44x/Kconfig
> @@ -169,6 +169,15 @@ config YOSEMITE
>  	help
>  	  This option enables support for the AMCC PPC440EP evaluation board.
>  
> +config	BGP

Does this FPU feature have a specific name like double hammer?  I'd
rather have the BGP defconfig depend on PPC_FPU_DOUBLE_HUMMER, or
something like that...

> +	bool "Blue Gene/P"
> +	depends on 44x
> +	default n
> +	select PPC_FPU
> +	select PPC_DOUBLE_FPU

... in fact, it seem you are doing something like these here but you
don't use PPC_DOUBLE_FPU anywhere?

Mikey

> +	help
> +	  This option enables support for the IBM BlueGene/P supercomputer.
> +
>  config ISS4xx
>  	bool "ISS 4xx Simulator"
>  	depends on (44x || 40x)
> -- 
> 1.7.4.1
> 
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
>
Eric Van Hensbergen May 19, 2011, 1:53 p.m. UTC | #2
On Thu, May 19, 2011 at 12:58 AM, Michael Neuling <mikey@neuling.org> wrote:
> Eric,
>
>> This patch adds save/restore register support for the BlueGene/P
>> double hummer FPU.
>
> What does this mean?  Needs more details here.
>

Hi Mikey,

any specific details you are looking for here?  AFAIK these patches
are required for the kernel to save/restore the double hummer
properly.

>>
>> +#ifdef CONFIG_BGP
>> +#define LFPDX(frt, ra, rb)   .long (31<<26)|((frt)<<21)|((ra)<<16)| \
>> +                                                     ((rb)<<11)|(462<<1)
>> +#define STFPDX(frt, ra, rb)  .long (31<<26)|((frt)<<21)|((ra)<<16)| \
>> +                                                     ((rb)<<11)|(974<<1)
>> +#endif /* CONFIG_BGP */
>
> Put these in arch/powerpc/include/asm/ppc-opcode.h and reformat to fit
> whats there already.
>
> Also, don't need to put these defines inside a #ifdef.
>

Sure, I'll fix that up.

>> +#ifdef CONFIG_BGP
>> +#define SAVE_FPR(n, b, base) li b, THREAD_FPR0+(16*(n)); STFPDX(n, base, b)
>> +#define REST_FPR(n, b, base) li b, THREAD_FPR0+(16*(n)); LFPDX(n, base, b)
>
> 16*?  Are these FP regs 64 or 128 bits wide?  If 128 you are doing to
> have to play with TS_WIDTH to get the size of the FPs correct in the
> thread_struct.
>
> I think there's a bug here.
>

I actually have three different versions of this code from different
source patches that I'm drawing from - so your help in figuring out
the best way to approach this is appreciated.  The kittyhawk version
of the code has 8* instead of 16*.  According to the docs:
"Each of the two FPU units contains 32 64-bit floating point registers
for a total of 64 FP registers per processor." which would seem to
point to the kittyhawk version - but they have a second SAVE_32SFPRS
for the second hummer.  What wasn't clear to me with this version of
the code was whether or not they were doing something clever like
saving the pair of the 64-bit FPU registers in a single 128-bit slot
(seems plausible).  If this is not the way to go, I can certainly
switch the kittyhawk version of the patch with the *, the extra
SAVE32SFPR and the extra double hummer specific storage space in the
thread_struct.  If it would help I can post an alternate version of
the patch for discussion with the kittyhawk version.

>>  /*
>> diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/
> Kconfig
>> index f485fc5f..24a515e 100644
>> --- a/arch/powerpc/platforms/44x/Kconfig
>> +++ b/arch/powerpc/platforms/44x/Kconfig
>> @@ -169,6 +169,15 @@ config YOSEMITE
>>       help
>>         This option enables support for the AMCC PPC440EP evaluation board.
>>
>> +config       BGP
>
> Does this FPU feature have a specific name like double hammer?  I'd
> rather have the BGP defconfig depend on PPC_FPU_DOUBLE_HUMMER, or
> something like that...
>
>> +     bool "Blue Gene/P"
>> +     depends on 44x
>> +     default n
>> +     select PPC_FPU
>> +     select PPC_DOUBLE_FPU
>
> ... in fact, it seem you are doing something like these here but you
> don't use PPC_DOUBLE_FPU anywhere?
>

A fair point.  I'm fine with calling it DOUBLE_HUMMER, but I wasn't sure if
that was "too internal" of a name for the kernel.  Let me know and
I'll fix it up.
I'll also change the CONFIG_BGP defines in the FPU code to PPC_DOUBLE_FPU
or PPC_DOUBLE_HUMMER depending on what the community decides.

Thanks for the feedback!

        -eric
Kazutomo Yoshii May 19, 2011, 3:22 p.m. UTC | #3
On 05/19/2011 08:53 AM, Eric Van Hensbergen wrote:
>>> >>  +#ifdef CONFIG_BGP
>>> >>  +#define SAVE_FPR(n, b, base) li b, THREAD_FPR0+(16*(n)); STFPDX(n, base, b)
>>> >>  +#define REST_FPR(n, b, base) li b, THREAD_FPR0+(16*(n)); LFPDX(n, base, b)
>>>        
>> >
>> >  16*?  Are these FP regs 64 or 128 bits wide?  If 128 you are doing to
>> >  have to play with TS_WIDTH to get the size of the FPs correct in the
>> >  thread_struct.
>> >
>> >  I think there's a bug here.
>> >
>>      
> I actually have three different versions of this code from different
> source patches that I'm drawing from - so your help in figuring out
> the best way to approach this is appreciated.  The kittyhawk version
> of the code has 8* instead of 16*.  According to the docs:
> "Each of the two FPU units contains 32 64-bit floating point registers
> for a total of 64 FP registers per processor." which would seem to
> point to the kittyhawk version - but they have a second SAVE_32SFPRS
> for the second hummer.  What wasn't clear to me with this version of
> the code was whether or not they were doing something clever like
> saving the pair of the 64-bit FPU registers in a single 128-bit slot
> (seems plausible).
Yes, it does.    SIMD like instructions are added to BGP PPC.
stdpdx or lfpdx, for example, handle two FPU registers (primary and 
secondary).

Thanks,
Kaz
Michael Neuling May 19, 2011, 9:36 p.m. UTC | #4
In message <BANLkTimKhApFW8G1-pG0u_9Kv2YB0R1O0w@mail.gmail.com> you wrote:
> On Thu, May 19, 2011 at 12:58 AM, Michael Neuling <mikey@neuling.org> wrote=
> :
> > Eric,
> >
> >> This patch adds save/restore register support for the BlueGene/P
> >> double hummer FPU.
> >
> > What does this mean? =A0Needs more details here.
> >
> 
> Hi Mikey,
> 
> any specific details you are looking for here?  AFAIK these patches
> are required for the kernel to save/restore the double hummer
> properly.

I should have been more specific.  What does double hammer mean?

I description of how double hammer differs from normal and why a change
in the fpu code is needed would be great.

> 
> >>
> >> +#ifdef CONFIG_BGP
> >> +#define LFPDX(frt, ra, rb) =A0 .long (31<<26)|((frt)<<21)|((ra)<<16)| \
> >> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 ((rb)<<11)|(462<<1)
> >> +#define STFPDX(frt, ra, rb) =A0.long (31<<26)|((frt)<<21)|((ra)<<16)| \
> >> + =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =
> =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 =A0 ((rb)<<11)|(974<<1)
> >> +#endif /* CONFIG_BGP */
> >
> > Put these in arch/powerpc/include/asm/ppc-opcode.h and reformat to fit
> > whats there already.
> >
> > Also, don't need to put these defines inside a #ifdef.
> >
> 
> Sure, I'll fix that up.
> 
> >> +#ifdef CONFIG_BGP
> >> +#define SAVE_FPR(n, b, base) li b, THREAD_FPR0+(16*(n)); STFPDX(n, base=
> , b)
> >> +#define REST_FPR(n, b, base) li b, THREAD_FPR0+(16*(n)); LFPDX(n, base,=
>  b)
> >
> > 16*? =A0Are these FP regs 64 or 128 bits wide? =A0If 128 you are doing to
> > have to play with TS_WIDTH to get the size of the FPs correct in the
> > thread_struct.
> >
> > I think there's a bug here.
> >
> 
> I actually have three different versions of this code from different
> source patches that I'm drawing from - so your help in figuring out
> the best way to approach this is appreciated.  The kittyhawk version
> of the code has 8* instead of 16*.  According to the docs:
> "Each of the two FPU units contains 32 64-bit floating point registers
> for a total of 64 FP registers per processor." which would seem to
> point to the kittyhawk version - but they have a second SAVE_32SFPRS
> for the second hummer.  What wasn't clear to me with this version of
> the code was whether or not they were doing something clever like
> saving the pair of the 64-bit FPU registers in a single 128-bit slot
> (seems plausible).  

Ok, sounds like there is 32*8*2 bytes of data, rather than the normal
32*8 bytes for FP only (ignoring VSX).  If this is the case, then you'll
need make 'fpr' in the thread struct bigger which you can do by setting
TS_FPRWIDTH = 2 like we do for VSX.

If there is some instruction that saves and restores two of these at a
time (which LFPDX/STFPDX might I guess), then we can use that, otherwise
we'll have to do 64 saves/restores.  Double load/stores will be faster
I'm guessing though.  

If two at a time, do we need to increase the index in pairs?

> If this is not the way to go, I can certainly
> switch the kittyhawk version of the patch with the *, the extra
> SAVE32SFPR and the extra double hummer specific storage space in the
> thread_struct.  

I'd be tempted to keep it in the 'fpr' part of the struct so you can
then access it with ptrace/signals/core dumps.

> If it would help I can post an alternate version of the patch for
> discussion with the kittyhawk version.

Sure.

The most useful thing would be to see the instruction definition for
STFPDX/LFPDX.

> 
> >> =A0/*
> >> diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms=
> /44x/
> > Kconfig
> >> index f485fc5f..24a515e 100644
> >> --- a/arch/powerpc/platforms/44x/Kconfig
> >> +++ b/arch/powerpc/platforms/44x/Kconfig
> >> @@ -169,6 +169,15 @@ config YOSEMITE
> >> =A0 =A0 =A0 help
> >> =A0 =A0 =A0 =A0 This option enables support for the AMCC PPC440EP evalua=
> tion board.
> >>
> >> +config =A0 =A0 =A0 BGP
> >
> > Does this FPU feature have a specific name like double hammer? =A0I'd
> > rather have the BGP defconfig depend on PPC_FPU_DOUBLE_HUMMER, or
> > something like that...
> >
> >> + =A0 =A0 bool "Blue Gene/P"
> >> + =A0 =A0 depends on 44x
> >> + =A0 =A0 default n
> >> + =A0 =A0 select PPC_FPU
> >> + =A0 =A0 select PPC_DOUBLE_FPU
> >
> > ... in fact, it seem you are doing something like these here but you
> > don't use PPC_DOUBLE_FPU anywhere?
> >
> 
> A fair point.  I'm fine with calling it DOUBLE_HUMMER, but I wasn't sure if
> that was "too internal" of a name for the kernel.  Let me know and
> I'll fix it up.

What I'm mostly concerned about is disassociating it with a particular
CPU.  

If it has an external name, then all the better.

> I'll also change the CONFIG_BGP defines in the FPU code to PPC_DOUBLE_FPU
> or PPC_DOUBLE_HUMMER depending on what the community decides.

Mikey
Eric Van Hensbergen May 19, 2011, 9:55 p.m. UTC | #5
Damnit Mikey, just after I hit send on [V2].....

On Thu, May 19, 2011 at 4:36 PM, Michael Neuling <mikey@neuling.org> wrote:
> In message <BANLkTimKhApFW8G1-pG0u_9Kv2YB0R1O0w@mail.gmail.com> you wrote:
>> On Thu, May 19, 2011 at 12:58 AM, Michael Neuling <mikey@neuling.org> wrote=
>> :
>> > Eric,
>> >
>> >> This patch adds save/restore register support for the BlueGene/P
>> >> double hummer FPU.
>> >
>> > What does this mean? =A0Needs more details here.
>> >

okay, I've changed it a bit in [V2], if you want more I can do my best.

>> "Each of the two FPU units contains 32 64-bit floating point registers
>> for a total of 64 FP registers per processor." which would seem to
>> point to the kittyhawk version - but they have a second SAVE_32SFPRS
>> for the second hummer.  What wasn't clear to me with this version of
>> the code was whether or not they were doing something clever like
>> saving the pair of the 64-bit FPU registers in a single 128-bit slot
>> (seems plausible).
>
> Ok, sounds like there is 32*8*2 bytes of data, rather than the normal
> 32*8 bytes for FP only (ignoring VSX).  If this is the case, then you'll
> need make 'fpr' in the thread struct bigger which you can do by setting
> TS_FPRWIDTH = 2 like we do for VSX.
>

Okay, I'll incorporate that into [V3].

> If there is some instruction that saves and restores two of these at a
> time (which LFPDX/STFPDX might I guess), then we can use that, otherwise
> we'll have to do 64 saves/restores.  Double load/stores will be faster
> I'm guessing though.

I assume that's true.

>
> If two at a time, do we need to increase the index in pairs?
>

I don't believe so.

>> If this is not the way to go, I can certainly
>> switch the kittyhawk version of the patch with the *, the extra
>> SAVE32SFPR and the extra double hummer specific storage space in the
>> thread_struct.
>
> I'd be tempted to keep it in the 'fpr' part of the struct so you can
> then access it with ptrace/signals/core dumps.
>
>> If it would help I can post an alternate version of the patch for
>> discussion with the kittyhawk version.
>
> Sure.
>

Kittyhawk version can be seen here:

http://git.kernel.org/?p=linux/kernel/git/ericvh/bluegene.git;a=commitdiff;h=94bffe786324b9bd07187b11afd836e3ec362d95

>
> The most useful thing would be to see the instruction definition for
> STFPDX/LFPDX.
>

https://wiki.alcf.anl.gov/images/d/d9/PPC440_FP2_arch.pdf

>>
>> >> =A0/*
>> >> diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms=
>> /44x/
>> > Kconfig
>> >> index f485fc5f..24a515e 100644
>> >> --- a/arch/powerpc/platforms/44x/Kconfig
>> >> +++ b/arch/powerpc/platforms/44x/Kconfig
>> >> @@ -169,6 +169,15 @@ config YOSEMITE
>> >> =A0 =A0 =A0 help
>> >> =A0 =A0 =A0 =A0 This option enables support for the AMCC PPC440EP evalua=
>> tion board.
>> >>
>> >> +config =A0 =A0 =A0 BGP
>> >
>> > Does this FPU feature have a specific name like double hammer? =A0I'd
>> > rather have the BGP defconfig depend on PPC_FPU_DOUBLE_HUMMER, or
>> > something like that...
>> >
>> >> + =A0 =A0 bool "Blue Gene/P"
>> >> + =A0 =A0 depends on 44x
>> >> + =A0 =A0 default n
>> >> + =A0 =A0 select PPC_FPU
>> >> + =A0 =A0 select PPC_DOUBLE_FPU
>> >
>> > ... in fact, it seem you are doing something like these here but you
>> > don't use PPC_DOUBLE_FPU anywhere?
>> >
>>
>> A fair point.  I'm fine with calling it DOUBLE_HUMMER, but I wasn't sure if
>> that was "too internal" of a name for the kernel.  Let me know and
>> I'll fix it up.
>
> What I'm mostly concerned about is disassociating it with a particular
> CPU.
>
> If it has an external name, then all the better.
>

Since it isn't available on other chips, shoudl it just be PPC_BGP_FPU
or PPC_BGP_DOUBLE_FPU?

      -eric
Benjamin Herrenschmidt May 20, 2011, 12:52 a.m. UTC | #6
On Thu, 2011-05-19 at 15:58 +1000, Michael Neuling wrote:

> > +
> >  #define SAVE_2GPRS(n, base)	SAVE_GPR(n, base); SAVE_GPR(n+1, base)
> >  #define SAVE_4GPRS(n, base)	SAVE_2GPRS(n, base); SAVE_2GPRS(n+2, base)
> >  #define SAVE_8GPRS(n, base)	SAVE_4GPRS(n, base); SAVE_4GPRS(n+4, base)
> > @@ -97,18 +104,26 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
> >  #define REST_8GPRS(n, base)	REST_4GPRS(n, base); REST_4GPRS(n+4, base)
> >  #define REST_10GPRS(n, base)	REST_8GPRS(n, base); REST_2GPRS(n+8, base)
> >  
> > -#define SAVE_FPR(n, base)	stfd	n,THREAD_FPR0+8*TS_FPRWIDTH*(n)(base)
> > -#define SAVE_2FPRS(n, base)	SAVE_FPR(n, base); SAVE_FPR(n+1, base)
> > -#define SAVE_4FPRS(n, base)	SAVE_2FPRS(n, base); SAVE_2FPRS(n+2, base)
> > -#define SAVE_8FPRS(n, base)	SAVE_4FPRS(n, base); SAVE_4FPRS(n+4, base)
> > -#define SAVE_16FPRS(n, base)	SAVE_8FPRS(n, base); SAVE_8FPRS(n+8, base)
> > -#define SAVE_32FPRS(n, base)	SAVE_16FPRS(n, base); SAVE_16FPRS(n+16, base)
> > -#define REST_FPR(n, base)	lfd	n,THREAD_FPR0+8*TS_FPRWIDTH*(n)(base)
> > -#define REST_2FPRS(n, base)	REST_FPR(n, base); REST_FPR(n+1, base)
> > -#define REST_4FPRS(n, base)	REST_2FPRS(n, base); REST_2FPRS(n+2, base)
> > -#define REST_8FPRS(n, base)	REST_4FPRS(n, base); REST_4FPRS(n+4, base)
> > -#define REST_16FPRS(n, base)	REST_8FPRS(n, base); REST_8FPRS(n+8, base)
> > -#define REST_32FPRS(n, base)	REST_16FPRS(n, base); REST_16FPRS(n+16, base)
> > +#ifdef CONFIG_BGP
> > +#define SAVE_FPR(n, b, base)	li b, THREAD_FPR0+(16*(n)); STFPDX(n, base, b)
> > +#define REST_FPR(n, b, base)	li b, THREAD_FPR0+(16*(n)); LFPDX(n, base, b)
> 
> 16*?  Are these FP regs 64 or 128 bits wide?  If 128 you are doing to
> have to play with TS_WIDTH to get the size of the FPs correct in the
> thread_struct.
> 
> I think there's a bug here.

Regardless of that, btw, I don't think it's very sane to change those
macros that way. I'd rather have a separate set to save/restore the BG
stuff and separate code alltogether for loading/saving/flushing/etc...
like FSP SPE. The FPU save/restore code is already too complex as it is.

Also, should we aim to have this co-exist with other 4xx platforms in a
multiplatform kernel ? In that case it should not break the normal FP
case. Feel free to use CPU feature bits, there are 2 or 3 left available
in the 32-bit space, maybe pick a "combo" one for BGP (or one for hummer
and a MMU bit for the odd SMP tricks).

Hrm... thinking of which, what about doing it using the alternate
feature section ? This allows two "alternate" piece of codes to overlay,
the kernel will replace the original one with the alternative one if the
feature bits match. That way you can just stick an alternate around
SAVE/REST_32FPRS that replace them with your new SAVE/REST_32HFPRS (or
whatever you want to call you new set of macros).

Of course you'll probably need a separate area in the thread
struct/pt_regs etc... which mean a userspace ABI change, a change of the
sig context etc etc ....

Cheers,
Ben.
Benjamin Herrenschmidt May 20, 2011, 12:53 a.m. UTC | #7
On Thu, 2011-05-19 at 08:53 -0500, Eric Van Hensbergen wrote:
> On Thu, May 19, 2011 at 12:58 AM, Michael Neuling <mikey@neuling.org> wrote:
> > Eric,
> >
> >> This patch adds save/restore register support for the BlueGene/P
> >> double hummer FPU.
> >
> > What does this mean?  Needs more details here.
> >
> 
> Hi Mikey,
> 
> any specific details you are looking for here?  AFAIK these patches
> are required for the kernel to save/restore the double hummer
> properly.

A description of the double hummer would be good.

Cheers,
Ben.

> >>
> >> +#ifdef CONFIG_BGP
> >> +#define LFPDX(frt, ra, rb)   .long (31<<26)|((frt)<<21)|((ra)<<16)| \
> >> +                                                     ((rb)<<11)|(462<<1)
> >> +#define STFPDX(frt, ra, rb)  .long (31<<26)|((frt)<<21)|((ra)<<16)| \
> >> +                                                     ((rb)<<11)|(974<<1)
> >> +#endif /* CONFIG_BGP */
> >
> > Put these in arch/powerpc/include/asm/ppc-opcode.h and reformat to fit
> > whats there already.
> >
> > Also, don't need to put these defines inside a #ifdef.
> >
> 
> Sure, I'll fix that up.
> 
> >> +#ifdef CONFIG_BGP
> >> +#define SAVE_FPR(n, b, base) li b, THREAD_FPR0+(16*(n)); STFPDX(n, base, b)
> >> +#define REST_FPR(n, b, base) li b, THREAD_FPR0+(16*(n)); LFPDX(n, base, b)
> >
> > 16*?  Are these FP regs 64 or 128 bits wide?  If 128 you are doing to
> > have to play with TS_WIDTH to get the size of the FPs correct in the
> > thread_struct.
> >
> > I think there's a bug here.
> >
> 
> I actually have three different versions of this code from different
> source patches that I'm drawing from - so your help in figuring out
> the best way to approach this is appreciated.  The kittyhawk version
> of the code has 8* instead of 16*.  According to the docs:
> "Each of the two FPU units contains 32 64-bit floating point registers
> for a total of 64 FP registers per processor." which would seem to
> point to the kittyhawk version - but they have a second SAVE_32SFPRS
> for the second hummer.  What wasn't clear to me with this version of
> the code was whether or not they were doing something clever like
> saving the pair of the 64-bit FPU registers in a single 128-bit slot
> (seems plausible).  If this is not the way to go, I can certainly
> switch the kittyhawk version of the patch with the *, the extra
> SAVE32SFPR and the extra double hummer specific storage space in the
> thread_struct.  If it would help I can post an alternate version of
> the patch for discussion with the kittyhawk version.
> 
> >>  /*
> >> diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/
> > Kconfig
> >> index f485fc5f..24a515e 100644
> >> --- a/arch/powerpc/platforms/44x/Kconfig
> >> +++ b/arch/powerpc/platforms/44x/Kconfig
> >> @@ -169,6 +169,15 @@ config YOSEMITE
> >>       help
> >>         This option enables support for the AMCC PPC440EP evaluation board.
> >>
> >> +config       BGP
> >
> > Does this FPU feature have a specific name like double hammer?  I'd
> > rather have the BGP defconfig depend on PPC_FPU_DOUBLE_HUMMER, or
> > something like that...
> >
> >> +     bool "Blue Gene/P"
> >> +     depends on 44x
> >> +     default n
> >> +     select PPC_FPU
> >> +     select PPC_DOUBLE_FPU
> >
> > ... in fact, it seem you are doing something like these here but you
> > don't use PPC_DOUBLE_FPU anywhere?
> >
> 
> A fair point.  I'm fine with calling it DOUBLE_HUMMER, but I wasn't sure if
> that was "too internal" of a name for the kernel.  Let me know and
> I'll fix it up.
> I'll also change the CONFIG_BGP defines in the FPU code to PPC_DOUBLE_FPU
> or PPC_DOUBLE_HUMMER depending on what the community decides.
> 
> Thanks for the feedback!
> 
>         -eric
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 9821006..daa22bb 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -88,6 +88,13 @@  END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 				REST_10GPRS(22, base)
 #endif
 
+#ifdef CONFIG_BGP
+#define LFPDX(frt, ra, rb)	.long (31<<26)|((frt)<<21)|((ra)<<16)| \
+							((rb)<<11)|(462<<1)
+#define STFPDX(frt, ra, rb)	.long (31<<26)|((frt)<<21)|((ra)<<16)| \
+							((rb)<<11)|(974<<1)
+#endif /* CONFIG_BGP */
+
 #define SAVE_2GPRS(n, base)	SAVE_GPR(n, base); SAVE_GPR(n+1, base)
 #define SAVE_4GPRS(n, base)	SAVE_2GPRS(n, base); SAVE_2GPRS(n+2, base)
 #define SAVE_8GPRS(n, base)	SAVE_4GPRS(n, base); SAVE_4GPRS(n+4, base)
@@ -97,18 +104,26 @@  END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 #define REST_8GPRS(n, base)	REST_4GPRS(n, base); REST_4GPRS(n+4, base)
 #define REST_10GPRS(n, base)	REST_8GPRS(n, base); REST_2GPRS(n+8, base)
 
-#define SAVE_FPR(n, base)	stfd	n,THREAD_FPR0+8*TS_FPRWIDTH*(n)(base)
-#define SAVE_2FPRS(n, base)	SAVE_FPR(n, base); SAVE_FPR(n+1, base)
-#define SAVE_4FPRS(n, base)	SAVE_2FPRS(n, base); SAVE_2FPRS(n+2, base)
-#define SAVE_8FPRS(n, base)	SAVE_4FPRS(n, base); SAVE_4FPRS(n+4, base)
-#define SAVE_16FPRS(n, base)	SAVE_8FPRS(n, base); SAVE_8FPRS(n+8, base)
-#define SAVE_32FPRS(n, base)	SAVE_16FPRS(n, base); SAVE_16FPRS(n+16, base)
-#define REST_FPR(n, base)	lfd	n,THREAD_FPR0+8*TS_FPRWIDTH*(n)(base)
-#define REST_2FPRS(n, base)	REST_FPR(n, base); REST_FPR(n+1, base)
-#define REST_4FPRS(n, base)	REST_2FPRS(n, base); REST_2FPRS(n+2, base)
-#define REST_8FPRS(n, base)	REST_4FPRS(n, base); REST_4FPRS(n+4, base)
-#define REST_16FPRS(n, base)	REST_8FPRS(n, base); REST_8FPRS(n+8, base)
-#define REST_32FPRS(n, base)	REST_16FPRS(n, base); REST_16FPRS(n+16, base)
+#ifdef CONFIG_BGP
+#define SAVE_FPR(n, b, base)	li b, THREAD_FPR0+(16*(n)); STFPDX(n, base, b)
+#define REST_FPR(n, b, base)	li b, THREAD_FPR0+(16*(n)); LFPDX(n, base, b)
+#else /* CONFIG_BGP */
+#define SAVE_FPR(n, b, base)	(stfd	n, THREAD_FPR0+8*TS_FPRWIDTH*(n)(base))
+#define REST_FPR(n, b, base)	(lfd	n, THREAD_FPR0+8*TS_FPRWIDTH*(n)(base))
+#endif /* CONFIG_BGP */
+
+#define SAVE_2FPRS(n, b, base)	SAVE_FPR(n, b, base); SAVE_FPR(n+1, b, base)
+#define SAVE_4FPRS(n, b, base)	SAVE_2FPRS(n, b, base); SAVE_2FPRS(n+2, b, base)
+#define SAVE_8FPRS(n, b, base)	SAVE_4FPRS(n, b, base); SAVE_4FPRS(n+4, b, base)
+#define SAVE_16FPRS(n, b, base)	SAVE_8FPRS(n, b, base); SAVE_8FPRS(n+8, b, base)
+#define SAVE_32FPRS(n, b, base)	SAVE_16FPRS(n, b, base); \
+				SAVE_16FPRS(n+16, b, base)
+#define REST_2FPRS(n, b, base)	REST_FPR(n, b, base); REST_FPR(n+1, b, base)
+#define REST_4FPRS(n, b, base)	REST_2FPRS(n, b, base); REST_2FPRS(n+2, b, base)
+#define REST_8FPRS(n, b, base)	REST_4FPRS(n, b, base); REST_4FPRS(n+4, b, base)
+#define REST_16FPRS(n, b, base)	REST_8FPRS(n, b, base); REST_8FPRS(n+8, b, base)
+#define REST_32FPRS(n, b, base)	REST_16FPRS(n, b, base); \
+				REST_16FPRS(n+16, b, base)
 
 #define SAVE_VR(n,b,base)	li b,THREAD_VR0+(16*(n));  stvx n,base,b
 #define SAVE_2VRS(n,b,base)	SAVE_VR(n,b,base); SAVE_VR(n+1,b,base)
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index de36955..9f11c66 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -30,7 +30,7 @@ 
 BEGIN_FTR_SECTION							\
 	b	2f;							\
 END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
-	REST_32FPRS(n,base);						\
+	REST_32FPRS(n,c,base);						\
 	b	3f;							\
 2:	REST_32VSRS(n,c,base);						\
 3:
@@ -39,13 +39,13 @@  END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
 BEGIN_FTR_SECTION							\
 	b	2f;							\
 END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
-	SAVE_32FPRS(n,base);						\
+	SAVE_32FPRS(n,c,base);						\
 	b	3f;							\
 2:	SAVE_32VSRS(n,c,base);						\
 3:
 #else
-#define REST_32FPVSRS(n,b,base)	REST_32FPRS(n, base)
-#define SAVE_32FPVSRS(n,b,base)	SAVE_32FPRS(n, base)
+#define REST_32FPVSRS(n,b,base)	REST_32FPRS(n,b,base)
+#define SAVE_32FPVSRS(n,b,base)	SAVE_32FPRS(n,b,base)
 #endif
 
 /*
diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig
index f485fc5f..24a515e 100644
--- a/arch/powerpc/platforms/44x/Kconfig
+++ b/arch/powerpc/platforms/44x/Kconfig
@@ -169,6 +169,15 @@  config YOSEMITE
 	help
 	  This option enables support for the AMCC PPC440EP evaluation board.
 
+config	BGP
+	bool "Blue Gene/P"
+	depends on 44x
+	default n
+	select PPC_FPU
+	select PPC_DOUBLE_FPU
+	help
+	  This option enables support for the IBM BlueGene/P supercomputer.
+
 config ISS4xx
 	bool "ISS 4xx Simulator"
 	depends on (44x || 40x)