diff mbox

[RFC,v2] MPC5121 TLB errata workaround

Message ID 200903131121.00077.david.jander@protonic.nl (mailing list archive)
State Superseded, archived
Headers show

Commit Message

David Jander March 13, 2009, 10:20 a.m. UTC
Complete workaround for DTLB errata in MPC5121e processors of die M36P and 
older (all currently existing versions).

Due to the bug, the hardware-implemented LRU algorythm always goes to way 1 of 
the TLB. This fix implements the proposed software workaround in form of a LRW table for chosing the TLB-way.

Signed-off-by: David Jander <david@protonic.nl>

---
 arch/powerpc/kernel/head_32.S |   65 ++++++++++++++++++++++++++++++++++++++++
 1 files changed, 65 insertions(+), 0 deletions(-)

Comments

David Jander March 13, 2009, 10:26 a.m. UTC | #1
Forgot to mention: The patch is based on denx git tree head 'ads5121', but 
it should apply without problem (some offset at most) to mainline.

Best regards,
Kumar Gala March 13, 2009, 1:21 p.m. UTC | #2
>

What does cat /proc/cpuinfo show on this board?

> +#ifdef CONFIG_PPC_MPC512x
> +/* MPC512x: workaround for errata in die M36P and earlier:
> + * Implement LRW for TLB way.
> + */

This errata impacts a number of cores and so we should make this a CPU  
feature fixup rather than #ifdef code.

> +       mfspr   r3,SPRN_DMISS
> +       rlwinm  r3,r3,19,25,29 /* Get Address bits 19:15 */
> +       lis     r2,lrw@ha       /* Search index in lrw[] */
> +       addi    r2,r2,lrw@l
> +       tophys(r2,r2)
> +       lwzx    r1,r3,r2       /* Get item from lrw[] */
> +       cmpwi   0,r1,0         /* Was it way 0 last time? */

Why not use a bit vector since we only need one bit of information.   
Additionally we can use a single SPRG at that point instead to keep  
track of the LRU information.

> +       beq-    0,113f         /* Then goto 113: */
> +
> +       mfspr   r1,SPRN_SRR1
> +       rlwinm  r1,r1,0,15,13  /* Mask out SRR1[WAY] */
> +       mtspr   SPRN_SRR1,r1
> +
> +       li      r0,0
> +       stwx    r0,r3,r2       /* Make lrw[] entry 0 */
> +       b       114f
> +113:
> +       li      r0,1
> +       stwx    r0,r3,r2       /* Make lrw[] entry 1 */
> +114:
> +#endif
>        mfctr   r0
>        /* Get PTE (linux-style) and check access */
>        mfspr   r3,SPRN_DMISS
> @@ -688,6 +717,34 @@ DataStoreTLBMiss:
>        .globl mol_trampoline
>        .set mol_trampoline, i0x2f00
>
> +#ifdef CONFIG_PPC_MPC512x
> +TlbWo:
> +/* MPC512x: workaround for errata in die M36P and earlier:
> + * Implement LRW for TLB way.
> + */
> +       mfspr   r3,SPRN_DMISS
> +       rlwinm  r3,r3,19,25,29 /* Get Address bits 19:15 */
> +       lis     r2,lrw@ha       /* Search index in lrw[] */
> +       addi    r2,r2,lrw@l
> +       tophys(r2,r2)
> +       lwzx    r1,r3,r2       /* Get item from lrw[] */
> +       cmpwi   0,r1,0         /* Was it way 0 last time? */
> +       beq-    0,113f         /* Then goto 113: */
> +
> +       mfspr   r1,SPRN_SRR1
> +       rlwinm  r1,r1,0,15,13  /* Mask out SRR1[WAY] */
> +       mtspr   SPRN_SRR1,r1
> +
> +       li      r0,0
> +       stwx    r0,r3,r2       /* Make lrw[] entry 0 */
> +       b       114f
> +113:
> +       li      r0,1
> +       stwx    r0,r3,r2       /* Make lrw[] entry 1 */
> +114:
> +       b       RFTlbWo
> +#endif
> +
>        . = 0x3000
>
> AltiVecUnavailable:
> @@ -1321,6 +1378,14 @@ intercept_table:
>        .long 0, 0, 0, 0, 0, 0, 0, 0
>        .long 0, 0, 0, 0, 0, 0, 0, 0
>        .long 0, 0, 0, 0, 0, 0, 0, 0
> +
> +#ifdef CONFIG_PPC_MPC512x
> +lrw:
> +       .long 0, 0, 0, 0, 0, 0, 0, 0
> +       .long 0, 0, 0, 0, 0, 0, 0, 0
> +       .long 0, 0, 0, 0, 0, 0, 0, 0
> +       .long 0, 0, 0, 0, 0, 0, 0, 0
> +#endif
>
> /* Room for two PTE pointers, usually the kernel and current user  
> pointers
>  * to their respective root page table.
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@ozlabs.org
> https://ozlabs.org/mailman/listinfo/linuxppc-dev
Kumar Gala March 13, 2009, 1:22 p.m. UTC | #3
On Mar 13, 2009, at 5:26 AM, David Jander wrote:

>
> Forgot to mention: The patch is based on denx git tree head  
> 'ads5121', but
> it should apply without problem (some offset at most) to mainline.
>
> Best regards,
>

Out of interest did this version produce better performance on the  
benchmarks than your v1 version?

- k
David Jander March 13, 2009, 2:16 p.m. UTC | #4
On Friday 13 March 2009 14:22:22 Kumar Gala wrote:
> 
> On Mar 13, 2009, at 5:26 AM, David Jander wrote:
> 
> >
> > Forgot to mention: The patch is based on denx git tree head  
> > 'ads5121', but
> > it should apply without problem (some offset at most) to mainline.
> >
> > Best regards,
> >
> 
> Out of interest did this version produce better performance on the  
> benchmarks than your v1 version?

Some examples:

1.- mplayer -nosound -benchmark testfile.mpeg (a DVD-mpeg2 file):

No fix at all:
VC: 30.5s VO: 53.4s Sys:1.95s Total: 85.8s

First fix (force writes to way 0):
VC: 24.3s VO: 40.6s Sys:1.95s Total: 66.9s

Complete fix (implementing lrw):
VC: 23.1s VO: 31.5s Sys:1.03s Total: 55.6s


2.- prboom -timedemo doombench1 (where doombench1.lmp is prerecorded demo):

No fix at all: 14.1 fps
First fix (force writes to way 0): 16.7 fps
Complete fix (implementing lrw): 18.1 fps


3.- Synthetic and pathologic memcpy() benchmark:
No fix at all: 26 Mbyte/s
First fix (force writes to way 0): 160 MByte/s
Complete fix (implementing lrw): 163 MByte/s

Note, that this benchmark should't really show any difference between v1 
and v2, since v1 is almost the best possible fix for copy's only.

Tell me if you know of some other interesting benchmarks to try.

Best regards,
David Jander March 13, 2009, 2:24 p.m. UTC | #5
On Friday 13 March 2009 14:21:57 Kumar Gala wrote:
> >
> 
> What does cat /proc/cpuinfo show on this board?

# cat /proc/cpuinfo
processor       : 0
cpu             : e300c4
clock           : 400.000000MHz
revision        : 1.0 (pvr 8086 2010)
bogomips        : 99.84
timebase        : 50000000
platform        : MPC5121 Generic

> > +#ifdef CONFIG_PPC_MPC512x
> > +/* MPC512x: workaround for errata in die M36P and earlier:
> > + * Implement LRW for TLB way.
> > + */
> 
> This errata impacts a number of cores and so we should make this a CPU  
> feature fixup rather than #ifdef code.

It should impact only MPC5121e and probably MPC5123, but according to
Freescale no other processors that use this core...
Anyway, I'll try to investigate about how to write a "CPU feature fixup",
I've never done that before (If you could give me a hint?)

> > +       mfspr   r3,SPRN_DMISS
> > +       rlwinm  r3,r3,19,25,29 /* Get Address bits 19:15 */
> > +       lis     r2,lrw@ha       /* Search index in lrw[] */
> > +       addi    r2,r2,lrw@l
> > +       tophys(r2,r2)
> > +       lwzx    r1,r3,r2       /* Get item from lrw[] */
> > +       cmpwi   0,r1,0         /* Was it way 0 last time? */
> 
> Why not use a bit vector since we only need one bit of information.   
> Additionally we can use a single SPRG at that point instead to keep  
> track of the LRU information.

Sounds interesting. I am just learning my first steps in powerpc-assembly, so
please forgive if this is a little inefficient still. I'll try again next week.

Greetings,
Kumar Gala March 13, 2009, 3:23 p.m. UTC | #6
On Mar 13, 2009, at 9:24 AM, David Jander wrote:

> On Friday 13 March 2009 14:21:57 Kumar Gala wrote:
>>>
>>
>> What does cat /proc/cpuinfo show on this board?
>
> # cat /proc/cpuinfo
> processor       : 0
> cpu             : e300c4
> clock           : 400.000000MHz
> revision        : 1.0 (pvr 8086 2010)
> bogomips        : 99.84
> timebase        : 50000000
> platform        : MPC5121 Generic
>
>>> +#ifdef CONFIG_PPC_MPC512x
>>> +/* MPC512x: workaround for errata in die M36P and earlier:
>>> + * Implement LRW for TLB way.
>>> + */
>>
>> This errata impacts a number of cores and so we should make this a  
>> CPU
>> feature fixup rather than #ifdef code.
>
> It should impact only MPC5121e and probably MPC5123, but according to
> Freescale no other processors that use this core...

Not sure about that.. But the errata impacts all e300c2/c3/c4 parts.

> Anyway, I'll try to investigate about how to write a "CPU feature  
> fixup",
> I've never done that before (If you could give me a hint?)

I've posted a patch that should add the CPU feature support.  This is  
only compile tested.  You'll need to try it out on real HW :)

>>> +       mfspr   r3,SPRN_DMISS
>>> +       rlwinm  r3,r3,19,25,29 /* Get Address bits 19:15 */
>>> +       lis     r2,lrw@ha       /* Search index in lrw[] */
>>> +       addi    r2,r2,lrw@l
>>> +       tophys(r2,r2)
>>> +       lwzx    r1,r3,r2       /* Get item from lrw[] */
>>> +       cmpwi   0,r1,0         /* Was it way 0 last time? */
>>
>> Why not use a bit vector since we only need one bit of information.
>> Additionally we can use a single SPRG at that point instead to keep
>> track of the LRU information.
>
> Sounds interesting. I am just learning my first steps in powerpc- 
> assembly, so
> please forgive if this is a little inefficient still. I'll try again  
> next week.

Not at all.  This has been on my todo list just not high priority so  
I'm happy to get someone to work on it and have setup already that can  
show perf differences.

I might work up a newer version w/the SPRG idea if I'm feeling up to it.

- k
David Jander March 16, 2009, 10:44 a.m. UTC | #7
On Friday 13 March 2009 16:23:15 Kumar Gala wrote:
>[...]
> >> This errata impacts a number of cores and so we should make this a
> >> CPU
> >> feature fixup rather than #ifdef code.
> >
> > It should impact only MPC5121e and probably MPC5123, but according to
> > Freescale no other processors that use this core...
>
> Not sure about that.. But the errata impacts all e300c2/c3/c4 parts.

Can someone please check if this is true? There should be errata's for all 
other parts that use one of these cores then.

> > Anyway, I'll try to investigate about how to write a "CPU feature
> > fixup",
> > I've never done that before (If you could give me a hint?)
>
> I've posted a patch that should add the CPU feature support.  This is
> only compile tested.  You'll need to try it out on real HW :)

That's a problem right now: The only useable kernel for the MPC5121e is the 
one on the 'ads5121' head from denx, and that is version 2.6.24.6. Your patch 
(v3) does not apply to that kernel, so I would have to change a few things 
before I can actually try it out:

> #define CPU_FTR_NEED_DTLB_SW_LRU       ASM_CONST(0x0000000000010000)

In 2.6.24.6 this constant is used for something else. Would it be possible to 
pick anotherone, in order to make dual-kernel patches easier to maintain for 
now?

> >>> +       mfspr   r3,SPRN_DMISS
> >>> +       rlwinm  r3,r3,19,25,29 /* Get Address bits 19:15 */
> >>> +       lis     r2,lrw@ha       /* Search index in lrw[] */
> >>> +       addi    r2,r2,lrw@l
> >>> +       tophys(r2,r2)
> >>> +       lwzx    r1,r3,r2       /* Get item from lrw[] */
> >>> +       cmpwi   0,r1,0         /* Was it way 0 last time? */
> >>
> >> Why not use a bit vector since we only need one bit of information.
> >> Additionally we can use a single SPRG at that point instead to keep
> >> track of the LRU information.
> >
> > Sounds interesting. I am just learning my first steps in powerpc-
> > assembly, so
> > please forgive if this is a little inefficient still. I'll try again
> > next week.
>
> Not at all.  This has been on my todo list just not high priority so
> I'm happy to get someone to work on it and have setup already that can
> show perf differences.

On your Todo list? Does that mean you know about another processor that has 
the same problem?

> I might work up a newer version w/the SPRG idea if I'm feeling up to it.

Do you mean it is possible to just pick an SPRG that will be used only by this 
handler and make sure no other piece of software will touch it? Would be 
great. Is there a way of knowing which SPRG's are used by linux? I read in 
the e300 core-RM, that SPRG4...7 are unique to this iteration of the G2 
anyway, so one of those might be a good candidate?

I have quite a lot of work pressure right now, and unfortunately very little 
time I can dedicate to this. Given the fact that I also cannot test patches 
for mainline, because MPC5121e support is just not complete enough yet, do 
you agree if I modify my own patch (with ifdef's instead of CPU_FTR...) to 
give you feedback on performance impacts, while you implement it as CPU_FTR 
afterwards for mainline? That way I can avoid doing double work, and spend 
more time on testing it actually....
If you agree, I'll start hacking away on the SPRG version immediately :-)

Best regards,
Kumar Gala March 16, 2009, 11:41 a.m. UTC | #8
On Mar 16, 2009, at 5:44 AM, David Jander wrote:

> On Friday 13 March 2009 16:23:15 Kumar Gala wrote:
>> [...]
>>>> This errata impacts a number of cores and so we should make this a
>>>> CPU
>>>> feature fixup rather than #ifdef code.
>>>
>>> It should impact only MPC5121e and probably MPC5123, but according  
>>> to
>>> Freescale no other processors that use this core...
>>
>> Not sure about that.. But the errata impacts all e300c2/c3/c4 parts.
>
> Can someone please check if this is true? There should be errata's  
> for all
> other parts that use one of these cores then.
>
>>> Anyway, I'll try to investigate about how to write a "CPU feature
>>> fixup",
>>> I've never done that before (If you could give me a hint?)
>>
>> I've posted a patch that should add the CPU feature support.  This is
>> only compile tested.  You'll need to try it out on real HW :)
>
> That's a problem right now: The only useable kernel for the MPC5121e  
> is the
> one on the 'ads5121' head from denx, and that is version 2.6.24.6.  
> Your patch
> (v3) does not apply to that kernel, so I would have to change a few  
> things
> before I can actually try it out:
>
>> #define CPU_FTR_NEED_DTLB_SW_LRU       ASM_CONST(0x0000000000010000)
>
> In 2.6.24.6 this constant is used for something else. Would it be  
> possible to
> pick anotherone, in order to make dual-kernel patches easier to  
> maintain for
> now?
>
>>>>> +       mfspr   r3,SPRN_DMISS
>>>>> +       rlwinm  r3,r3,19,25,29 /* Get Address bits 19:15 */
>>>>> +       lis     r2,lrw@ha       /* Search index in lrw[] */
>>>>> +       addi    r2,r2,lrw@l
>>>>> +       tophys(r2,r2)
>>>>> +       lwzx    r1,r3,r2       /* Get item from lrw[] */
>>>>> +       cmpwi   0,r1,0         /* Was it way 0 last time? */
>>>>
>>>> Why not use a bit vector since we only need one bit of information.
>>>> Additionally we can use a single SPRG at that point instead to keep
>>>> track of the LRU information.
>>>
>>> Sounds interesting. I am just learning my first steps in powerpc-
>>> assembly, so
>>> please forgive if this is a little inefficient still. I'll try again
>>> next week.
>>
>> Not at all.  This has been on my todo list just not high priority so
>> I'm happy to get someone to work on it and have setup already that  
>> can
>> show perf differences.
>
> On your Todo list? Does that mean you know about another processor  
> that has
> the same problem?

Yes, I work at Freescale and am aware that the errata impacts all c2/ 
c3/c4 class chips.

>> I might work up a newer version w/the SPRG idea if I'm feeling up  
>> to it.
>
> Do you mean it is possible to just pick an SPRG that will be used  
> only by this
> handler and make sure no other piece of software will touch it?  
> Would be
> great. Is there a way of knowing which SPRG's are used by linux? I  
> read in
> the e300 core-RM, that SPRG4...7 are unique to this iteration of the  
> G2
> anyway, so one of those might be a good candidate?

I was thinking one from SPRG4..7 should be fine since Linux doesn't  
use them for anything.

We should try this w/SPRG & w/memory to see if we notice any perf  
difference.

> I have quite a lot of work pressure right now, and unfortunately  
> very little
> time I can dedicate to this. Given the fact that I also cannot test  
> patches
> for mainline, because MPC5121e support is just not complete enough  
> yet, do
> you agree if I modify my own patch (with ifdef's instead of  
> CPU_FTR...) to
> give you feedback on performance impacts, while you implement it as  
> CPU_FTR
> afterwards for mainline? That way I can avoid doing double work, and  
> spend
> more time on testing it actually....
> If you agree, I'll start hacking away on the SPRG version  
> immediately :-)

I think that's fine.  The CPU feature bit is minor to deal with and  
Ben has stated he wants it to be a MMU feature which is something new  
as well.  It should be easy to convert the CPU_FTR wrapping into  
#ifdefs instead.

I have a few other patches related to this that I am not sure if they  
will apply to the 2.6.24.6 tree you are based on.

- k
diff mbox

Patch

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 0f4fac5..a88b3aa 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -540,6 +540,10 @@  DataLoadTLBMiss:
  * r2: ptr to linux-style pte
  * r3: scratch
  */
+#ifdef CONFIG_PPC_MPC512x
+       b      TlbWo    /* Code for TLB-errata workaround doesn't fit here */
+RFTlbWo:
+#endif
        mfctr   r0
        /* Get PTE (linux-style) and check access */
        mfspr   r3,SPRN_DMISS
@@ -612,6 +616,31 @@  DataStoreTLBMiss:
  * r2: ptr to linux-style pte
  * r3: scratch
  */
+#ifdef CONFIG_PPC_MPC512x
+/* MPC512x: workaround for errata in die M36P and earlier:
+ * Implement LRW for TLB way.
+ */
+       mfspr   r3,SPRN_DMISS
+       rlwinm  r3,r3,19,25,29 /* Get Address bits 19:15 */
+       lis     r2,lrw@ha       /* Search index in lrw[] */
+       addi    r2,r2,lrw@l
+       tophys(r2,r2)
+       lwzx    r1,r3,r2       /* Get item from lrw[] */
+       cmpwi   0,r1,0         /* Was it way 0 last time? */
+       beq-    0,113f         /* Then goto 113: */
+
+       mfspr   r1,SPRN_SRR1
+       rlwinm  r1,r1,0,15,13  /* Mask out SRR1[WAY] */
+       mtspr   SPRN_SRR1,r1
+
+       li      r0,0
+       stwx    r0,r3,r2       /* Make lrw[] entry 0 */
+       b       114f
+113:
+       li      r0,1
+       stwx    r0,r3,r2       /* Make lrw[] entry 1 */
+114:
+#endif
        mfctr   r0
        /* Get PTE (linux-style) and check access */
        mfspr   r3,SPRN_DMISS
@@ -688,6 +717,34 @@  DataStoreTLBMiss:
        .globl mol_trampoline
        .set mol_trampoline, i0x2f00

+#ifdef CONFIG_PPC_MPC512x
+TlbWo:
+/* MPC512x: workaround for errata in die M36P and earlier:
+ * Implement LRW for TLB way.
+ */
+       mfspr   r3,SPRN_DMISS
+       rlwinm  r3,r3,19,25,29 /* Get Address bits 19:15 */
+       lis     r2,lrw@ha       /* Search index in lrw[] */
+       addi    r2,r2,lrw@l
+       tophys(r2,r2)
+       lwzx    r1,r3,r2       /* Get item from lrw[] */
+       cmpwi   0,r1,0         /* Was it way 0 last time? */
+       beq-    0,113f         /* Then goto 113: */
+
+       mfspr   r1,SPRN_SRR1
+       rlwinm  r1,r1,0,15,13  /* Mask out SRR1[WAY] */
+       mtspr   SPRN_SRR1,r1
+
+       li      r0,0
+       stwx    r0,r3,r2       /* Make lrw[] entry 0 */
+       b       114f
+113:
+       li      r0,1
+       stwx    r0,r3,r2       /* Make lrw[] entry 1 */
+114:
+       b       RFTlbWo
+#endif
+
        . = 0x3000

 AltiVecUnavailable:
@@ -1321,6 +1378,14 @@  intercept_table:
        .long 0, 0, 0, 0, 0, 0, 0, 0
        .long 0, 0, 0, 0, 0, 0, 0, 0
        .long 0, 0, 0, 0, 0, 0, 0, 0
+
+#ifdef CONFIG_PPC_MPC512x
+lrw:
+       .long 0, 0, 0, 0, 0, 0, 0, 0
+       .long 0, 0, 0, 0, 0, 0, 0, 0
+       .long 0, 0, 0, 0, 0, 0, 0, 0
+       .long 0, 0, 0, 0, 0, 0, 0, 0
+#endif

 /* Room for two PTE pointers, usually the kernel and current user pointers
  * to their respective root page table.