[v7,7/9] powerpc/pseries: Dump the SLB contents on SLB MCE errors.

Message ID 153365145460.14256.11932687379471923123.stgit@jupiter.in.ibm.com
State Changes Requested
Headers show
Series
  • powerpc/pseries: Machine check handler improvements.
Related show

Checks

Context Check Description
snowpatch_ozlabs/apply_patch fail Failed to apply to any branch

Commit Message

Mahesh J Salgaonkar Aug. 7, 2018, 2:17 p.m.
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

If we get a machine check exception due to SLB errors then dump the
current SLB contents, which will be very helpful in debugging the
root cause of SLB errors. Introduce an exclusive buffer per cpu to hold
faulty SLB entries. In real mode the mce handler saves the old SLB contents
into this buffer, accessible through the paca, and prints them out later in
virtual mode.

With this patch the console will log SLB contents like below on SLB MCE
errors:

[  507.297236] SLB contents of cpu 0x1
[  507.297237] Last SLB entry inserted at slot 16
[  507.297238] 00 c000000008000000 400ea1b217000500
[  507.297239]   1T  ESID=   c00000  VSID=      ea1b217 LLP:100
[  507.297240] 01 d000000008000000 400d43642f000510
[  507.297242]   1T  ESID=   d00000  VSID=      d43642f LLP:110
[  507.297243] 11 f000000008000000 400a86c85f000500
[  507.297244]   1T  ESID=   f00000  VSID=      a86c85f LLP:100
[  507.297245] 12 00007f0008000000 4008119624000d90
[  507.297246]   1T  ESID=       7f  VSID=      8119624 LLP:110
[  507.297247] 13 0000000018000000 00092885f5150d90
[  507.297247]  256M ESID=        1  VSID=   92885f5150 LLP:110
[  507.297248] 14 0000010008000000 4009e7cb50000d90
[  507.297249]   1T  ESID=        1  VSID=      9e7cb50 LLP:110
[  507.297250] 15 d000000008000000 400d43642f000510
[  507.297251]   1T  ESID=   d00000  VSID=      d43642f LLP:110
[  507.297252] 16 d000000008000000 400d43642f000510
[  507.297253]   1T  ESID=   d00000  VSID=      d43642f LLP:110
[  507.297253] ----------------------------------
[  507.297254] SLB cache ptr value = 3
[  507.297254] Valid SLB cache entries:
[  507.297255] 00 EA[0-35]=    7f000
[  507.297256] 01 EA[0-35]=        1
[  507.297257] 02 EA[0-35]=     1000
[  507.297257] Rest of SLB cache entries:
[  507.297258] 03 EA[0-35]=    7f000
[  507.297258] 04 EA[0-35]=        1
[  507.297259] 05 EA[0-35]=     1000
[  507.297260] 06 EA[0-35]=       12
[  507.297260] 07 EA[0-35]=    7f000

Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---

Changes in V7:
- Print slb cache ptr value and slb cache data
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |    7 ++
 arch/powerpc/include/asm/paca.h               |    4 +
 arch/powerpc/mm/slb.c                         |   73 +++++++++++++++++++++++++
 arch/powerpc/platforms/pseries/ras.c          |   10 +++
 arch/powerpc/platforms/pseries/setup.c        |   10 +++
 5 files changed, 103 insertions(+), 1 deletion(-)

Comments

Michael Ellerman Aug. 9, 2018, 1:05 a.m. | #1
Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> writes:

> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index 7f22929ce915..233d25ff6f64 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -254,6 +254,10 @@ struct paca_struct {
>  #endif
>  #ifdef CONFIG_PPC_PSERIES
>  	u8 *mce_data_buf;		/* buffer to hold per cpu rtas errlog */
> +
> +	/* Capture SLB related old contents in MCE handler. */
> +	struct slb_entry *mce_faulty_slbs;
> +	u16 slb_save_cache_ptr;
>  #endif /* CONFIG_PPC_PSERIES */

             ^^^^^^^^^^^^^^^^^

> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
> index e89f675f1b5e..16a53689ffd4 100644
> --- a/arch/powerpc/mm/slb.c
> +++ b/arch/powerpc/mm/slb.c
> @@ -151,6 +151,79 @@ void slb_flush_and_rebolt_realmode(void)
>  	get_paca()->slb_cache_ptr = 0;
>  }
>  
> +void slb_save_contents(struct slb_entry *slb_ptr)
> +{
> +	int i;
> +	unsigned long e, v;
> +
> +	/* Save slb_cache_ptr value. */
> +	get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;

This isn't inside CONFIG_PPC_PSERIES which breaks lots of configs, eg
powernv.

  arch/powerpc/mm/slb.c:160:12: error: 'struct paca_struct' has no member named 'slb_save_cache_ptr'
  arch/powerpc/mm/slb.c:218:27: error: 'struct paca_struct' has no member named 'slb_save_cache_ptr'
  arch/powerpc/mm/slb.c:216:49: error: 'struct paca_struct' has no member named 'slb_save_cache_ptr'

http://kisskb.ozlabs.ibm.com/kisskb/head/219f20e490add009194d94fdeb480da2e385f1c6/

cheers
Mahesh J Salgaonkar Aug. 10, 2018, 10:32 a.m. | #2
On 08/09/2018 06:35 AM, Michael Ellerman wrote:
> Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> writes:
> 
>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>> index 7f22929ce915..233d25ff6f64 100644
>> --- a/arch/powerpc/include/asm/paca.h
>> +++ b/arch/powerpc/include/asm/paca.h
>> @@ -254,6 +254,10 @@ struct paca_struct {
>>  #endif
>>  #ifdef CONFIG_PPC_PSERIES
>>  	u8 *mce_data_buf;		/* buffer to hold per cpu rtas errlog */
>> +
>> +	/* Capture SLB related old contents in MCE handler. */
>> +	struct slb_entry *mce_faulty_slbs;
>> +	u16 slb_save_cache_ptr;
>>  #endif /* CONFIG_PPC_PSERIES */
> 
>              ^^^^^^^^^^^^^^^^^

I will pull that out of CONFIG_PPC_PSERIES.

> 
>> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
>> index e89f675f1b5e..16a53689ffd4 100644
>> --- a/arch/powerpc/mm/slb.c
>> +++ b/arch/powerpc/mm/slb.c
>> @@ -151,6 +151,79 @@ void slb_flush_and_rebolt_realmode(void)
>>  	get_paca()->slb_cache_ptr = 0;
>>  }
>>  
>> +void slb_save_contents(struct slb_entry *slb_ptr)
>> +{
>> +	int i;
>> +	unsigned long e, v;
>> +
>> +	/* Save slb_cache_ptr value. */
>> +	get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
> 
> This isn't inside CONFIG_PPC_PSERIES which breaks lots of configs, eg
> powernv.
> 
>   arch/powerpc/mm/slb.c:160:12: error: 'struct paca_struct' has no member named 'slb_save_cache_ptr'
>   arch/powerpc/mm/slb.c:218:27: error: 'struct paca_struct' has no member named 'slb_save_cache_ptr'
>   arch/powerpc/mm/slb.c:216:49: error: 'struct paca_struct' has no member named 'slb_save_cache_ptr'
> 
> http://kisskb.ozlabs.ibm.com/kisskb/head/219f20e490add009194d94fdeb480da2e385f1c6/
> 
> cheers
> 

Ouch.. my bad. Will fix it.

Thanks,
-Mahesh.
Mahesh J Salgaonkar Aug. 10, 2018, 10:49 a.m. | #3
On 08/10/2018 04:02 PM, Mahesh Jagannath Salgaonkar wrote:
> On 08/09/2018 06:35 AM, Michael Ellerman wrote:
>> Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> writes:
>>
>>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>>> index 7f22929ce915..233d25ff6f64 100644
>>> --- a/arch/powerpc/include/asm/paca.h
>>> +++ b/arch/powerpc/include/asm/paca.h
>>> @@ -254,6 +254,10 @@ struct paca_struct {
>>>  #endif
>>>  #ifdef CONFIG_PPC_PSERIES
>>>  	u8 *mce_data_buf;		/* buffer to hold per cpu rtas errlog */
>>> +
>>> +	/* Capture SLB related old contents in MCE handler. */
>>> +	struct slb_entry *mce_faulty_slbs;
>>> +	u16 slb_save_cache_ptr;
>>>  #endif /* CONFIG_PPC_PSERIES */
>>
>>              ^^^^^^^^^^^^^^^^^
> 
> I will pull that out of CONFIG_PPC_PSERIES.

I mean I will pull 'mce_faulty_slbs' and 'slb_save_cache_ptr' out and put
them under CONFIG_PPC_BOOK3S_64.

-Mahesh.

> 
>>
>>> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
>>> index e89f675f1b5e..16a53689ffd4 100644
>>> --- a/arch/powerpc/mm/slb.c
>>> +++ b/arch/powerpc/mm/slb.c
>>> @@ -151,6 +151,79 @@ void slb_flush_and_rebolt_realmode(void)
>>>  	get_paca()->slb_cache_ptr = 0;
>>>  }
>>>  
>>> +void slb_save_contents(struct slb_entry *slb_ptr)
>>> +{
>>> +	int i;
>>> +	unsigned long e, v;
>>> +
>>> +	/* Save slb_cache_ptr value. */
>>> +	get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
>>
>> This isn't inside CONFIG_PPC_PSERIES which breaks lots of configs, eg
>> powernv.
>>
>>   arch/powerpc/mm/slb.c:160:12: error: 'struct paca_struct' has no member named 'slb_save_cache_ptr'
>>   arch/powerpc/mm/slb.c:218:27: error: 'struct paca_struct' has no member named 'slb_save_cache_ptr'
>>   arch/powerpc/mm/slb.c:216:49: error: 'struct paca_struct' has no member named 'slb_save_cache_ptr'
>>
>> http://kisskb.ozlabs.ibm.com/kisskb/head/219f20e490add009194d94fdeb480da2e385f1c6/
>>
>> cheers
>>
> 
> Ouch.. my bad. Will fix it.
> 
> Thanks,
> -Mahesh.
>
Nicholas Piggin Aug. 11, 2018, 4:33 a.m. | #4
On Tue, 07 Aug 2018 19:47:39 +0530
Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:

> From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> 
> If we get a machine check exceptions due to SLB errors then dump the
> current SLB contents which will be very much helpful in debugging the
> root cause of SLB errors. Introduce an exclusive buffer per cpu to hold
> faulty SLB entries. In real mode mce handler saves the old SLB contents
> into this buffer accessible through paca and print it out later in virtual
> mode.
> 
> With this patch the console will log SLB contents like below on SLB MCE
> errors:
> 
> [  507.297236] SLB contents of cpu 0x1
> [  507.297237] Last SLB entry inserted at slot 16
> [  507.297238] 00 c000000008000000 400ea1b217000500
> [  507.297239]   1T  ESID=   c00000  VSID=      ea1b217 LLP:100
> [  507.297240] 01 d000000008000000 400d43642f000510
> [  507.297242]   1T  ESID=   d00000  VSID=      d43642f LLP:110
> [  507.297243] 11 f000000008000000 400a86c85f000500
> [  507.297244]   1T  ESID=   f00000  VSID=      a86c85f LLP:100
> [  507.297245] 12 00007f0008000000 4008119624000d90
> [  507.297246]   1T  ESID=       7f  VSID=      8119624 LLP:110
> [  507.297247] 13 0000000018000000 00092885f5150d90
> [  507.297247]  256M ESID=        1  VSID=   92885f5150 LLP:110
> [  507.297248] 14 0000010008000000 4009e7cb50000d90
> [  507.297249]   1T  ESID=        1  VSID=      9e7cb50 LLP:110
> [  507.297250] 15 d000000008000000 400d43642f000510
> [  507.297251]   1T  ESID=   d00000  VSID=      d43642f LLP:110
> [  507.297252] 16 d000000008000000 400d43642f000510
> [  507.297253]   1T  ESID=   d00000  VSID=      d43642f LLP:110
> [  507.297253] ----------------------------------
> [  507.297254] SLB cache ptr value = 3
> [  507.297254] Valid SLB cache entries:
> [  507.297255] 00 EA[0-35]=    7f000
> [  507.297256] 01 EA[0-35]=        1
> [  507.297257] 02 EA[0-35]=     1000
> [  507.297257] Rest of SLB cache entries:
> [  507.297258] 03 EA[0-35]=    7f000
> [  507.297258] 04 EA[0-35]=        1
> [  507.297259] 05 EA[0-35]=     1000
> [  507.297260] 06 EA[0-35]=       12
> [  507.297260] 07 EA[0-35]=    7f000
> 
> Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> ---
> 
> Changes in V7:
> - Print slb cache ptr value and slb cache data
> ---
>  arch/powerpc/include/asm/book3s/64/mmu-hash.h |    7 ++
>  arch/powerpc/include/asm/paca.h               |    4 +
>  arch/powerpc/mm/slb.c                         |   73 +++++++++++++++++++++++++
>  arch/powerpc/platforms/pseries/ras.c          |   10 +++
>  arch/powerpc/platforms/pseries/setup.c        |   10 +++
>  5 files changed, 103 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> index cc00a7088cf3..5a3fe282076d 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> @@ -485,9 +485,16 @@ static inline void hpte_init_pseries(void) { }
>  
>  extern void hpte_init_native(void);
>  
> +struct slb_entry {
> +	u64	esid;
> +	u64	vsid;
> +};
> +
>  extern void slb_initialize(void);
>  extern void slb_flush_and_rebolt(void);
>  extern void slb_flush_and_rebolt_realmode(void);
> +extern void slb_save_contents(struct slb_entry *slb_ptr);
> +extern void slb_dump_contents(struct slb_entry *slb_ptr);
>  
>  extern void slb_vmalloc_update(void);
>  extern void slb_set_size(u16 size);
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index 7f22929ce915..233d25ff6f64 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -254,6 +254,10 @@ struct paca_struct {
>  #endif
>  #ifdef CONFIG_PPC_PSERIES
>  	u8 *mce_data_buf;		/* buffer to hold per cpu rtas errlog */
> +
> +	/* Capture SLB related old contents in MCE handler. */
> +	struct slb_entry *mce_faulty_slbs;
> +	u16 slb_save_cache_ptr;
>  #endif /* CONFIG_PPC_PSERIES */
>  } ____cacheline_aligned;
>  
> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
> index e89f675f1b5e..16a53689ffd4 100644
> --- a/arch/powerpc/mm/slb.c
> +++ b/arch/powerpc/mm/slb.c
> @@ -151,6 +151,79 @@ void slb_flush_and_rebolt_realmode(void)
>  	get_paca()->slb_cache_ptr = 0;
>  }
>  
> +void slb_save_contents(struct slb_entry *slb_ptr)
> +{
> +	int i;
> +	unsigned long e, v;
> +
> +	/* Save slb_cache_ptr value. */
> +	get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;

What's the point of saving this?

> +
> +	if (!slb_ptr)
> +		return;

Can this ever happen?

> +
> +	for (i = 0; i < mmu_slb_size; i++) {
> +		asm volatile("slbmfee  %0,%1" : "=r" (e) : "r" (i));
> +		asm volatile("slbmfev  %0,%1" : "=r" (v) : "r" (i));

Does the UM say these instructions can cause machine checks if the SLB
is corrupted? It talks about mfslb instruction causing MCE, but there
seems to be no such instruction so I wonder if that's a typo for slbmf?

Seems like a parity error in the SLB should cause a MCE, at least,
because it can't guarantee valid data for the instruction in that case
(multi-hit may be different because you aren't searching by EA).

You could limit slb saving to a single level of recursion to avoid
the problem.

Thanks,
Nick
Mahesh J Salgaonkar Aug. 13, 2018, 4:17 a.m. | #5
On 08/11/2018 10:03 AM, Nicholas Piggin wrote:
> On Tue, 07 Aug 2018 19:47:39 +0530
> Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:
> 
>> From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>>
>> If we get a machine check exceptions due to SLB errors then dump the
>> current SLB contents which will be very much helpful in debugging the
>> root cause of SLB errors. Introduce an exclusive buffer per cpu to hold
>> faulty SLB entries. In real mode mce handler saves the old SLB contents
>> into this buffer accessible through paca and print it out later in virtual
>> mode.
>>
>> With this patch the console will log SLB contents like below on SLB MCE
>> errors:
>>
>> [  507.297236] SLB contents of cpu 0x1
>> [  507.297237] Last SLB entry inserted at slot 16
>> [  507.297238] 00 c000000008000000 400ea1b217000500
>> [  507.297239]   1T  ESID=   c00000  VSID=      ea1b217 LLP:100
>> [  507.297240] 01 d000000008000000 400d43642f000510
>> [  507.297242]   1T  ESID=   d00000  VSID=      d43642f LLP:110
>> [  507.297243] 11 f000000008000000 400a86c85f000500
>> [  507.297244]   1T  ESID=   f00000  VSID=      a86c85f LLP:100
>> [  507.297245] 12 00007f0008000000 4008119624000d90
>> [  507.297246]   1T  ESID=       7f  VSID=      8119624 LLP:110
>> [  507.297247] 13 0000000018000000 00092885f5150d90
>> [  507.297247]  256M ESID=        1  VSID=   92885f5150 LLP:110
>> [  507.297248] 14 0000010008000000 4009e7cb50000d90
>> [  507.297249]   1T  ESID=        1  VSID=      9e7cb50 LLP:110
>> [  507.297250] 15 d000000008000000 400d43642f000510
>> [  507.297251]   1T  ESID=   d00000  VSID=      d43642f LLP:110
>> [  507.297252] 16 d000000008000000 400d43642f000510
>> [  507.297253]   1T  ESID=   d00000  VSID=      d43642f LLP:110
>> [  507.297253] ----------------------------------
>> [  507.297254] SLB cache ptr value = 3
>> [  507.297254] Valid SLB cache entries:
>> [  507.297255] 00 EA[0-35]=    7f000
>> [  507.297256] 01 EA[0-35]=        1
>> [  507.297257] 02 EA[0-35]=     1000
>> [  507.297257] Rest of SLB cache entries:
>> [  507.297258] 03 EA[0-35]=    7f000
>> [  507.297258] 04 EA[0-35]=        1
>> [  507.297259] 05 EA[0-35]=     1000
>> [  507.297260] 06 EA[0-35]=       12
>> [  507.297260] 07 EA[0-35]=    7f000
>>
>> Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>> Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
>> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>> ---
>>
>> Changes in V7:
>> - Print slb cache ptr value and slb cache data
>> ---
>>  arch/powerpc/include/asm/book3s/64/mmu-hash.h |    7 ++
>>  arch/powerpc/include/asm/paca.h               |    4 +
>>  arch/powerpc/mm/slb.c                         |   73 +++++++++++++++++++++++++
>>  arch/powerpc/platforms/pseries/ras.c          |   10 +++
>>  arch/powerpc/platforms/pseries/setup.c        |   10 +++
>>  5 files changed, 103 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>> index cc00a7088cf3..5a3fe282076d 100644
>> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>> @@ -485,9 +485,16 @@ static inline void hpte_init_pseries(void) { }
>>  
>>  extern void hpte_init_native(void);
>>  
>> +struct slb_entry {
>> +	u64	esid;
>> +	u64	vsid;
>> +};
>> +
>>  extern void slb_initialize(void);
>>  extern void slb_flush_and_rebolt(void);
>>  extern void slb_flush_and_rebolt_realmode(void);
>> +extern void slb_save_contents(struct slb_entry *slb_ptr);
>> +extern void slb_dump_contents(struct slb_entry *slb_ptr);
>>  
>>  extern void slb_vmalloc_update(void);
>>  extern void slb_set_size(u16 size);
>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>> index 7f22929ce915..233d25ff6f64 100644
>> --- a/arch/powerpc/include/asm/paca.h
>> +++ b/arch/powerpc/include/asm/paca.h
>> @@ -254,6 +254,10 @@ struct paca_struct {
>>  #endif
>>  #ifdef CONFIG_PPC_PSERIES
>>  	u8 *mce_data_buf;		/* buffer to hold per cpu rtas errlog */
>> +
>> +	/* Capture SLB related old contents in MCE handler. */
>> +	struct slb_entry *mce_faulty_slbs;
>> +	u16 slb_save_cache_ptr;
>>  #endif /* CONFIG_PPC_PSERIES */
>>  } ____cacheline_aligned;
>>  
>> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
>> index e89f675f1b5e..16a53689ffd4 100644
>> --- a/arch/powerpc/mm/slb.c
>> +++ b/arch/powerpc/mm/slb.c
>> @@ -151,6 +151,79 @@ void slb_flush_and_rebolt_realmode(void)
>>  	get_paca()->slb_cache_ptr = 0;
>>  }
>>  
>> +void slb_save_contents(struct slb_entry *slb_ptr)
>> +{
>> +	int i;
>> +	unsigned long e, v;
>> +
>> +	/* Save slb_cache_ptr value. */
>> +	get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
> 
> What's the point of saving this?

This is to know how many valid cache entries were present at the time of
the SLB multi-hit. We use this index value while dumping the slb cache entries.

> 
>> +
>> +	if (!slb_ptr)
>> +		return;
> 
> Can this ever happen?

Maybe never. We allocate the memory at a very early stage, but I added
this just as a sanity check.

> 
>> +
>> +	for (i = 0; i < mmu_slb_size; i++) {
>> +		asm volatile("slbmfee  %0,%1" : "=r" (e) : "r" (i));
>> +		asm volatile("slbmfev  %0,%1" : "=r" (v) : "r" (i));
> 
> Does the UM say these instructions can cause machine checks if the SLB
> is corrupted? It talks about mfslb instruction causing MCE, but there
> seems to be no such instruction so I wonder if that's a typo for slbmf?
> 
> Seems like a parity error in the SLB should cause a MCE, at least,
> because it can't guarantee valid data for the instruction in that case
> (multi-hit may be different because you aren't searching by EA).
> 
> You could limit slb saving to a single level of recursion to avoid
> the problem.

Yeah, we could do this OR restrict slb saving to SLB multi-hit only.
Parity errors are hardware errors anyway. If the parity error is transient
then saving the SLBs may not trigger another MCE. In that case the old SLB
contents would look OK even if we dump them on the console. What do you say?

> 
> Thanks,
> Nick
>
Nicholas Piggin Aug. 13, 2018, 2:27 p.m. | #6
On Mon, 13 Aug 2018 09:47:04 +0530
Mahesh Jagannath Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:

> On 08/11/2018 10:03 AM, Nicholas Piggin wrote:
> > On Tue, 07 Aug 2018 19:47:39 +0530
> > Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:
> >   
> >> From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> >>
> >> If we get a machine check exceptions due to SLB errors then dump the
> >> current SLB contents which will be very much helpful in debugging the
> >> root cause of SLB errors. Introduce an exclusive buffer per cpu to hold
> >> faulty SLB entries. In real mode mce handler saves the old SLB contents
> >> into this buffer accessible through paca and print it out later in virtual
> >> mode.
> >>
> >> With this patch the console will log SLB contents like below on SLB MCE
> >> errors:
> >>
> >> [  507.297236] SLB contents of cpu 0x1
> >> [  507.297237] Last SLB entry inserted at slot 16
> >> [  507.297238] 00 c000000008000000 400ea1b217000500
> >> [  507.297239]   1T  ESID=   c00000  VSID=      ea1b217 LLP:100
> >> [  507.297240] 01 d000000008000000 400d43642f000510
> >> [  507.297242]   1T  ESID=   d00000  VSID=      d43642f LLP:110
> >> [  507.297243] 11 f000000008000000 400a86c85f000500
> >> [  507.297244]   1T  ESID=   f00000  VSID=      a86c85f LLP:100
> >> [  507.297245] 12 00007f0008000000 4008119624000d90
> >> [  507.297246]   1T  ESID=       7f  VSID=      8119624 LLP:110
> >> [  507.297247] 13 0000000018000000 00092885f5150d90
> >> [  507.297247]  256M ESID=        1  VSID=   92885f5150 LLP:110
> >> [  507.297248] 14 0000010008000000 4009e7cb50000d90
> >> [  507.297249]   1T  ESID=        1  VSID=      9e7cb50 LLP:110
> >> [  507.297250] 15 d000000008000000 400d43642f000510
> >> [  507.297251]   1T  ESID=   d00000  VSID=      d43642f LLP:110
> >> [  507.297252] 16 d000000008000000 400d43642f000510
> >> [  507.297253]   1T  ESID=   d00000  VSID=      d43642f LLP:110
> >> [  507.297253] ----------------------------------
> >> [  507.297254] SLB cache ptr value = 3
> >> [  507.297254] Valid SLB cache entries:
> >> [  507.297255] 00 EA[0-35]=    7f000
> >> [  507.297256] 01 EA[0-35]=        1
> >> [  507.297257] 02 EA[0-35]=     1000
> >> [  507.297257] Rest of SLB cache entries:
> >> [  507.297258] 03 EA[0-35]=    7f000
> >> [  507.297258] 04 EA[0-35]=        1
> >> [  507.297259] 05 EA[0-35]=     1000
> >> [  507.297260] 06 EA[0-35]=       12
> >> [  507.297260] 07 EA[0-35]=    7f000
> >>
> >> Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> >> Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
> >> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> >> ---
> >>
> >> Changes in V7:
> >> - Print slb cache ptr value and slb cache data
> >> ---
> >>  arch/powerpc/include/asm/book3s/64/mmu-hash.h |    7 ++
> >>  arch/powerpc/include/asm/paca.h               |    4 +
> >>  arch/powerpc/mm/slb.c                         |   73 +++++++++++++++++++++++++
> >>  arch/powerpc/platforms/pseries/ras.c          |   10 +++
> >>  arch/powerpc/platforms/pseries/setup.c        |   10 +++
> >>  5 files changed, 103 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> >> index cc00a7088cf3..5a3fe282076d 100644
> >> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> >> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> >> @@ -485,9 +485,16 @@ static inline void hpte_init_pseries(void) { }
> >>  
> >>  extern void hpte_init_native(void);
> >>  
> >> +struct slb_entry {
> >> +	u64	esid;
> >> +	u64	vsid;
> >> +};
> >> +
> >>  extern void slb_initialize(void);
> >>  extern void slb_flush_and_rebolt(void);
> >>  extern void slb_flush_and_rebolt_realmode(void);
> >> +extern void slb_save_contents(struct slb_entry *slb_ptr);
> >> +extern void slb_dump_contents(struct slb_entry *slb_ptr);
> >>  
> >>  extern void slb_vmalloc_update(void);
> >>  extern void slb_set_size(u16 size);
> >> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> >> index 7f22929ce915..233d25ff6f64 100644
> >> --- a/arch/powerpc/include/asm/paca.h
> >> +++ b/arch/powerpc/include/asm/paca.h
> >> @@ -254,6 +254,10 @@ struct paca_struct {
> >>  #endif
> >>  #ifdef CONFIG_PPC_PSERIES
> >>  	u8 *mce_data_buf;		/* buffer to hold per cpu rtas errlog */
> >> +
> >> +	/* Capture SLB related old contents in MCE handler. */
> >> +	struct slb_entry *mce_faulty_slbs;
> >> +	u16 slb_save_cache_ptr;
> >>  #endif /* CONFIG_PPC_PSERIES */
> >>  } ____cacheline_aligned;
> >>  
> >> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
> >> index e89f675f1b5e..16a53689ffd4 100644
> >> --- a/arch/powerpc/mm/slb.c
> >> +++ b/arch/powerpc/mm/slb.c
> >> @@ -151,6 +151,79 @@ void slb_flush_and_rebolt_realmode(void)
> >>  	get_paca()->slb_cache_ptr = 0;
> >>  }
> >>  
> >> +void slb_save_contents(struct slb_entry *slb_ptr)
> >> +{
> >> +	int i;
> >> +	unsigned long e, v;
> >> +
> >> +	/* Save slb_cache_ptr value. */
> >> +	get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;  
> > 
> > What's the point of saving this?  
> 
> This is to know how many valid cache entries were present at the time of
> SLB mutlihit. We use this index value while dumping the slb cahce entries.

Oh I see you're dumping that thing as well. I don't know if that's
worth doing, it just gives you the first 8 SLB entries installed but
you already have those (or they're overwritten and irrelevant).

> 
> >   
> >> +
> >> +	if (!slb_ptr)
> >> +		return;  
> > 
> > Can this ever happen?  
> 
> May be Never. We allocate the memory at very early stage. But just added
> as sanity check.

Okay if you think it's needed.

> 
> >   
> >> +
> >> +	for (i = 0; i < mmu_slb_size; i++) {
> >> +		asm volatile("slbmfee  %0,%1" : "=r" (e) : "r" (i));
> >> +		asm volatile("slbmfev  %0,%1" : "=r" (v) : "r" (i));  
> > 
> > Does the UM say these instructions can cause machine checks if the SLB
> > is corrupted? It talks about mfslb instruction causing MCE, but there
> > seems to be no such instruction so I wonder if that's a typo for slbmf?
> > 
> > Seems like a parity error in the SLB should cause a MCE, at least,
> > because it can't guarantee valid data for the instruction in that case
> > (multi-hit may be different because you aren't searching by EA).
> > 
> > You could limit slb saving to a single level of recursion to avoid
> > the problem.  
> 
> Yeah, we could do this OR restrict slb saving only for SLB multi-hit.
> Parity errors are anyway hardware errors. If parity error is transient
> then saving of SLBs may not trigger another MCE. In that case old SLB
> content would look ok even if we dump them on console. What do you say ?

I'm not sure. A parity error I think can cause a multi hit. Can you be
sure of a software caused multi hit? Would be a good idea if you can I
think. It may be a good idea to avoid recursion as well, just in case.

Thanks,
Nick
Mahesh J Salgaonkar Aug. 14, 2018, 10:57 a.m. | #7
On 08/13/2018 07:57 PM, Nicholas Piggin wrote:
> On Mon, 13 Aug 2018 09:47:04 +0530
> Mahesh Jagannath Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:
> 
>> On 08/11/2018 10:03 AM, Nicholas Piggin wrote:
>>> On Tue, 07 Aug 2018 19:47:39 +0530
>>> Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:
>>>   
>>>> From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>>>>
>>>> If we get a machine check exceptions due to SLB errors then dump the
>>>> current SLB contents which will be very much helpful in debugging the
>>>> root cause of SLB errors. Introduce an exclusive buffer per cpu to hold
>>>> faulty SLB entries. In real mode mce handler saves the old SLB contents
>>>> into this buffer accessible through paca and print it out later in virtual
>>>> mode.
>>>>
>>>> With this patch the console will log SLB contents like below on SLB MCE
>>>> errors:
>>>>
>>>> [  507.297236] SLB contents of cpu 0x1
>>>> [  507.297237] Last SLB entry inserted at slot 16
>>>> [  507.297238] 00 c000000008000000 400ea1b217000500
>>>> [  507.297239]   1T  ESID=   c00000  VSID=      ea1b217 LLP:100
>>>> [  507.297240] 01 d000000008000000 400d43642f000510
>>>> [  507.297242]   1T  ESID=   d00000  VSID=      d43642f LLP:110
>>>> [  507.297243] 11 f000000008000000 400a86c85f000500
>>>> [  507.297244]   1T  ESID=   f00000  VSID=      a86c85f LLP:100
>>>> [  507.297245] 12 00007f0008000000 4008119624000d90
>>>> [  507.297246]   1T  ESID=       7f  VSID=      8119624 LLP:110
>>>> [  507.297247] 13 0000000018000000 00092885f5150d90
>>>> [  507.297247]  256M ESID=        1  VSID=   92885f5150 LLP:110
>>>> [  507.297248] 14 0000010008000000 4009e7cb50000d90
>>>> [  507.297249]   1T  ESID=        1  VSID=      9e7cb50 LLP:110
>>>> [  507.297250] 15 d000000008000000 400d43642f000510
>>>> [  507.297251]   1T  ESID=   d00000  VSID=      d43642f LLP:110
>>>> [  507.297252] 16 d000000008000000 400d43642f000510
>>>> [  507.297253]   1T  ESID=   d00000  VSID=      d43642f LLP:110
>>>> [  507.297253] ----------------------------------
>>>> [  507.297254] SLB cache ptr value = 3
>>>> [  507.297254] Valid SLB cache entries:
>>>> [  507.297255] 00 EA[0-35]=    7f000
>>>> [  507.297256] 01 EA[0-35]=        1
>>>> [  507.297257] 02 EA[0-35]=     1000
>>>> [  507.297257] Rest of SLB cache entries:
>>>> [  507.297258] 03 EA[0-35]=    7f000
>>>> [  507.297258] 04 EA[0-35]=        1
>>>> [  507.297259] 05 EA[0-35]=     1000
>>>> [  507.297260] 06 EA[0-35]=       12
>>>> [  507.297260] 07 EA[0-35]=    7f000
>>>>
>>>> Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>>>> Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
>>>> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>>>> ---
>>>>
>>>> Changes in V7:
>>>> - Print slb cache ptr value and slb cache data
>>>> ---
>>>>  arch/powerpc/include/asm/book3s/64/mmu-hash.h |    7 ++
>>>>  arch/powerpc/include/asm/paca.h               |    4 +
>>>>  arch/powerpc/mm/slb.c                         |   73 +++++++++++++++++++++++++
>>>>  arch/powerpc/platforms/pseries/ras.c          |   10 +++
>>>>  arch/powerpc/platforms/pseries/setup.c        |   10 +++
>>>>  5 files changed, 103 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>>>> index cc00a7088cf3..5a3fe282076d 100644
>>>> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>>>> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>>>> @@ -485,9 +485,16 @@ static inline void hpte_init_pseries(void) { }
>>>>  
>>>>  extern void hpte_init_native(void);
>>>>  
>>>> +struct slb_entry {
>>>> +	u64	esid;
>>>> +	u64	vsid;
>>>> +};
>>>> +
>>>>  extern void slb_initialize(void);
>>>>  extern void slb_flush_and_rebolt(void);
>>>>  extern void slb_flush_and_rebolt_realmode(void);
>>>> +extern void slb_save_contents(struct slb_entry *slb_ptr);
>>>> +extern void slb_dump_contents(struct slb_entry *slb_ptr);
>>>>  
>>>>  extern void slb_vmalloc_update(void);
>>>>  extern void slb_set_size(u16 size);
>>>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>>>> index 7f22929ce915..233d25ff6f64 100644
>>>> --- a/arch/powerpc/include/asm/paca.h
>>>> +++ b/arch/powerpc/include/asm/paca.h
>>>> @@ -254,6 +254,10 @@ struct paca_struct {
>>>>  #endif
>>>>  #ifdef CONFIG_PPC_PSERIES
>>>>  	u8 *mce_data_buf;		/* buffer to hold per cpu rtas errlog */
>>>> +
>>>> +	/* Capture SLB related old contents in MCE handler. */
>>>> +	struct slb_entry *mce_faulty_slbs;
>>>> +	u16 slb_save_cache_ptr;
>>>>  #endif /* CONFIG_PPC_PSERIES */
>>>>  } ____cacheline_aligned;
>>>>  
>>>> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
>>>> index e89f675f1b5e..16a53689ffd4 100644
>>>> --- a/arch/powerpc/mm/slb.c
>>>> +++ b/arch/powerpc/mm/slb.c
>>>> @@ -151,6 +151,79 @@ void slb_flush_and_rebolt_realmode(void)
>>>>  	get_paca()->slb_cache_ptr = 0;
>>>>  }
>>>>  
>>>> +void slb_save_contents(struct slb_entry *slb_ptr)
>>>> +{
>>>> +	int i;
>>>> +	unsigned long e, v;
>>>> +
>>>> +	/* Save slb_cache_ptr value. */
>>>> +	get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;  
>>>
>>> What's the point of saving this?  
>>
>> This is to know how many valid cache entries were present at the time of
>> the SLB multi-hit. We use this index value while dumping the SLB cache entries.
> 
> Oh I see you're dumping that thing as well. I don't know if that's
> worth doing, it just gives you the first 8 SLB entries installed but
> you already have those (or they're overwritten and irrelevant).

Aneesh, can you comment on this?

> 
>>
>>>   
>>>> +
>>>> +	if (!slb_ptr)
>>>> +		return;  
>>>
>>> Can this ever happen?  
>>
>> Maybe never. We allocate the memory at a very early stage, but I added it
>> as a sanity check.
> 
> Okay if you think it's needed.
> 
>>
>>>   
>>>> +
>>>> +	for (i = 0; i < mmu_slb_size; i++) {
>>>> +		asm volatile("slbmfee  %0,%1" : "=r" (e) : "r" (i));
>>>> +		asm volatile("slbmfev  %0,%1" : "=r" (v) : "r" (i));  
>>>
>>> Does the UM say these instructions can cause machine checks if the SLB
>>> is corrupted? It talks about mfslb instruction causing MCE, but there
>>> seems to be no such instruction so I wonder if that's a typo for slbmf?
>>>
>>> Seems like a parity error in the SLB should cause a MCE, at least,
>>> because it can't guarantee valid data for the instruction in that case
>>> (multi-hit may be different because you aren't searching by EA).
>>>
>>> You could limit slb saving to a single level of recursion to avoid
>>> the problem.  
>>
>> Yeah, we could do this OR restrict slb saving only for SLB multi-hit.
>> Parity errors are anyway hardware errors. If parity error is transient
>> then saving of SLBs may not trigger another MCE. In that case old SLB
>> content would look ok even if we dump them on console. What do you say ?
> 
> I'm not sure. A parity error I think can cause a multi hit. Can you be
> sure of a software caused multi hit? Would be a good idea if you can I
> think. It may be a good idea to avoid recursion as well, just in case.

Yeah, you are right. Parity errors can also cause a multi-hit. I will limit
SLB saving to a single level of recursion.

Thanks for your review.

-Mahesh.

> 
> Thanks,
> Nick
>
Aneesh Kumar K.V Aug. 14, 2018, 12:47 p.m. | #8
On 08/14/2018 04:27 PM, Mahesh Jagannath Salgaonkar wrote:
> On 08/13/2018 07:57 PM, Nicholas Piggin wrote:
>> On Mon, 13 Aug 2018 09:47:04 +0530
>> Mahesh Jagannath Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:
>>
>>> On 08/11/2018 10:03 AM, Nicholas Piggin wrote:
>>>> On Tue, 07 Aug 2018 19:47:39 +0530
>>>> Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:
>>>>    
>>>>> From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>>>>>
>>>>> If we get a machine check exception due to SLB errors then dump the
>>>>> current SLB contents which will be very much helpful in debugging the
>>>>> root cause of SLB errors. Introduce an exclusive buffer per cpu to hold
>>>>> faulty SLB entries. In real mode mce handler saves the old SLB contents
>>>>> into this buffer accessible through paca and print it out later in virtual
>>>>> mode.
>>>>>
>>>>> With this patch the console will log SLB contents like below on SLB MCE
>>>>> errors:
>>>>>
>>>>> [  507.297236] SLB contents of cpu 0x1
>>>>> [  507.297237] Last SLB entry inserted at slot 16
>>>>> [  507.297238] 00 c000000008000000 400ea1b217000500
>>>>> [  507.297239]   1T  ESID=   c00000  VSID=      ea1b217 LLP:100
>>>>> [  507.297240] 01 d000000008000000 400d43642f000510
>>>>> [  507.297242]   1T  ESID=   d00000  VSID=      d43642f LLP:110
>>>>> [  507.297243] 11 f000000008000000 400a86c85f000500
>>>>> [  507.297244]   1T  ESID=   f00000  VSID=      a86c85f LLP:100
>>>>> [  507.297245] 12 00007f0008000000 4008119624000d90
>>>>> [  507.297246]   1T  ESID=       7f  VSID=      8119624 LLP:110
>>>>> [  507.297247] 13 0000000018000000 00092885f5150d90
>>>>> [  507.297247]  256M ESID=        1  VSID=   92885f5150 LLP:110
>>>>> [  507.297248] 14 0000010008000000 4009e7cb50000d90
>>>>> [  507.297249]   1T  ESID=        1  VSID=      9e7cb50 LLP:110
>>>>> [  507.297250] 15 d000000008000000 400d43642f000510
>>>>> [  507.297251]   1T  ESID=   d00000  VSID=      d43642f LLP:110
>>>>> [  507.297252] 16 d000000008000000 400d43642f000510
>>>>> [  507.297253]   1T  ESID=   d00000  VSID=      d43642f LLP:110
>>>>> [  507.297253] ----------------------------------
>>>>> [  507.297254] SLB cache ptr value = 3
>>>>> [  507.297254] Valid SLB cache entries:
>>>>> [  507.297255] 00 EA[0-35]=    7f000
>>>>> [  507.297256] 01 EA[0-35]=        1
>>>>> [  507.297257] 02 EA[0-35]=     1000
>>>>> [  507.297257] Rest of SLB cache entries:
>>>>> [  507.297258] 03 EA[0-35]=    7f000
>>>>> [  507.297258] 04 EA[0-35]=        1
>>>>> [  507.297259] 05 EA[0-35]=     1000
>>>>> [  507.297260] 06 EA[0-35]=       12
>>>>> [  507.297260] 07 EA[0-35]=    7f000
>>>>>
>>>>> Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>>>>> Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
>>>>> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>>>>> ---
>>>>>
>>>>> Changes in V7:
>>>>> - Print slb cache ptr value and slb cache data
>>>>> ---
>>>>>   arch/powerpc/include/asm/book3s/64/mmu-hash.h |    7 ++
>>>>>   arch/powerpc/include/asm/paca.h               |    4 +
>>>>>   arch/powerpc/mm/slb.c                         |   73 +++++++++++++++++++++++++
>>>>>   arch/powerpc/platforms/pseries/ras.c          |   10 +++
>>>>>   arch/powerpc/platforms/pseries/setup.c        |   10 +++
>>>>>   5 files changed, 103 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>>>>> index cc00a7088cf3..5a3fe282076d 100644
>>>>> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>>>>> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>>>>> @@ -485,9 +485,16 @@ static inline void hpte_init_pseries(void) { }
>>>>>   
>>>>>   extern void hpte_init_native(void);
>>>>>   
>>>>> +struct slb_entry {
>>>>> +	u64	esid;
>>>>> +	u64	vsid;
>>>>> +};
>>>>> +
>>>>>   extern void slb_initialize(void);
>>>>>   extern void slb_flush_and_rebolt(void);
>>>>>   extern void slb_flush_and_rebolt_realmode(void);
>>>>> +extern void slb_save_contents(struct slb_entry *slb_ptr);
>>>>> +extern void slb_dump_contents(struct slb_entry *slb_ptr);
>>>>>   
>>>>>   extern void slb_vmalloc_update(void);
>>>>>   extern void slb_set_size(u16 size);
>>>>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>>>>> index 7f22929ce915..233d25ff6f64 100644
>>>>> --- a/arch/powerpc/include/asm/paca.h
>>>>> +++ b/arch/powerpc/include/asm/paca.h
>>>>> @@ -254,6 +254,10 @@ struct paca_struct {
>>>>>   #endif
>>>>>   #ifdef CONFIG_PPC_PSERIES
>>>>>   	u8 *mce_data_buf;		/* buffer to hold per cpu rtas errlog */
>>>>> +
>>>>> +	/* Capture SLB related old contents in MCE handler. */
>>>>> +	struct slb_entry *mce_faulty_slbs;
>>>>> +	u16 slb_save_cache_ptr;
>>>>>   #endif /* CONFIG_PPC_PSERIES */
>>>>>   } ____cacheline_aligned;
>>>>>   
>>>>> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
>>>>> index e89f675f1b5e..16a53689ffd4 100644
>>>>> --- a/arch/powerpc/mm/slb.c
>>>>> +++ b/arch/powerpc/mm/slb.c
>>>>> @@ -151,6 +151,79 @@ void slb_flush_and_rebolt_realmode(void)
>>>>>   	get_paca()->slb_cache_ptr = 0;
>>>>>   }
>>>>>   
>>>>> +void slb_save_contents(struct slb_entry *slb_ptr)
>>>>> +{
>>>>> +	int i;
>>>>> +	unsigned long e, v;
>>>>> +
>>>>> +	/* Save slb_cache_ptr value. */
>>>>> +	get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
>>>>
>>>> What's the point of saving this?
>>>
>>> This is to know how many valid cache entries were present at the time of
>>> the SLB multi-hit. We use this index value while dumping the SLB cache entries.
>>
>> Oh I see you're dumping that thing as well. I don't know if that's
>> worth doing, it just gives you the first 8 SLB entries installed but
>> you already have those (or they're overwritten and irrelevant).
> 
> Aneesh, can you comment on this?
> 
>

We never clear slb_cache entries; we just update slb_cache_ptr. When
debugging, we would like to find which slb_cache entries are valid for
this run, and slb_cache_ptr gives us that detail. One of the ways we
could end up with an SLB multi-hit is slb_cache_ptr corruption: instead
of doing a flush_and_rebolt, we would have invalidated only a subset of
the valid SLB entries. I understand that in that specific case we
context-switched out with the corrupted value, so the value we are
dumping above won't really help in isolating the problem. But if we are
corrupting the paca, we might continue to overwrite it, and we can then
compare the SLB contents against the slb_cache contents to see if there
is any corruption.

-aneesh

Patch

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index cc00a7088cf3..5a3fe282076d 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -485,9 +485,16 @@  static inline void hpte_init_pseries(void) { }
 
 extern void hpte_init_native(void);
 
+struct slb_entry {
+	u64	esid;
+	u64	vsid;
+};
+
 extern void slb_initialize(void);
 extern void slb_flush_and_rebolt(void);
 extern void slb_flush_and_rebolt_realmode(void);
+extern void slb_save_contents(struct slb_entry *slb_ptr);
+extern void slb_dump_contents(struct slb_entry *slb_ptr);
 
 extern void slb_vmalloc_update(void);
 extern void slb_set_size(u16 size);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 7f22929ce915..233d25ff6f64 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -254,6 +254,10 @@  struct paca_struct {
 #endif
 #ifdef CONFIG_PPC_PSERIES
 	u8 *mce_data_buf;		/* buffer to hold per cpu rtas errlog */
+
+	/* Capture SLB related old contents in MCE handler. */
+	struct slb_entry *mce_faulty_slbs;
+	u16 slb_save_cache_ptr;
 #endif /* CONFIG_PPC_PSERIES */
 } ____cacheline_aligned;
 
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index e89f675f1b5e..16a53689ffd4 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -151,6 +151,79 @@  void slb_flush_and_rebolt_realmode(void)
 	get_paca()->slb_cache_ptr = 0;
 }
 
+void slb_save_contents(struct slb_entry *slb_ptr)
+{
+	int i;
+	unsigned long e, v;
+
+	/* Save slb_cache_ptr value. */
+	get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
+
+	if (!slb_ptr)
+		return;
+
+	for (i = 0; i < mmu_slb_size; i++) {
+		asm volatile("slbmfee  %0,%1" : "=r" (e) : "r" (i));
+		asm volatile("slbmfev  %0,%1" : "=r" (v) : "r" (i));
+		slb_ptr->esid = e;
+		slb_ptr->vsid = v;
+		slb_ptr++;
+	}
+}
+
+void slb_dump_contents(struct slb_entry *slb_ptr)
+{
+	int i, n;
+	unsigned long e, v;
+	unsigned long llp;
+
+	if (!slb_ptr)
+		return;
+
+	pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
+	pr_err("Last SLB entry inserted at slot %lld\n", get_paca()->stab_rr);
+
+	for (i = 0; i < mmu_slb_size; i++) {
+		e = slb_ptr->esid;
+		v = slb_ptr->vsid;
+		slb_ptr++;
+
+		if (!e && !v)
+			continue;
+
+		pr_err("%02d %016lx %016lx\n", i, e, v);
+
+		if (!(e & SLB_ESID_V)) {
+			pr_err("\n");
+			continue;
+		}
+		llp = v & SLB_VSID_LLP;
+		if (v & SLB_VSID_B_1T) {
+			pr_err("  1T  ESID=%9lx  VSID=%13lx LLP:%3lx\n",
+				GET_ESID_1T(e),
+				(v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T,
+				llp);
+		} else {
+			pr_err(" 256M ESID=%9lx  VSID=%13lx LLP:%3lx\n",
+				GET_ESID(e),
+				(v & ~SLB_VSID_B) >> SLB_VSID_SHIFT,
+				llp);
+		}
+	}
+	pr_err("----------------------------------\n");
+
+	/* Dump slb cache entries as well. */
+	pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
+	pr_err("Valid SLB cache entries:\n");
+	n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
+	for (i = 0; i < n; i++)
+		pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+	pr_err("Rest of SLB cache entries:\n");
+	for (i = n; i < SLB_CACHE_ENTRIES; i++)
+		pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+
+}
+
 void slb_vmalloc_update(void)
 {
 	unsigned long vflags;
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 656b35a42d93..117ca2ff5456 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -515,6 +515,10 @@  static void pseries_print_mce_info(struct pt_regs *regs,
 		break;
 	}
 
+	/* Display faulty slb contents for SLB errors. */
+	if (error_type == PSERIES_MC_ERROR_TYPE_SLB)
+		slb_dump_contents(local_paca->mce_faulty_slbs);
+
 	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
 		disposition == RTAS_DISP_FULLY_RECOVERED ?
 		"Recovered" : "Not recovered");
@@ -575,7 +579,11 @@  static int mce_handle_error(struct rtas_error_log *errp)
 
 	if ((disposition == RTAS_DISP_NOT_RECOVERED) &&
 			(error_type == PSERIES_MC_ERROR_TYPE_SLB)) {
-		/* Store the old slb content someplace. */
+		/*
+		 * Store the old slb content in paca before flushing. Print
+		 * this when we go to virtual mode.
+		 */
+		slb_save_contents(local_paca->mce_faulty_slbs);
 		slb_flush_and_rebolt_realmode();
 		disposition = RTAS_DISP_FULLY_RECOVERED;
 		rtas_set_disposition_recovered(errp);
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 7a9421d089d8..53aee58a928b 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -105,6 +105,9 @@  static void __init fwnmi_init(void)
 	u8 *mce_data_buf;
 	unsigned int i;
 	int nr_cpus = num_possible_cpus();
+	struct slb_entry *slb_ptr;
+	size_t size;
+
 
 	int ibm_nmi_register = rtas_token("ibm,nmi-register");
 	if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
@@ -130,6 +133,13 @@  static void __init fwnmi_init(void)
 		paca_ptrs[i]->mce_data_buf = mce_data_buf +
 						(RTAS_ERROR_LOG_MAX * i);
 	}
+
+	/* Allocate per cpu slb area to save old slb contents during MCE */
+	size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
+	slb_ptr = __va(memblock_alloc_base(size, sizeof(struct slb_entry),
+							ppc64_rma_size));
+	for_each_possible_cpu(i)
+		paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i);
 }
 
 static void pseries_8259_cascade(struct irq_desc *desc)