[v7,5/9] powerpc/pseries: flush SLB contents on SLB MCE errors.

Message ID 153365142349.14256.9954484737438718329.stgit@jupiter.in.ibm.com
State Changes Requested
Headers show
Series
  • powerpc/pseries: Machine check handler improvements.
Related show

Checks

Context Check Description
snowpatch_ozlabs/apply_patch fail Failed to apply to any branch

Commit Message

Mahesh J Salgaonkar Aug. 7, 2018, 2:17 p.m.
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

On pseries, as of today system crashes if we get a machine check
exceptions due to SLB errors. These are soft errors and can be fixed by
flushing the SLBs so the kernel can continue to function instead of
system crash. We do this in real mode before turning on MMU. Otherwise
we would run into nested machine checks. This patch now fetches the
rtas error log in real mode and flushes the SLBs on SLB errors.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.com>
---

Changes in V7:
- Fold Michal's patch into this patch.
- Handle MSR_RI=0 and evil context case in MC handler.
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |    1 
 arch/powerpc/include/asm/machdep.h            |    1 
 arch/powerpc/kernel/exceptions-64s.S          |  112 +++++++++++++++++++++++++
 arch/powerpc/kernel/mce.c                     |   15 +++
 arch/powerpc/mm/slb.c                         |    6 +
 arch/powerpc/platforms/powernv/setup.c        |   11 ++
 arch/powerpc/platforms/pseries/pseries.h      |    1 
 arch/powerpc/platforms/pseries/ras.c          |   51 +++++++++++
 arch/powerpc/platforms/pseries/setup.c        |    1 
 9 files changed, 195 insertions(+), 4 deletions(-)

Comments

Michal Suchánek Aug. 7, 2018, 4:54 p.m. | #1
Hello,


On Tue, 07 Aug 2018 19:47:14 +0530
"Mahesh J Salgaonkar" <mahesh@linux.vnet.ibm.com> wrote:

> From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> 
> On pseries, as of today system crashes if we get a machine check
> exceptions due to SLB errors. These are soft errors and can be fixed
> by flushing the SLBs so the kernel can continue to function instead of
> system crash. We do this in real mode before turning on MMU. Otherwise
> we would run into nested machine checks. This patch now fetches the
> rtas error log in real mode and flushes the SLBs on SLB errors.
> 
> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> Signed-off-by: Michal Suchanek <msuchanek@suse.com>
> ---
> 
> Changes in V7:
> - Fold Michal's patch into this patch.
> - Handle MSR_RI=0 and evil context case in MC handler.
> ---
>  arch/powerpc/include/asm/book3s/64/mmu-hash.h |    1 
>  arch/powerpc/include/asm/machdep.h            |    1 
>  arch/powerpc/kernel/exceptions-64s.S          |  112
> +++++++++++++++++++++++++
> arch/powerpc/kernel/mce.c                     |   15 +++
> arch/powerpc/mm/slb.c                         |    6 +
> arch/powerpc/platforms/powernv/setup.c        |   11 ++
> arch/powerpc/platforms/pseries/pseries.h      |    1
> arch/powerpc/platforms/pseries/ras.c          |   51 +++++++++++
> arch/powerpc/platforms/pseries/setup.c        |    1 9 files changed,
> 195 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index
> 50ed64fba4ae..cc00a7088cf3 100644 ---
> a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++
> b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -487,6 +487,7 @@
> extern void hpte_init_native(void); 
>  extern void slb_initialize(void);
>  extern void slb_flush_and_rebolt(void);
> +extern void slb_flush_and_rebolt_realmode(void);
>  
>  extern void slb_vmalloc_update(void);
>  extern void slb_set_size(u16 size);
> diff --git a/arch/powerpc/include/asm/machdep.h
> b/arch/powerpc/include/asm/machdep.h index a47de82fb8e2..b4831f1338db
> 100644 --- a/arch/powerpc/include/asm/machdep.h
> +++ b/arch/powerpc/include/asm/machdep.h
> @@ -108,6 +108,7 @@ struct machdep_calls {
>  
>  	/* Early exception handlers called in realmode */
>  	int		(*hmi_exception_early)(struct pt_regs
> *regs);
> +	long		(*machine_check_early)(struct pt_regs
> *regs); 
>  	/* Called during machine check exception to retrive fixup
> address. */ bool		(*mce_check_early_recovery)(struct
> pt_regs *regs); diff --git a/arch/powerpc/kernel/exceptions-64s.S
> b/arch/powerpc/kernel/exceptions-64s.S index
> 285c6465324a..cb06f219570a 100644 ---
> a/arch/powerpc/kernel/exceptions-64s.S +++
> b/arch/powerpc/kernel/exceptions-64s.S @@ -332,6 +332,9 @@
> TRAMP_REAL_BEGIN(machine_check_pSeries) machine_check_fwnmi:
>  	SET_SCRATCH0(r13)		/* save r13 */
>  	EXCEPTION_PROLOG_0(PACA_EXMC)
> +BEGIN_FTR_SECTION
> +	b	machine_check_pSeries_early
> +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
>  machine_check_pSeries_0:
>  	EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
>  	/*
> @@ -343,6 +346,90 @@ machine_check_pSeries_0:
>  
>  TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
>  
> +TRAMP_REAL_BEGIN(machine_check_pSeries_early)
> +BEGIN_FTR_SECTION
> +	EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
> +	mr	r10,r1			/* Save r1 */
> +	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency
> stack */
> +	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack
> frame		*/
> +	mfspr	r11,SPRN_SRR0		/* Save SRR0 */
> +	mfspr	r12,SPRN_SRR1		/* Save SRR1 */
> +	EXCEPTION_PROLOG_COMMON_1()
> +	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
> +	EXCEPTION_PROLOG_COMMON_3(0x200)
> +	addi	r3,r1,STACK_FRAME_OVERHEAD
> +	BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI
> */
> +	ld	r12,_MSR(r1)
> +	andi.	r11,r12,MSR_PR		/* See if coming
> from user. */
> +	bne	2f			/* continue in V mode
> if we are. */ +
> +	/*
> +	 * At this point we are not sure about what context we come
> from.
> +	 * We may be in the middle of swithing stack. r1 may not be
> valid.
> +	 * Hence stay on emergency stack, call
> machine_check_exception and
> +	 * return from the interrupt.
> +	 * But before that, check if this is an un-recoverable
> exception.
> +	 * If yes, then stay on emergency stack and panic.
> +	 */
> +	andi.	r11,r12,MSR_RI
> +	bne	1f
> +
> +	/*
> +	 * Check if we have successfully handled/recovered from
> error, if not
> +	 * then stay on emergency stack and panic.
> +	 */
> +	cmpdi	r3,0		/* see if we handled MCE
> successfully */
> +	bne	1f		/* if handled then return from
> interrupt */ +
> +	LOAD_HANDLER(r10,unrecover_mce)
> +	mtspr	SPRN_SRR0,r10
> +	ld	r10,PACAKMSR(r13)
> +	/*
> +	 * We are going down. But there are chances that we might
> get hit by
> +	 * another MCE during panic path and we may run into
> unstable state
> +	 * with no way out. Hence, turn ME bit off while going down,
> so that
> +	 * when another MCE is hit during panic path, hypervisor will
> +	 * power cycle the lpar, instead of getting into MCE loop.
> +	 */
> +	li	r3,MSR_ME
> +	andc	r10,r10,r3		/* Turn off MSR_ME */
> +	mtspr	SPRN_SRR1,r10
> +	RFI_TO_KERNEL
> +	b	.
> +
> +	/* Stay on emergency stack and return from interrupt. */
> +1:	LOAD_HANDLER(r10,mce_return)
> +	mtspr	SPRN_SRR0,r10
> +	ld	r10,PACAKMSR(r13)
> +	mtspr	SPRN_SRR1,r10
> +	RFI_TO_KERNEL
> +	b	.

I think that the logic should be inverted here. That is we should check
for unrecoverable and unhandled exceptions and jump to unrecov_mce if
found, fallthrough to mce_return otherwise.

Thanks

Michal


> +
> +	/* Move original SRR0 and SRR1 into the respective regs */
> +2:	ld	r9,_MSR(r1)
> +	mtspr	SPRN_SRR1,r9
> +	ld	r3,_NIP(r1)
> +	mtspr	SPRN_SRR0,r3
> +	ld	r9,_CTR(r1)
> +	mtctr	r9
> +	ld	r9,_XER(r1)
> +	mtxer	r9
> +	ld	r9,_LINK(r1)
> +	mtlr	r9
> +	REST_GPR(0, r1)
> +	REST_8GPRS(2, r1)
> +	REST_GPR(10, r1)
> +	ld	r11,_CCR(r1)
> +	mtcr	r11
> +	REST_GPR(11, r1)
> +	REST_2GPRS(12, r1)
> +	/* restore original r1. */
> +	ld	r1,GPR1(r1)
> +	SET_SCRATCH0(r13)		/* save r13 */
> +	EXCEPTION_PROLOG_0(PACA_EXMC)
> +	b	machine_check_pSeries_0
> +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
> +
>  EXC_COMMON_BEGIN(machine_check_common)
>  	/*
>  	 * Machine check is different because we use a different
> @@ -536,6 +623,31 @@ EXC_COMMON_BEGIN(unrecover_mce)
>  	bl	unrecoverable_exception
>  	b	1b
>  
> +EXC_COMMON_BEGIN(mce_return)
> +	/* Invoke machine_check_exception to print MCE event and
> return. */
> +	addi	r3,r1,STACK_FRAME_OVERHEAD
> +	bl	machine_check_exception
> +	ld	r9,_MSR(r1)
> +	mtspr	SPRN_SRR1,r9
> +	ld	r3,_NIP(r1)
> +	mtspr	SPRN_SRR0,r3
> +	ld	r9,_CTR(r1)
> +	mtctr	r9
> +	ld	r9,_XER(r1)
> +	mtxer	r9
> +	ld	r9,_LINK(r1)
> +	mtlr	r9
> +	REST_GPR(0, r1)
> +	REST_8GPRS(2, r1)
> +	REST_GPR(10, r1)
> +	ld	r11,_CCR(r1)
> +	mtcr	r11
> +	REST_GPR(11, r1)
> +	REST_2GPRS(12, r1)
> +	/* restore original r1. */
> +	ld	r1,GPR1(r1)
> +	RFI_TO_KERNEL
> +	b	.
>  
>  EXC_REAL(data_access, 0x300, 0x80)
>  EXC_VIRT(data_access, 0x4300, 0x80, 0x300)
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index efdd16a79075..ae17d8aa60c4 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -488,10 +488,19 @@ long machine_check_early(struct pt_regs *regs)
>  {
>  	long handled = 0;
>  
> -	__this_cpu_inc(irq_stat.mce_exceptions);
> +	/*
> +	 * For pSeries we count mce when we go into virtual mode
> machine
> +	 * check handler. Hence skip it. Also, We can't access per
> cpu
> +	 * variables in real mode for LPAR.
> +	 */
> +	if (early_cpu_has_feature(CPU_FTR_HVMODE))
> +		__this_cpu_inc(irq_stat.mce_exceptions);
>  
> -	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
> -		handled = cur_cpu_spec->machine_check_early(regs);
> +	/*
> +	 * See if platform is capable of handling machine check.
> +	 */
> +	if (ppc_md.machine_check_early)
> +		handled = ppc_md.machine_check_early(regs);
>  	return handled;
>  }
>  
> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
> index cb796724a6fc..e89f675f1b5e 100644
> --- a/arch/powerpc/mm/slb.c
> +++ b/arch/powerpc/mm/slb.c
> @@ -145,6 +145,12 @@ void slb_flush_and_rebolt(void)
>  	get_paca()->slb_cache_ptr = 0;
>  }
>  
> +void slb_flush_and_rebolt_realmode(void)
> +{
> +	__slb_flush_and_rebolt();
> +	get_paca()->slb_cache_ptr = 0;
> +}
> +
>  void slb_vmalloc_update(void)
>  {
>  	unsigned long vflags;
> diff --git a/arch/powerpc/platforms/powernv/setup.c
> b/arch/powerpc/platforms/powernv/setup.c index
> f96df0a25d05..b74c93bc2e55 100644 ---
> a/arch/powerpc/platforms/powernv/setup.c +++
> b/arch/powerpc/platforms/powernv/setup.c @@ -431,6 +431,16 @@ static
> unsigned long pnv_get_proc_freq(unsigned int cpu) return ret_freq;
>  }
>  
> +static long pnv_machine_check_early(struct pt_regs *regs)
> +{
> +	long handled = 0;
> +
> +	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
> +		handled = cur_cpu_spec->machine_check_early(regs);
> +
> +	return handled;
> +}
> +
>  define_machine(powernv) {
>  	.name			= "PowerNV",
>  	.probe			= pnv_probe,
> @@ -442,6 +452,7 @@ define_machine(powernv) {
>  	.machine_shutdown	= pnv_shutdown,
>  	.power_save             = NULL,
>  	.calibrate_decr		= generic_calibrate_decr,
> +	.machine_check_early	= pnv_machine_check_early,
>  #ifdef CONFIG_KEXEC_CORE
>  	.kexec_cpu_down		= pnv_kexec_cpu_down,
>  #endif
> diff --git a/arch/powerpc/platforms/pseries/pseries.h
> b/arch/powerpc/platforms/pseries/pseries.h index
> 60db2ee511fb..ec2a5f61d4a4 100644 ---
> a/arch/powerpc/platforms/pseries/pseries.h +++
> b/arch/powerpc/platforms/pseries/pseries.h @@ -24,6 +24,7 @@ struct
> pt_regs; 
>  extern int pSeries_system_reset_exception(struct pt_regs *regs);
>  extern int pSeries_machine_check_exception(struct pt_regs *regs);
> +extern long pSeries_machine_check_realmode(struct pt_regs *regs);
>  
>  #ifdef CONFIG_SMP
>  extern void smp_init_pseries(void);
> diff --git a/arch/powerpc/platforms/pseries/ras.c
> b/arch/powerpc/platforms/pseries/ras.c index
> 851ce326874a..e4420f7c8fda 100644 ---
> a/arch/powerpc/platforms/pseries/ras.c +++
> b/arch/powerpc/platforms/pseries/ras.c @@ -427,6 +427,35 @@ int
> pSeries_system_reset_exception(struct pt_regs *regs) return 0; /*
> need to perform reset */ }
>  
> +static int mce_handle_error(struct rtas_error_log *errp)
> +{
> +	struct pseries_errorlog *pseries_log;
> +	struct pseries_mc_errorlog *mce_log;
> +	int disposition = rtas_error_disposition(errp);
> +	uint8_t error_type;
> +
> +	if (!rtas_error_extended(errp))
> +		goto out;
> +
> +	pseries_log = get_pseries_errorlog(errp,
> PSERIES_ELOG_SECT_ID_MCE);
> +	if (pseries_log == NULL)
> +		goto out;
> +
> +	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
> +	error_type = rtas_mc_error_type(mce_log);
> +
> +	if ((disposition == RTAS_DISP_NOT_RECOVERED) &&
> +			(error_type == PSERIES_MC_ERROR_TYPE_SLB)) {
> +		/* Store the old slb content someplace. */
> +		slb_flush_and_rebolt_realmode();
> +		disposition = RTAS_DISP_FULLY_RECOVERED;
> +		rtas_set_disposition_recovered(errp);
> +	}
> +
> +out:
> +	return disposition;
> +}
> +
>  /*
>   * Process MCE rtas errlog event.
>   */
> @@ -503,11 +532,31 @@ int pSeries_machine_check_exception(struct
> pt_regs *regs) struct rtas_error_log *errp;
>  
>  	if (fwnmi_active) {
> -		errp = fwnmi_get_errinfo(regs);
>  		fwnmi_release_errinfo();
> +		errp = fwnmi_get_errlog();
>  		if (errp && recover_mce(regs, errp))
>  			return 1;
>  	}
>  
>  	return 0;
>  }
> +
> +long pSeries_machine_check_realmode(struct pt_regs *regs)
> +{
> +	struct rtas_error_log *errp;
> +	int disposition;
> +
> +	if (fwnmi_active) {
> +		errp = fwnmi_get_errinfo(regs);
> +		/*
> +		 * Call to fwnmi_release_errinfo() in real mode
> causes kernel
> +		 * to panic. Hence we will call it as soon as we go
> into
> +		 * virtual mode.
> +		 */
> +		disposition = mce_handle_error(errp);
> +		if (disposition == RTAS_DISP_FULLY_RECOVERED)
> +			return 1;
> +	}
> +
> +	return 0;
> +}
> diff --git a/arch/powerpc/platforms/pseries/setup.c
> b/arch/powerpc/platforms/pseries/setup.c index
> b42087cd8c6b..7a9421d089d8 100644 ---
> a/arch/powerpc/platforms/pseries/setup.c +++
> b/arch/powerpc/platforms/pseries/setup.c @@ -1000,6 +1000,7 @@
> define_machine(pseries) { .calibrate_decr		=
> generic_calibrate_decr, .progress		= rtas_progress,
>  	.system_reset_exception = pSeries_system_reset_exception,
> +	.machine_check_early	= pSeries_machine_check_realmode,
>  	.machine_check_exception = pSeries_machine_check_exception,
>  #ifdef CONFIG_KEXEC_CORE
>  	.machine_kexec          = pSeries_machine_kexec,
> 
>
Nicholas Piggin Aug. 8, 2018, 9:04 a.m. | #2
On Tue, 07 Aug 2018 19:47:14 +0530
Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:

> From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> 
> On pseries, as of today system crashes if we get a machine check
> exceptions due to SLB errors. These are soft errors and can be fixed by
> flushing the SLBs so the kernel can continue to function instead of
> system crash. We do this in real mode before turning on MMU. Otherwise
> we would run into nested machine checks. This patch now fetches the
> rtas error log in real mode and flushes the SLBs on SLB errors.
> 
> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> Signed-off-by: Michal Suchanek <msuchanek@suse.com>
> ---
> 
> Changes in V7:
> - Fold Michal's patch into this patch.
> - Handle MSR_RI=0 and evil context case in MC handler.
> ---


> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
> index cb796724a6fc..e89f675f1b5e 100644
> --- a/arch/powerpc/mm/slb.c
> +++ b/arch/powerpc/mm/slb.c
> @@ -145,6 +145,12 @@ void slb_flush_and_rebolt(void)
>  	get_paca()->slb_cache_ptr = 0;
>  }
>  
> +void slb_flush_and_rebolt_realmode(void)
> +{
> +	__slb_flush_and_rebolt();
> +	get_paca()->slb_cache_ptr = 0;
> +}
> +
>  void slb_vmalloc_update(void)
>  {
>  	unsigned long vflags;

Can you use this patch for the SLB flush?

https://patchwork.ozlabs.org/patch/953034/

Thanks,
Nick
Mahesh J Salgaonkar Aug. 10, 2018, 10:30 a.m. | #3
On 08/07/2018 10:24 PM, Michal Suchánek wrote:
> Hello,
> 
> 
> On Tue, 07 Aug 2018 19:47:14 +0530
> "Mahesh J Salgaonkar" <mahesh@linux.vnet.ibm.com> wrote:
> 
>> From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>>
>> On pseries, as of today system crashes if we get a machine check
>> exceptions due to SLB errors. These are soft errors and can be fixed
>> by flushing the SLBs so the kernel can continue to function instead of
>> system crash. We do this in real mode before turning on MMU. Otherwise
>> we would run into nested machine checks. This patch now fetches the
>> rtas error log in real mode and flushes the SLBs on SLB errors.
>>
>> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>> Signed-off-by: Michal Suchanek <msuchanek@suse.com>
>> ---
>>
>> Changes in V7:
>> - Fold Michal's patch into this patch.
>> - Handle MSR_RI=0 and evil context case in MC handler.
>> ---
>>  arch/powerpc/include/asm/book3s/64/mmu-hash.h |    1 
>>  arch/powerpc/include/asm/machdep.h            |    1 
>>  arch/powerpc/kernel/exceptions-64s.S          |  112
>> +++++++++++++++++++++++++
>> arch/powerpc/kernel/mce.c                     |   15 +++
>> arch/powerpc/mm/slb.c                         |    6 +
>> arch/powerpc/platforms/powernv/setup.c        |   11 ++
>> arch/powerpc/platforms/pseries/pseries.h      |    1
>> arch/powerpc/platforms/pseries/ras.c          |   51 +++++++++++
>> arch/powerpc/platforms/pseries/setup.c        |    1 9 files changed,
>> 195 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>> b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index
>> 50ed64fba4ae..cc00a7088cf3 100644 ---
>> a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++
>> b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -487,6 +487,7 @@
>> extern void hpte_init_native(void); 
>>  extern void slb_initialize(void);
>>  extern void slb_flush_and_rebolt(void);
>> +extern void slb_flush_and_rebolt_realmode(void);
>>  
>>  extern void slb_vmalloc_update(void);
>>  extern void slb_set_size(u16 size);
>> diff --git a/arch/powerpc/include/asm/machdep.h
>> b/arch/powerpc/include/asm/machdep.h index a47de82fb8e2..b4831f1338db
>> 100644 --- a/arch/powerpc/include/asm/machdep.h
>> +++ b/arch/powerpc/include/asm/machdep.h
>> @@ -108,6 +108,7 @@ struct machdep_calls {
>>  
>>  	/* Early exception handlers called in realmode */
>>  	int		(*hmi_exception_early)(struct pt_regs
>> *regs);
>> +	long		(*machine_check_early)(struct pt_regs
>> *regs); 
>>  	/* Called during machine check exception to retrive fixup
>> address. */ bool		(*mce_check_early_recovery)(struct
>> pt_regs *regs); diff --git a/arch/powerpc/kernel/exceptions-64s.S
>> b/arch/powerpc/kernel/exceptions-64s.S index
>> 285c6465324a..cb06f219570a 100644 ---
>> a/arch/powerpc/kernel/exceptions-64s.S +++
>> b/arch/powerpc/kernel/exceptions-64s.S @@ -332,6 +332,9 @@
>> TRAMP_REAL_BEGIN(machine_check_pSeries) machine_check_fwnmi:
>>  	SET_SCRATCH0(r13)		/* save r13 */
>>  	EXCEPTION_PROLOG_0(PACA_EXMC)
>> +BEGIN_FTR_SECTION
>> +	b	machine_check_pSeries_early
>> +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
>>  machine_check_pSeries_0:
>>  	EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
>>  	/*
>> @@ -343,6 +346,90 @@ machine_check_pSeries_0:
>>  
>>  TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
>>  
>> +TRAMP_REAL_BEGIN(machine_check_pSeries_early)
>> +BEGIN_FTR_SECTION
>> +	EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
>> +	mr	r10,r1			/* Save r1 */
>> +	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency
>> stack */
>> +	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack
>> frame		*/
>> +	mfspr	r11,SPRN_SRR0		/* Save SRR0 */
>> +	mfspr	r12,SPRN_SRR1		/* Save SRR1 */
>> +	EXCEPTION_PROLOG_COMMON_1()
>> +	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
>> +	EXCEPTION_PROLOG_COMMON_3(0x200)
>> +	addi	r3,r1,STACK_FRAME_OVERHEAD
>> +	BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI
>> */
>> +	ld	r12,_MSR(r1)
>> +	andi.	r11,r12,MSR_PR		/* See if coming
>> from user. */
>> +	bne	2f			/* continue in V mode
>> if we are. */ +
>> +	/*
>> +	 * At this point we are not sure about what context we come
>> from.
>> +	 * We may be in the middle of swithing stack. r1 may not be
>> valid.
>> +	 * Hence stay on emergency stack, call
>> machine_check_exception and
>> +	 * return from the interrupt.
>> +	 * But before that, check if this is an un-recoverable
>> exception.
>> +	 * If yes, then stay on emergency stack and panic.
>> +	 */
>> +	andi.	r11,r12,MSR_RI
>> +	bne	1f
>> +
>> +	/*
>> +	 * Check if we have successfully handled/recovered from
>> error, if not
>> +	 * then stay on emergency stack and panic.
>> +	 */
>> +	cmpdi	r3,0		/* see if we handled MCE
>> successfully */
>> +	bne	1f		/* if handled then return from
>> interrupt */ +
>> +	LOAD_HANDLER(r10,unrecover_mce)
>> +	mtspr	SPRN_SRR0,r10
>> +	ld	r10,PACAKMSR(r13)
>> +	/*
>> +	 * We are going down. But there are chances that we might
>> get hit by
>> +	 * another MCE during panic path and we may run into
>> unstable state
>> +	 * with no way out. Hence, turn ME bit off while going down,
>> so that
>> +	 * when another MCE is hit during panic path, hypervisor will
>> +	 * power cycle the lpar, instead of getting into MCE loop.
>> +	 */
>> +	li	r3,MSR_ME
>> +	andc	r10,r10,r3		/* Turn off MSR_ME */
>> +	mtspr	SPRN_SRR1,r10
>> +	RFI_TO_KERNEL
>> +	b	.
>> +
>> +	/* Stay on emergency stack and return from interrupt. */
>> +1:	LOAD_HANDLER(r10,mce_return)
>> +	mtspr	SPRN_SRR0,r10
>> +	ld	r10,PACAKMSR(r13)
>> +	mtspr	SPRN_SRR1,r10
>> +	RFI_TO_KERNEL
>> +	b	.
> 
> I think that the logic should be inverted here. That is we should check
> for unrecoverable and unhandled exceptions and jump to unrecov_mce if
> found, fallthrough to mce_return otherwise.

sure. will make that change in next revision.

Thanks,
-Mahesh.

> 
> Thanks
> 
> Michal
> 
> 
>> +
>> +	/* Move original SRR0 and SRR1 into the respective regs */
>> +2:	ld	r9,_MSR(r1)
>> +	mtspr	SPRN_SRR1,r9
>> +	ld	r3,_NIP(r1)
>> +	mtspr	SPRN_SRR0,r3
>> +	ld	r9,_CTR(r1)
>> +	mtctr	r9
>> +	ld	r9,_XER(r1)
>> +	mtxer	r9
>> +	ld	r9,_LINK(r1)
>> +	mtlr	r9
>> +	REST_GPR(0, r1)
>> +	REST_8GPRS(2, r1)
>> +	REST_GPR(10, r1)
>> +	ld	r11,_CCR(r1)
>> +	mtcr	r11
>> +	REST_GPR(11, r1)
>> +	REST_2GPRS(12, r1)
>> +	/* restore original r1. */
>> +	ld	r1,GPR1(r1)
>> +	SET_SCRATCH0(r13)		/* save r13 */
>> +	EXCEPTION_PROLOG_0(PACA_EXMC)
>> +	b	machine_check_pSeries_0
>> +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
>> +
>>  EXC_COMMON_BEGIN(machine_check_common)
>>  	/*
>>  	 * Machine check is different because we use a different
>> @@ -536,6 +623,31 @@ EXC_COMMON_BEGIN(unrecover_mce)
>>  	bl	unrecoverable_exception
>>  	b	1b
>>  
>> +EXC_COMMON_BEGIN(mce_return)
>> +	/* Invoke machine_check_exception to print MCE event and
>> return. */
>> +	addi	r3,r1,STACK_FRAME_OVERHEAD
>> +	bl	machine_check_exception
>> +	ld	r9,_MSR(r1)
>> +	mtspr	SPRN_SRR1,r9
>> +	ld	r3,_NIP(r1)
>> +	mtspr	SPRN_SRR0,r3
>> +	ld	r9,_CTR(r1)
>> +	mtctr	r9
>> +	ld	r9,_XER(r1)
>> +	mtxer	r9
>> +	ld	r9,_LINK(r1)
>> +	mtlr	r9
>> +	REST_GPR(0, r1)
>> +	REST_8GPRS(2, r1)
>> +	REST_GPR(10, r1)
>> +	ld	r11,_CCR(r1)
>> +	mtcr	r11
>> +	REST_GPR(11, r1)
>> +	REST_2GPRS(12, r1)
>> +	/* restore original r1. */
>> +	ld	r1,GPR1(r1)
>> +	RFI_TO_KERNEL
>> +	b	.
>>  
>>  EXC_REAL(data_access, 0x300, 0x80)
>>  EXC_VIRT(data_access, 0x4300, 0x80, 0x300)
>> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
>> index efdd16a79075..ae17d8aa60c4 100644
>> --- a/arch/powerpc/kernel/mce.c
>> +++ b/arch/powerpc/kernel/mce.c
>> @@ -488,10 +488,19 @@ long machine_check_early(struct pt_regs *regs)
>>  {
>>  	long handled = 0;
>>  
>> -	__this_cpu_inc(irq_stat.mce_exceptions);
>> +	/*
>> +	 * For pSeries we count mce when we go into virtual mode
>> machine
>> +	 * check handler. Hence skip it. Also, We can't access per
>> cpu
>> +	 * variables in real mode for LPAR.
>> +	 */
>> +	if (early_cpu_has_feature(CPU_FTR_HVMODE))
>> +		__this_cpu_inc(irq_stat.mce_exceptions);
>>  
>> -	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
>> -		handled = cur_cpu_spec->machine_check_early(regs);
>> +	/*
>> +	 * See if platform is capable of handling machine check.
>> +	 */
>> +	if (ppc_md.machine_check_early)
>> +		handled = ppc_md.machine_check_early(regs);
>>  	return handled;
>>  }
>>  
>> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
>> index cb796724a6fc..e89f675f1b5e 100644
>> --- a/arch/powerpc/mm/slb.c
>> +++ b/arch/powerpc/mm/slb.c
>> @@ -145,6 +145,12 @@ void slb_flush_and_rebolt(void)
>>  	get_paca()->slb_cache_ptr = 0;
>>  }
>>  
>> +void slb_flush_and_rebolt_realmode(void)
>> +{
>> +	__slb_flush_and_rebolt();
>> +	get_paca()->slb_cache_ptr = 0;
>> +}
>> +
>>  void slb_vmalloc_update(void)
>>  {
>>  	unsigned long vflags;
>> diff --git a/arch/powerpc/platforms/powernv/setup.c
>> b/arch/powerpc/platforms/powernv/setup.c index
>> f96df0a25d05..b74c93bc2e55 100644 ---
>> a/arch/powerpc/platforms/powernv/setup.c +++
>> b/arch/powerpc/platforms/powernv/setup.c @@ -431,6 +431,16 @@ static
>> unsigned long pnv_get_proc_freq(unsigned int cpu) return ret_freq;
>>  }
>>  
>> +static long pnv_machine_check_early(struct pt_regs *regs)
>> +{
>> +	long handled = 0;
>> +
>> +	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
>> +		handled = cur_cpu_spec->machine_check_early(regs);
>> +
>> +	return handled;
>> +}
>> +
>>  define_machine(powernv) {
>>  	.name			= "PowerNV",
>>  	.probe			= pnv_probe,
>> @@ -442,6 +452,7 @@ define_machine(powernv) {
>>  	.machine_shutdown	= pnv_shutdown,
>>  	.power_save             = NULL,
>>  	.calibrate_decr		= generic_calibrate_decr,
>> +	.machine_check_early	= pnv_machine_check_early,
>>  #ifdef CONFIG_KEXEC_CORE
>>  	.kexec_cpu_down		= pnv_kexec_cpu_down,
>>  #endif
>> diff --git a/arch/powerpc/platforms/pseries/pseries.h
>> b/arch/powerpc/platforms/pseries/pseries.h index
>> 60db2ee511fb..ec2a5f61d4a4 100644 ---
>> a/arch/powerpc/platforms/pseries/pseries.h +++
>> b/arch/powerpc/platforms/pseries/pseries.h @@ -24,6 +24,7 @@ struct
>> pt_regs; 
>>  extern int pSeries_system_reset_exception(struct pt_regs *regs);
>>  extern int pSeries_machine_check_exception(struct pt_regs *regs);
>> +extern long pSeries_machine_check_realmode(struct pt_regs *regs);
>>  
>>  #ifdef CONFIG_SMP
>>  extern void smp_init_pseries(void);
>> diff --git a/arch/powerpc/platforms/pseries/ras.c
>> b/arch/powerpc/platforms/pseries/ras.c index
>> 851ce326874a..e4420f7c8fda 100644 ---
>> a/arch/powerpc/platforms/pseries/ras.c +++
>> b/arch/powerpc/platforms/pseries/ras.c @@ -427,6 +427,35 @@ int
>> pSeries_system_reset_exception(struct pt_regs *regs) return 0; /*
>> need to perform reset */ }
>>  
>> +static int mce_handle_error(struct rtas_error_log *errp)
>> +{
>> +	struct pseries_errorlog *pseries_log;
>> +	struct pseries_mc_errorlog *mce_log;
>> +	int disposition = rtas_error_disposition(errp);
>> +	uint8_t error_type;
>> +
>> +	if (!rtas_error_extended(errp))
>> +		goto out;
>> +
>> +	pseries_log = get_pseries_errorlog(errp,
>> PSERIES_ELOG_SECT_ID_MCE);
>> +	if (pseries_log == NULL)
>> +		goto out;
>> +
>> +	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
>> +	error_type = rtas_mc_error_type(mce_log);
>> +
>> +	if ((disposition == RTAS_DISP_NOT_RECOVERED) &&
>> +			(error_type == PSERIES_MC_ERROR_TYPE_SLB)) {
>> +		/* Store the old slb content someplace. */
>> +		slb_flush_and_rebolt_realmode();
>> +		disposition = RTAS_DISP_FULLY_RECOVERED;
>> +		rtas_set_disposition_recovered(errp);
>> +	}
>> +
>> +out:
>> +	return disposition;
>> +}
>> +
>>  /*
>>   * Process MCE rtas errlog event.
>>   */
>> @@ -503,11 +532,31 @@ int pSeries_machine_check_exception(struct
>> pt_regs *regs) struct rtas_error_log *errp;
>>  
>>  	if (fwnmi_active) {
>> -		errp = fwnmi_get_errinfo(regs);
>>  		fwnmi_release_errinfo();
>> +		errp = fwnmi_get_errlog();
>>  		if (errp && recover_mce(regs, errp))
>>  			return 1;
>>  	}
>>  
>>  	return 0;
>>  }
>> +
>> +long pSeries_machine_check_realmode(struct pt_regs *regs)
>> +{
>> +	struct rtas_error_log *errp;
>> +	int disposition;
>> +
>> +	if (fwnmi_active) {
>> +		errp = fwnmi_get_errinfo(regs);
>> +		/*
>> +		 * Call to fwnmi_release_errinfo() in real mode
>> causes kernel
>> +		 * to panic. Hence we will call it as soon as we go
>> into
>> +		 * virtual mode.
>> +		 */
>> +		disposition = mce_handle_error(errp);
>> +		if (disposition == RTAS_DISP_FULLY_RECOVERED)
>> +			return 1;
>> +	}
>> +
>> +	return 0;
>> +}
>> diff --git a/arch/powerpc/platforms/pseries/setup.c
>> b/arch/powerpc/platforms/pseries/setup.c index
>> b42087cd8c6b..7a9421d089d8 100644 ---
>> a/arch/powerpc/platforms/pseries/setup.c +++
>> b/arch/powerpc/platforms/pseries/setup.c @@ -1000,6 +1000,7 @@
>> define_machine(pseries) { .calibrate_decr		=
>> generic_calibrate_decr, .progress		= rtas_progress,
>>  	.system_reset_exception = pSeries_system_reset_exception,
>> +	.machine_check_early	= pSeries_machine_check_realmode,
>>  	.machine_check_exception = pSeries_machine_check_exception,
>>  #ifdef CONFIG_KEXEC_CORE
>>  	.machine_kexec          = pSeries_machine_kexec,
>>
>>
>
Mahesh J Salgaonkar Aug. 10, 2018, 10:30 a.m. | #4
On 08/08/2018 02:34 PM, Nicholas Piggin wrote:
> On Tue, 07 Aug 2018 19:47:14 +0530
> Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:
> 
>> From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>>
>> On pseries, as of today system crashes if we get a machine check
>> exceptions due to SLB errors. These are soft errors and can be fixed by
>> flushing the SLBs so the kernel can continue to function instead of
>> system crash. We do this in real mode before turning on MMU. Otherwise
>> we would run into nested machine checks. This patch now fetches the
>> rtas error log in real mode and flushes the SLBs on SLB errors.
>>
>> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>> Signed-off-by: Michal Suchanek <msuchanek@suse.com>
>> ---
>>
>> Changes in V7:
>> - Fold Michal's patch into this patch.
>> - Handle MSR_RI=0 and evil context case in MC handler.
>> ---
> 
> 
>> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
>> index cb796724a6fc..e89f675f1b5e 100644
>> --- a/arch/powerpc/mm/slb.c
>> +++ b/arch/powerpc/mm/slb.c
>> @@ -145,6 +145,12 @@ void slb_flush_and_rebolt(void)
>>  	get_paca()->slb_cache_ptr = 0;
>>  }
>>  
>> +void slb_flush_and_rebolt_realmode(void)
>> +{
>> +	__slb_flush_and_rebolt();
>> +	get_paca()->slb_cache_ptr = 0;
>> +}
>> +
>>  void slb_vmalloc_update(void)
>>  {
>>  	unsigned long vflags;
> 
> Can you use this patch for the SLB flush?
> 
> https://patchwork.ozlabs.org/patch/953034/

Will use your v2.

Thanks,
-Mahesh.

> 
> Thanks,
> Nick
>

Patch

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 50ed64fba4ae..cc00a7088cf3 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -487,6 +487,7 @@  extern void hpte_init_native(void);
 
 extern void slb_initialize(void);
 extern void slb_flush_and_rebolt(void);
+extern void slb_flush_and_rebolt_realmode(void);
 
 extern void slb_vmalloc_update(void);
 extern void slb_set_size(u16 size);
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index a47de82fb8e2..b4831f1338db 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -108,6 +108,7 @@  struct machdep_calls {
 
 	/* Early exception handlers called in realmode */
 	int		(*hmi_exception_early)(struct pt_regs *regs);
+	long		(*machine_check_early)(struct pt_regs *regs);
 
 	/* Called during machine check exception to retrive fixup address. */
 	bool		(*mce_check_early_recovery)(struct pt_regs *regs);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 285c6465324a..cb06f219570a 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -332,6 +332,9 @@  TRAMP_REAL_BEGIN(machine_check_pSeries)
 machine_check_fwnmi:
 	SET_SCRATCH0(r13)		/* save r13 */
 	EXCEPTION_PROLOG_0(PACA_EXMC)
+BEGIN_FTR_SECTION
+	b	machine_check_pSeries_early
+END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
 machine_check_pSeries_0:
 	EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
 	/*
@@ -343,6 +346,90 @@  machine_check_pSeries_0:
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
 
+TRAMP_REAL_BEGIN(machine_check_pSeries_early)
+BEGIN_FTR_SECTION
+	EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
+	mr	r10,r1			/* Save r1 */
+	ld	r1,PACAMCEMERGSP(r13)	/* Use MC emergency stack */
+	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
+	mfspr	r11,SPRN_SRR0		/* Save SRR0 */
+	mfspr	r12,SPRN_SRR1		/* Save SRR1 */
+	EXCEPTION_PROLOG_COMMON_1()
+	EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
+	EXCEPTION_PROLOG_COMMON_3(0x200)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */
+	ld	r12,_MSR(r1)
+	andi.	r11,r12,MSR_PR		/* See if coming from user. */
+	bne	2f			/* continue in V mode if we are. */
+
+	/*
+	 * At this point we are not sure about what context we come from.
+	 * We may be in the middle of swithing stack. r1 may not be valid.
+	 * Hence stay on emergency stack, call machine_check_exception and
+	 * return from the interrupt.
+	 * But before that, check if this is an un-recoverable exception.
+	 * If yes, then stay on emergency stack and panic.
+	 */
+	andi.	r11,r12,MSR_RI
+	bne	1f
+
+	/*
+	 * Check if we have successfully handled/recovered from error, if not
+	 * then stay on emergency stack and panic.
+	 */
+	cmpdi	r3,0		/* see if we handled MCE successfully */
+	bne	1f		/* if handled then return from interrupt */
+
+	LOAD_HANDLER(r10,unrecover_mce)
+	mtspr	SPRN_SRR0,r10
+	ld	r10,PACAKMSR(r13)
+	/*
+	 * We are going down. But there are chances that we might get hit by
+	 * another MCE during panic path and we may run into unstable state
+	 * with no way out. Hence, turn ME bit off while going down, so that
+	 * when another MCE is hit during panic path, hypervisor will
+	 * power cycle the lpar, instead of getting into MCE loop.
+	 */
+	li	r3,MSR_ME
+	andc	r10,r10,r3		/* Turn off MSR_ME */
+	mtspr	SPRN_SRR1,r10
+	RFI_TO_KERNEL
+	b	.
+
+	/* Stay on emergency stack and return from interrupt. */
+1:	LOAD_HANDLER(r10,mce_return)
+	mtspr	SPRN_SRR0,r10
+	ld	r10,PACAKMSR(r13)
+	mtspr	SPRN_SRR1,r10
+	RFI_TO_KERNEL
+	b	.
+
+	/* Move original SRR0 and SRR1 into the respective regs */
+2:	ld	r9,_MSR(r1)
+	mtspr	SPRN_SRR1,r9
+	ld	r3,_NIP(r1)
+	mtspr	SPRN_SRR0,r3
+	ld	r9,_CTR(r1)
+	mtctr	r9
+	ld	r9,_XER(r1)
+	mtxer	r9
+	ld	r9,_LINK(r1)
+	mtlr	r9
+	REST_GPR(0, r1)
+	REST_8GPRS(2, r1)
+	REST_GPR(10, r1)
+	ld	r11,_CCR(r1)
+	mtcr	r11
+	REST_GPR(11, r1)
+	REST_2GPRS(12, r1)
+	/* restore original r1. */
+	ld	r1,GPR1(r1)
+	SET_SCRATCH0(r13)		/* save r13 */
+	EXCEPTION_PROLOG_0(PACA_EXMC)
+	b	machine_check_pSeries_0
+END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
+
 EXC_COMMON_BEGIN(machine_check_common)
 	/*
 	 * Machine check is different because we use a different
@@ -536,6 +623,31 @@  EXC_COMMON_BEGIN(unrecover_mce)
 	bl	unrecoverable_exception
 	b	1b
 
+EXC_COMMON_BEGIN(mce_return)
+	/* Invoke machine_check_exception to print MCE event and return. */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_exception
+	ld	r9,_MSR(r1)
+	mtspr	SPRN_SRR1,r9
+	ld	r3,_NIP(r1)
+	mtspr	SPRN_SRR0,r3
+	ld	r9,_CTR(r1)
+	mtctr	r9
+	ld	r9,_XER(r1)
+	mtxer	r9
+	ld	r9,_LINK(r1)
+	mtlr	r9
+	REST_GPR(0, r1)
+	REST_8GPRS(2, r1)
+	REST_GPR(10, r1)
+	ld	r11,_CCR(r1)
+	mtcr	r11
+	REST_GPR(11, r1)
+	REST_2GPRS(12, r1)
+	/* restore original r1. */
+	ld	r1,GPR1(r1)
+	RFI_TO_KERNEL
+	b	.
 
 EXC_REAL(data_access, 0x300, 0x80)
 EXC_VIRT(data_access, 0x4300, 0x80, 0x300)
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index efdd16a79075..ae17d8aa60c4 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -488,10 +488,19 @@  long machine_check_early(struct pt_regs *regs)
 {
 	long handled = 0;
 
-	__this_cpu_inc(irq_stat.mce_exceptions);
+	/*
+	 * For pSeries we count mce when we go into virtual mode machine
+	 * check handler. Hence skip it. Also, We can't access per cpu
+	 * variables in real mode for LPAR.
+	 */
+	if (early_cpu_has_feature(CPU_FTR_HVMODE))
+		__this_cpu_inc(irq_stat.mce_exceptions);
 
-	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
-		handled = cur_cpu_spec->machine_check_early(regs);
+	/*
+	 * See if platform is capable of handling machine check.
+	 */
+	if (ppc_md.machine_check_early)
+		handled = ppc_md.machine_check_early(regs);
 	return handled;
 }
 
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index cb796724a6fc..e89f675f1b5e 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -145,6 +145,12 @@  void slb_flush_and_rebolt(void)
 	get_paca()->slb_cache_ptr = 0;
 }
 
+void slb_flush_and_rebolt_realmode(void)
+{
+	__slb_flush_and_rebolt();
+	get_paca()->slb_cache_ptr = 0;
+}
+
 void slb_vmalloc_update(void)
 {
 	unsigned long vflags;
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index f96df0a25d05..b74c93bc2e55 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -431,6 +431,16 @@  static unsigned long pnv_get_proc_freq(unsigned int cpu)
 	return ret_freq;
 }
 
+static long pnv_machine_check_early(struct pt_regs *regs)
+{
+	long handled = 0;
+
+	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
+		handled = cur_cpu_spec->machine_check_early(regs);
+
+	return handled;
+}
+
 define_machine(powernv) {
 	.name			= "PowerNV",
 	.probe			= pnv_probe,
@@ -442,6 +452,7 @@  define_machine(powernv) {
 	.machine_shutdown	= pnv_shutdown,
 	.power_save             = NULL,
 	.calibrate_decr		= generic_calibrate_decr,
+	.machine_check_early	= pnv_machine_check_early,
 #ifdef CONFIG_KEXEC_CORE
 	.kexec_cpu_down		= pnv_kexec_cpu_down,
 #endif
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index 60db2ee511fb..ec2a5f61d4a4 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -24,6 +24,7 @@  struct pt_regs;
 
 extern int pSeries_system_reset_exception(struct pt_regs *regs);
 extern int pSeries_machine_check_exception(struct pt_regs *regs);
+extern long pSeries_machine_check_realmode(struct pt_regs *regs);
 
 #ifdef CONFIG_SMP
 extern void smp_init_pseries(void);
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 851ce326874a..e4420f7c8fda 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -427,6 +427,35 @@  int pSeries_system_reset_exception(struct pt_regs *regs)
 	return 0; /* need to perform reset */
 }
 
+static int mce_handle_error(struct rtas_error_log *errp)
+{
+	struct pseries_errorlog *pseries_log;
+	struct pseries_mc_errorlog *mce_log;
+	int disposition = rtas_error_disposition(errp);
+	uint8_t error_type;
+
+	if (!rtas_error_extended(errp))
+		goto out;
+
+	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+	if (pseries_log == NULL)
+		goto out;
+
+	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+	error_type = rtas_mc_error_type(mce_log);
+
+	if ((disposition == RTAS_DISP_NOT_RECOVERED) &&
+			(error_type == PSERIES_MC_ERROR_TYPE_SLB)) {
+		/* Store the old slb content someplace. */
+		slb_flush_and_rebolt_realmode();
+		disposition = RTAS_DISP_FULLY_RECOVERED;
+		rtas_set_disposition_recovered(errp);
+	}
+
+out:
+	return disposition;
+}
+
 /*
  * Process MCE rtas errlog event.
  */
@@ -503,11 +532,31 @@  int pSeries_machine_check_exception(struct pt_regs *regs)
 	struct rtas_error_log *errp;
 
 	if (fwnmi_active) {
-		errp = fwnmi_get_errinfo(regs);
 		fwnmi_release_errinfo();
+		errp = fwnmi_get_errlog();
 		if (errp && recover_mce(regs, errp))
 			return 1;
 	}
 
 	return 0;
 }
+
+long pSeries_machine_check_realmode(struct pt_regs *regs)
+{
+	struct rtas_error_log *errp;
+	int disposition;
+
+	if (fwnmi_active) {
+		errp = fwnmi_get_errinfo(regs);
+		/*
+		 * Call to fwnmi_release_errinfo() in real mode causes kernel
+		 * to panic. Hence we will call it as soon as we go into
+		 * virtual mode.
+		 */
+		disposition = mce_handle_error(errp);
+		if (disposition == RTAS_DISP_FULLY_RECOVERED)
+			return 1;
+	}
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index b42087cd8c6b..7a9421d089d8 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -1000,6 +1000,7 @@  define_machine(pseries) {
 	.calibrate_decr		= generic_calibrate_decr,
 	.progress		= rtas_progress,
 	.system_reset_exception = pSeries_system_reset_exception,
+	.machine_check_early	= pSeries_machine_check_realmode,
 	.machine_check_exception = pSeries_machine_check_exception,
 #ifdef CONFIG_KEXEC_CORE
 	.machine_kexec          = pSeries_machine_kexec,