capi: CAPP recovery

Message ID 1505322189-3661-1-git-send-email-clombard@linux.vnet.ibm.com
State Superseded
Headers show
Series
  • capi: CAPP recovery
Related show

Commit Message

christophe lombard Sept. 13, 2017, 5:03 p.m.
CAPP recovery is initiated when a CAPP Machine Check is detected.
The capp recovery procedure is initiated via a Hypervisor Maintenance
interrupt (HMI).

CAPP Machine Check may arise from either an error that results in a PHB
freeze or from an internal CAPP error with CAPP checkstop FIR action.
An error that causes a PHB freeze will result in the link down signal
being asserted. The system continues running and the CAPP and PSL will
be re-initialized.

Tests performed on some of the old/new hardware.

Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
---
 hw/phb4.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 55 insertions(+), 5 deletions(-)

Comments

Frederic Barrat Sept. 15, 2017, 10:16 a.m. | #1
Le 13/09/2017 à 19:03, Christophe Lombard a écrit :
> CAPP recovery is initiated when a CAPP Machine Check is detected.
> The capp recovery procedure is initiated via a Hypervisor Maintenance
> interrupt (HMI).
> 
> CAPP Machine Check may arise from either an error that results in a PHB
> freeze or from an internal CAPP error with CAPP checkstop FIR action.
> An error that causes a PHB freeze will result in the link down signal
> being asserted. The system continues running and the CAPP and PSL will
> be re-initialized.
> 
> Tests performed on some of the old/new hardware.
> 
> Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
> ---

Looks ok to me.
Reviewed-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>


>   hw/phb4.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
>   1 file changed, 55 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/phb4.c b/hw/phb4.c
> index c469797..69920b9 100644
> --- a/hw/phb4.c
> +++ b/hw/phb4.c
> @@ -2786,6 +2786,27 @@ static int64_t load_capp_ucode(struct phb4 *p)
>   	return rc;
>   }
> 
> +static void do_capp_recovery_scoms(struct phb4 *p)
> +{
> +	uint64_t reg;
> +	uint32_t offset;
> +
> +	PHBDBG(p, "Doing CAPP recovery scoms\n");
> +
> +	offset = PHB4_CAPP_REG_OFFSET(p);
> +	/* disable snoops */
> +	xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0);
> +	load_capp_ucode(p);
> +	/* clear err rpt reg*/
> +	xscom_write(p->chip_id, CAPP_ERR_RPT_CLR + offset, 0);
> +	/* clear capp fir */
> +	xscom_write(p->chip_id, CAPP_FIR + offset, 0);
> +
> +	xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
> +	reg &= ~(PPC_BIT(0) | PPC_BIT(1));
> +	xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg);
> +}
> +
>   static int64_t phb4_creset(struct pci_slot *slot)
>   {
>   	struct phb4 *p = phb_to_phb4(slot->phb);
> @@ -2800,11 +2821,9 @@ static int64_t phb4_creset(struct pci_slot *slot)
>   	case PHB4_SLOT_CRESET_START:
>   		PHBDBG(p, "CRESET: Starts\n");
> 
> -		/* do steps 3-5 of capp recovery procedure */
> -#if 0
> +		/* capp recovery */
>   		if (p->flags & PHB4_CAPP_RECOVERY)
>   			do_capp_recovery_scoms(p);
> -#endif
> 
>   		phb4_prepare_link_change(slot, false);
>   		/* Clear error inject register, preventing recursive errors */
> @@ -3517,6 +3536,10 @@ static void phb4_init_capp_regs(struct phb4 *p)
>   	xscom_read(p->chip_id, SNOOP_CONTROL + offset, &reg);
>   	reg |= PPC_BIT(0);
>   	reg |= PPC_BIT(35);
> +	reg |= PPC_BIT(45);
> +	reg |= PPC_BIT(46);
> +	reg |= PPC_BIT(47);
> +	reg |= PPC_BIT(50);
>   	xscom_write(p->chip_id, SNOOP_CONTROL + offset, reg);
> 
>   	/* Transport Control Register */
> @@ -3532,6 +3555,7 @@ static void phb4_init_capp_regs(struct phb4 *p)
>   		reg |= PPC_BIT(11); /* Set CI Store Buffer Threshold=5 */
>   		reg |= PPC_BIT(13); /* Set CI Store Buffer Threshold=5 */
>   	}
> +	reg |= PPC_BIT(15);
>   	reg &= ~PPC_BITMASK(14, 17); /* Set Max LPC CI store buffer to zeros */
>   	reg |= PPC_BIT(60); /* Set lowest CI Store buffer used bits */
>   	xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, reg);
> @@ -3547,12 +3571,12 @@ static void phb4_init_capp_regs(struct phb4 *p)
> 
>   	/* Flush SUE State Map Register */
>   	xscom_write(p->chip_id, FLUSH_SUE_STATE_MAP + offset,
> -		    0x1DCF5F6600000000);
> +		    0x08020A0000000000);
> 
>   	if (p->rev == PHB4_REV_NIMBUS_DD20) {
>   		/* Flush SUE uOP1 Register */
>   		xscom_write(p->chip_id, FLUSH_SUE_UOP1 + offset,
> -			    0xE310280428000000);
> +			    0xDCE0280428000000);
>   	}
> 
>   	/* capp owns PHB read buffers */
> @@ -3570,6 +3594,15 @@ static void phb4_init_capp_regs(struct phb4 *p)
>   			    0xFFFFF00E00000000);
>   	}
> 
> +	/* CAPP FIR Action 0 */
> +	xscom_write(p->chip_id, CAPP_FIR_ACTION0 + offset, 0x0b1c000104060000);
> +
> +	/* CAPP FIR Action 1 */
> +	xscom_write(p->chip_id, CAPP_FIR_ACTION1 + offset, 0x2b9c0001240E0000);
> +
> +	/* CAPP FIR MASK */
> +	xscom_write(p->chip_id, CAPP_FIR_MASK + offset, 0x80031f98d8737000);
> +
>   	/* Deassert TLBI_FENCED and tlbi_psl_is_dead */
>   	xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, 0);
>   }
> @@ -3921,6 +3954,22 @@ static void phb4_set_p2p(struct phb *phb, uint64_t mode, uint64_t flags,
>   	}
>   }
> 
> +static int64_t phb4_set_capp_recovery(struct phb *phb)
> +{
> +	struct phb4 *p = phb_to_phb4(phb);
> +
> +	if (p->flags & PHB4_CAPP_RECOVERY)
> +		return 0;
> +
> +	/* set opal event flag to indicate eeh condition */
> +	opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
> +				OPAL_EVENT_PCI_ERROR);
> +
> +	p->flags |= PHB4_CAPP_RECOVERY;
> +
> +	return 0;
> +}
> +
>   static const struct phb_ops phb4_ops = {
>   	.cfg_read8		= phb4_pcicfg_read8,
>   	.cfg_read16		= phb4_pcicfg_read16,
> @@ -3955,6 +4004,7 @@ static const struct phb_ops phb4_ops = {
>   	.tce_kill		= phb4_tce_kill,
>   	.set_capi_mode		= phb4_set_capi_mode,
>   	.set_p2p		= phb4_set_p2p,
> +	.set_capp_recovery	= phb4_set_capp_recovery,
>   };
> 
>   static void phb4_init_ioda3(struct phb4 *p)
>
Vaibhav Jain Sept. 18, 2017, 6:13 a.m. | #2
Hi Christophe,

Minor review comments:

Christophe Lombard <clombard@linux.vnet.ibm.com> writes:
> +
> +	xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
> +	reg &= ~(PPC_BIT(0) | PPC_BIT(1));
> +	xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg);
Please use xscom_write_mask instead.

>  	/* Transport Control Register */
> @@ -3532,6 +3555,7 @@ static void phb4_init_capp_regs(struct phb4 *p)
>  		reg |= PPC_BIT(11); /* Set CI Store Buffer Threshold=5 */
>  		reg |= PPC_BIT(13); /* Set CI Store Buffer Threshold=5 */
>  	}
> +	reg |= PPC_BIT(15);
>  	reg &= ~PPC_BITMASK(14, 17); /* Set Max LPC CI store buffer to zeros */
And mask is resetting the BIT(15). Also should field be set to b0101 as
per workbook.
christophe lombard Sept. 18, 2017, 3:16 p.m. | #3
Le 18/09/2017 à 08:13, Vaibhav Jain a écrit :
> Hi Christophe,
>
> Minor review comments:
>
> Christophe Lombard <clombard@linux.vnet.ibm.com> writes:
>> +
>> +	xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
>> +	reg &= ~(PPC_BIT(0) | PPC_BIT(1));
>> +	xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg);
> Please use xscom_write_mask instead.

I would prefer keep this method, because it's similar compared what we 
have on phb3.

>>   	/* Transport Control Register */
>> @@ -3532,6 +3555,7 @@ static void phb4_init_capp_regs(struct phb4 *p)
>>   		reg |= PPC_BIT(11); /* Set CI Store Buffer Threshold=5 */
>>   		reg |= PPC_BIT(13); /* Set CI Store Buffer Threshold=5 */
>>   	}
>> +	reg |= PPC_BIT(15);
>>   	reg &= ~PPC_BITMASK(14, 17); /* Set Max LPC CI store buffer to zeros */
> And mask is resetting the BIT(15).

correct.

> Also should field be set to b0101 as
> per workbook.

You are right: b’0101’ = use 6 buffers, but we follow Oren's init.

#Set Max LPC CI store buffer to zeros - bits[14:17] = 0
putscom pu 0401081C  $2 -ib 14 4 0000

Patch

diff --git a/hw/phb4.c b/hw/phb4.c
index c469797..69920b9 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -2786,6 +2786,27 @@  static int64_t load_capp_ucode(struct phb4 *p)
 	return rc;
 }
 
+static void do_capp_recovery_scoms(struct phb4 *p)
+{
+	uint64_t reg;
+	uint32_t offset;
+
+	PHBDBG(p, "Doing CAPP recovery scoms\n");
+
+	offset = PHB4_CAPP_REG_OFFSET(p);
+	/* disable snoops */
+	xscom_write(p->chip_id, SNOOP_CAPI_CONFIG + offset, 0);
+	load_capp_ucode(p);
+	/* clear err rpt reg*/
+	xscom_write(p->chip_id, CAPP_ERR_RPT_CLR + offset, 0);
+	/* clear capp fir */
+	xscom_write(p->chip_id, CAPP_FIR + offset, 0);
+
+	xscom_read(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, &reg);
+	reg &= ~(PPC_BIT(0) | PPC_BIT(1));
+	xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, reg);
+}
+
 static int64_t phb4_creset(struct pci_slot *slot)
 {
 	struct phb4 *p = phb_to_phb4(slot->phb);
@@ -2800,11 +2821,9 @@  static int64_t phb4_creset(struct pci_slot *slot)
 	case PHB4_SLOT_CRESET_START:
 		PHBDBG(p, "CRESET: Starts\n");
 
-		/* do steps 3-5 of capp recovery procedure */
-#if 0
+		/* capp recovery */
 		if (p->flags & PHB4_CAPP_RECOVERY)
 			do_capp_recovery_scoms(p);
-#endif
 
 		phb4_prepare_link_change(slot, false);
 		/* Clear error inject register, preventing recursive errors */
@@ -3517,6 +3536,10 @@  static void phb4_init_capp_regs(struct phb4 *p)
 	xscom_read(p->chip_id, SNOOP_CONTROL + offset, &reg);
 	reg |= PPC_BIT(0);
 	reg |= PPC_BIT(35);
+	reg |= PPC_BIT(45);
+	reg |= PPC_BIT(46);
+	reg |= PPC_BIT(47);
+	reg |= PPC_BIT(50);
 	xscom_write(p->chip_id, SNOOP_CONTROL + offset, reg);
 
 	/* Transport Control Register */
@@ -3532,6 +3555,7 @@  static void phb4_init_capp_regs(struct phb4 *p)
 		reg |= PPC_BIT(11); /* Set CI Store Buffer Threshold=5 */
 		reg |= PPC_BIT(13); /* Set CI Store Buffer Threshold=5 */
 	}
+	reg |= PPC_BIT(15);
 	reg &= ~PPC_BITMASK(14, 17); /* Set Max LPC CI store buffer to zeros */
 	reg |= PPC_BIT(60); /* Set lowest CI Store buffer used bits */
 	xscom_write(p->chip_id, TRANSPORT_CONTROL + offset, reg);
@@ -3547,12 +3571,12 @@  static void phb4_init_capp_regs(struct phb4 *p)
 
 	/* Flush SUE State Map Register */
 	xscom_write(p->chip_id, FLUSH_SUE_STATE_MAP + offset,
-		    0x1DCF5F6600000000);
+		    0x08020A0000000000);
 
 	if (p->rev == PHB4_REV_NIMBUS_DD20) {
 		/* Flush SUE uOP1 Register */
 		xscom_write(p->chip_id, FLUSH_SUE_UOP1 + offset,
-			    0xE310280428000000);
+			    0xDCE0280428000000);
 	}
 
 	/* capp owns PHB read buffers */
@@ -3570,6 +3594,15 @@  static void phb4_init_capp_regs(struct phb4 *p)
 			    0xFFFFF00E00000000);
 	}
 
+	/* CAPP FIR Action 0 */
+	xscom_write(p->chip_id, CAPP_FIR_ACTION0 + offset, 0x0b1c000104060000);
+
+	/* CAPP FIR Action 1 */
+	xscom_write(p->chip_id, CAPP_FIR_ACTION1 + offset, 0x2b9c0001240E0000);
+
+	/* CAPP FIR MASK */
+	xscom_write(p->chip_id, CAPP_FIR_MASK + offset, 0x80031f98d8737000);
+
 	/* Deassert TLBI_FENCED and tlbi_psl_is_dead */
 	xscom_write(p->chip_id, CAPP_ERR_STATUS_CTRL + offset, 0);
 }
@@ -3921,6 +3954,22 @@  static void phb4_set_p2p(struct phb *phb, uint64_t mode, uint64_t flags,
 	}
 }
 
+static int64_t phb4_set_capp_recovery(struct phb *phb)
+{
+	struct phb4 *p = phb_to_phb4(phb);
+
+	if (p->flags & PHB4_CAPP_RECOVERY)
+		return 0;
+
+	/* set opal event flag to indicate eeh condition */
+	opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+				OPAL_EVENT_PCI_ERROR);
+
+	p->flags |= PHB4_CAPP_RECOVERY;
+
+	return 0;
+}
+
 static const struct phb_ops phb4_ops = {
 	.cfg_read8		= phb4_pcicfg_read8,
 	.cfg_read16		= phb4_pcicfg_read16,
@@ -3955,6 +4004,7 @@  static const struct phb_ops phb4_ops = {
 	.tce_kill		= phb4_tce_kill,
 	.set_capi_mode		= phb4_set_capi_mode,
 	.set_p2p		= phb4_set_p2p,
+	.set_capp_recovery	= phb4_set_capp_recovery,
 };
 
 static void phb4_init_ioda3(struct phb4 *p)