diff mbox series

phb4: Don't probe a PHB if its garded

Message ID 20180821101405.22032-1-vaibhav@linux.ibm.com
State Superseded
Headers show
Series phb4: Don't probe a PHB if its garded | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success master/apply_patch Successfully applied
snowpatch_ozlabs/make_check success Test make_check on branch master

Commit Message

Vaibhav Jain Aug. 21, 2018, 10:14 a.m. UTC
Presently phb4_probe_stack() causes an exception while trying to probe
a PHB if its garded. This causes skiboot to go into a reboot loop with
following exception log:

 ***********************************************
 Fatal MCE at 000000003006ecd4   .probe_phb4+0x570
 CFAR : 00000000300b98a0
 <snip>
 Aborting!
CPU 0018 Backtrace:
 S: 0000000031cc37e0 R: 000000003001a51c   ._abort+0x4c
 S: 0000000031cc3860 R: 0000000030028170   .exception_entry+0x180
 S: 0000000031cc3a40 R: 0000000000001f10 *
 S: 0000000031cc3c20 R: 000000003006ecb0   .probe_phb4+0x54c
 S: 0000000031cc3e30 R: 0000000030014ca4   .main_cpu_entry+0x5b0
 S: 0000000031cc3f00 R: 0000000030002700   boot_entry+0x1b8

This is caused as phb4_probe_stack() will ignore all xscom read/write
errors to enable PHB Bars and then tries to perform an mmio to read
PHB Version registers that cause the fatal MCE.

We fix this by ignoring the PHB probe if the first xscom_write() to
populate the PHB Bar register fails, which indicates that there is
something wrong with the PHB.

Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
---
 hw/phb4.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

Comments

Oliver O'Halloran Aug. 22, 2018, 5:29 a.m. UTC | #1
On Tue, Aug 21, 2018 at 8:14 PM, Vaibhav Jain <vaibhav@linux.ibm.com> wrote:
> Presently phb4_probe_stack() causes an exception while trying to probe
> a PHB if its garded. This causes skiboot to go into a reboot loop with
> following exception log:
>
>  ***********************************************
>  Fatal MCE at 000000003006ecd4   .probe_phb4+0x570
>  CFAR : 00000000300b98a0
>  <snip>
>  Aborting!
> CPU 0018 Backtrace:
>  S: 0000000031cc37e0 R: 000000003001a51c   ._abort+0x4c
>  S: 0000000031cc3860 R: 0000000030028170   .exception_entry+0x180
>  S: 0000000031cc3a40 R: 0000000000001f10 *
>  S: 0000000031cc3c20 R: 000000003006ecb0   .probe_phb4+0x54c
>  S: 0000000031cc3e30 R: 0000000030014ca4   .main_cpu_entry+0x5b0
>  S: 0000000031cc3f00 R: 0000000030002700   boot_entry+0x1b8
>
> This is caused as phb4_probe_stack() will ignore all xscom read/write
> errors to enable PHB Bars and then tries to perform an mmio to read
> PHB Version registers that cause the fatal MCE.
>
> We fix this by ignoring the PHB probe if the first xscom_write() to
> populate the PHB Bar register fails, which indicates that there is
> something wrong with the PHB.

There's a fix for this in upstream hostboot too. Unfortunately, the HB
version in op-build hasn't been bumped in months due to compiler
issues so this is probably the best way to fix this bug.

Reviewed-by: Oliver O'Halloran <oohall@gmail.com>

> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>

This should probably be CCed to stable so it goes into the sable builds too.

> ---
>  hw/phb4.c | 13 +++++++++++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/hw/phb4.c b/hw/phb4.c
> index d1245dce..89b0b859 100644
> --- a/hw/phb4.c
> +++ b/hw/phb4.c
> @@ -5546,6 +5546,7 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
>         char *path;
>         uint64_t capp_ucode_base;
>         unsigned int max_link_speed;
> +       int rc;
>
>         gcid = dt_get_chip_id(stk_node);
>         stk_index = dt_prop_get_u32(stk_node, "reg");
> @@ -5567,9 +5568,17 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
>
>         /* Initialize PHB register BAR */
>         phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, NULL);
> -       xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR, phb_bar << 8);
> -       bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
> +       rc = xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR,
> +                        phb_bar << 8);
> +
> +       /* A scom error here probably indicates a defective/garded PHB */
> +       if (rc != OPAL_SUCCESS) {
> +               prerror("PHB[%d:%d] Unable to set PHB BAR. Error=%d\n",
> +                     gcid, phb_num, rc);
> +               return;

Setting status = "broken" in the device-tree might also be a good idea.

> +       }
>
> +       bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
>
>         /* Same with INT BAR (ESB) */
>         phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, NULL);
> --
> 2.17.1
>
> _______________________________________________
> Skiboot mailing list
> Skiboot@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot
Stewart Smith Sept. 13, 2018, 8:50 a.m. UTC | #2
Oliver <oohall@gmail.com> writes:
> On Tue, Aug 21, 2018 at 8:14 PM, Vaibhav Jain <vaibhav@linux.ibm.com> wrote:
>> Presently phb4_probe_stack() causes an exception while trying to probe
>> a PHB if its garded. This causes skiboot to go into a reboot loop with
>> following exception log:
>>
>>  ***********************************************
>>  Fatal MCE at 000000003006ecd4   .probe_phb4+0x570
>>  CFAR : 00000000300b98a0
>>  <snip>
>>  Aborting!
>> CPU 0018 Backtrace:
>>  S: 0000000031cc37e0 R: 000000003001a51c   ._abort+0x4c
>>  S: 0000000031cc3860 R: 0000000030028170   .exception_entry+0x180
>>  S: 0000000031cc3a40 R: 0000000000001f10 *
>>  S: 0000000031cc3c20 R: 000000003006ecb0   .probe_phb4+0x54c
>>  S: 0000000031cc3e30 R: 0000000030014ca4   .main_cpu_entry+0x5b0
>>  S: 0000000031cc3f00 R: 0000000030002700   boot_entry+0x1b8
>>
>> This is caused as phb4_probe_stack() will ignore all xscom read/write
>> errors to enable PHB Bars and then tries to perform an mmio to read
>> PHB Version registers that cause the fatal MCE.
>>
>> We fix this by ignoring the PHB probe if the first xscom_write() to
>> populate the PHB Bar register fails, which indicates that there is
>> something wrong with the PHB.
>
> There's a fix for this in upstream hostboot too. Unfortunately, the HB
> version in op-build hasn't been bumped in months due to compiler
> issues so this is probably the best way to fix this bug.
>
> Reviewed-by: Oliver O'Halloran <oohall@gmail.com>
>
>> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
>
> This should probably be CCed to stable so it goes into the sable builds too.
>
>> ---
>>  hw/phb4.c | 13 +++++++++++--
>>  1 file changed, 11 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/phb4.c b/hw/phb4.c
>> index d1245dce..89b0b859 100644
>> --- a/hw/phb4.c
>> +++ b/hw/phb4.c
>> @@ -5546,6 +5546,7 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
>>         char *path;
>>         uint64_t capp_ucode_base;
>>         unsigned int max_link_speed;
>> +       int rc;
>>
>>         gcid = dt_get_chip_id(stk_node);
>>         stk_index = dt_prop_get_u32(stk_node, "reg");
>> @@ -5567,9 +5568,17 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
>>
>>         /* Initialize PHB register BAR */
>>         phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, NULL);
>> -       xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR, phb_bar << 8);
>> -       bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
>> +       rc = xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR,
>> +                        phb_bar << 8);
>> +
>> +       /* A scom error here probably indicates a defective/garded PHB */
>> +       if (rc != OPAL_SUCCESS) {
>> +               prerror("PHB[%d:%d] Unable to set PHB BAR. Error=%d\n",
>> +                     gcid, phb_num, rc);
>> +               return;
>
> Setting status = "broken" in the device-tree might also be a good
> idea.

Yeah, it would be. I'll take it in a follow-up patch though.

We really need to get some good degraded mode testing into op-test.
Stewart Smith Sept. 13, 2018, 8:50 a.m. UTC | #3
Vaibhav Jain <vaibhav@linux.ibm.com> writes:
> Presently phb4_probe_stack() causes an exception while trying to probe
> a PHB if its garded. This causes skiboot to go into a reboot loop with
> following exception log:
>
>  ***********************************************
>  Fatal MCE at 000000003006ecd4   .probe_phb4+0x570
>  CFAR : 00000000300b98a0
>  <snip>
>  Aborting!
> CPU 0018 Backtrace:
>  S: 0000000031cc37e0 R: 000000003001a51c   ._abort+0x4c
>  S: 0000000031cc3860 R: 0000000030028170   .exception_entry+0x180
>  S: 0000000031cc3a40 R: 0000000000001f10 *
>  S: 0000000031cc3c20 R: 000000003006ecb0   .probe_phb4+0x54c
>  S: 0000000031cc3e30 R: 0000000030014ca4   .main_cpu_entry+0x5b0
>  S: 0000000031cc3f00 R: 0000000030002700   boot_entry+0x1b8
>
> This is caused as phb4_probe_stack() will ignore all xscom read/write
> errors to enable PHB Bars and then tries to perform an mmio to read
> PHB Version registers that cause the fatal MCE.
>
> We fix this by ignoring the PHB probe if the first xscom_write() to
> populate the PHB Bar register fails, which indicates that there is
> something wrong with the PHB.
>
> Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
> ---
>  hw/phb4.c | 13 +++++++++++--
>  1 file changed, 11 insertions(+), 2 deletions(-)

Thanks, merged to master as of 1520d6a1e3aaec74228d213083b68da70729121a
and to 6.0.x as of 54ac06ff404bf13d0a8035985ff67e0610213f5a
diff mbox series

Patch

diff --git a/hw/phb4.c b/hw/phb4.c
index d1245dce..89b0b859 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -5546,6 +5546,7 @@  static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	char *path;
 	uint64_t capp_ucode_base;
 	unsigned int max_link_speed;
+	int rc;
 
 	gcid = dt_get_chip_id(stk_node);
 	stk_index = dt_prop_get_u32(stk_node, "reg");
@@ -5567,9 +5568,17 @@  static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 
 	/* Initialize PHB register BAR */
 	phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, NULL);
-	xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR, phb_bar << 8);
-	bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
+	rc = xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR,
+			 phb_bar << 8);
+
+	/* A scom error here probably indicates a defective/garded PHB */
+	if (rc != OPAL_SUCCESS) {
+		prerror("PHB[%d:%d] Unable to set PHB BAR. Error=%d\n",
+		      gcid, phb_num, rc);
+		return;
+	}
 
+	bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
 
 	/* Same with INT BAR (ESB) */
 	phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, NULL);