diff mbox series

[12/16] npu2-opencapi: Detect PHY reset errors

Message ID 20190909123151.21944-13-fbarrat@linux.ibm.com
State Superseded
Headers show
Series opencapi: enable card reset and link retraining | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch master (470ffb5f29d741c3bed600f7bb7bf0cbb270e05a)
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot success Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot-dco success Signed-off-by present

Commit Message

Frederic Barrat Sept. 9, 2019, 12:31 p.m. UTC
PHY reset can fail! Though past problems are now fixed, let's handle
any future failure.

Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
---
 hw/npu2-hw-procedures.c | 13 ++++++++++---
 hw/npu2-opencapi.c      |  5 ++++-
 include/npu2.h          |  2 +-
 3 files changed, 15 insertions(+), 5 deletions(-)

Comments

Christophe Lombard Sept. 17, 2019, 10 a.m. UTC | #1
On 09/09/2019 14:31, Frederic Barrat wrote:
> PHY reset can fail! Though past problems are now fixed, let's handle
> any future failure.
> 
> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
> ---
>   hw/npu2-hw-procedures.c | 13 ++++++++++---
>   hw/npu2-opencapi.c      |  5 ++++-
>   include/npu2.h          |  2 +-
>   3 files changed, 15 insertions(+), 5 deletions(-)
> 

Maybe you could add an error message for when we hit this specific
issue ...

Reviewed-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
Frederic Barrat Sept. 17, 2019, 3:18 p.m. UTC | #2
Le 17/09/2019 à 12:00, christophe lombard a écrit :
> On 09/09/2019 14:31, Frederic Barrat wrote:
>> PHY reset can fail! Though past problems are now fixed, let's handle
>> any future failure.
>>
>> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
>> ---
>>   hw/npu2-hw-procedures.c | 13 ++++++++++---
>>   hw/npu2-opencapi.c      |  5 ++++-
>>   include/npu2.h          |  2 +-
>>   3 files changed, 15 insertions(+), 5 deletions(-)
>>
> 
> Maybe you could add an error message for when we hit this specific
> issue ...


Good point. We only log something if a procedure times out, but not when
it fails outright with an error. I'll add it.

   Fred


> 
> Reviewed-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
>
Andrew Donnellan Sept. 25, 2019, 3:25 p.m. UTC | #3
On 9/9/19 2:31 pm, Frederic Barrat wrote:
> PHY reset can fail! Though past problems are now fixed, let's handle
> any future failure.
> 
> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>

Error message would be nice. I think it should be fine to leave 
everything else in reset if the PHY reset fails.

Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com>

> ---
>   hw/npu2-hw-procedures.c | 13 ++++++++++---
>   hw/npu2-opencapi.c      |  5 ++++-
>   include/npu2.h          |  2 +-
>   3 files changed, 15 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/npu2-hw-procedures.c b/hw/npu2-hw-procedures.c
> index ad1627ae..8379cbbe 100644
> --- a/hw/npu2-hw-procedures.c
> +++ b/hw/npu2-hw-procedures.c
> @@ -1041,10 +1041,17 @@ void npu2_opencapi_phy_init(struct npu2_dev *dev)
>   	}
>   }
>   
> -void npu2_opencapi_phy_reset(struct npu2_dev *dev)
> +int npu2_opencapi_phy_reset(struct npu2_dev *dev)
>   {
> -	run_procedure(dev, 4); /* procedure_phy_reset */
> -	run_procedure(dev, 6); /* procedure_phy_rx_dccal */
> +	int rc;
> +
> +	rc = run_procedure(dev, 4); /* procedure_phy_reset */
> +	if (rc != PROCEDURE_COMPLETE)
> +		return -1;
> +	rc = run_procedure(dev, 6); /* procedure_phy_rx_dccal */
> +	if (rc != PROCEDURE_COMPLETE)
> +		return -1;
> +	return 0;
>   }
>   
>   void npu2_opencapi_phy_prbs31(struct npu2_dev *dev)
> diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
> index ead6f5fa..efec162d 100644
> --- a/hw/npu2-opencapi.c
> +++ b/hw/npu2-opencapi.c
> @@ -1187,6 +1187,7 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot)
>   	struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
>   	uint32_t chip_id = dev->npu->chip_id;
>   	uint8_t presence = 1;
> +	int rc;
>   
>   	switch (slot->state) {
>   	case OCAPI_SLOT_NORMAL:
> @@ -1216,7 +1217,9 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot)
>   		return pci_slot_set_sm_timeout(slot, msecs_to_tb(5));
>   
>   	case OCAPI_SLOT_FRESET_ASSERT_DELAY:
> -		npu2_opencapi_phy_reset(dev);
> +		rc = npu2_opencapi_phy_reset(dev);
> +		if (rc)
> +			return OPAL_HARDWARE;
>   		deassert_odl_reset(chip_id, dev->brick_index);
>   		deassert_adapter_reset(dev);
>   		pci_slot_set_state(slot,
> diff --git a/include/npu2.h b/include/npu2.h
> index 6b1063da..6171cd3c 100644
> --- a/include/npu2.h
> +++ b/include/npu2.h
> @@ -234,7 +234,7 @@ void npu2_clear_link_flag(struct npu2_dev *ndev, uint8_t flag);
>   uint32_t reset_ntl(struct npu2_dev *ndev);
>   extern int nv_zcal_nominal;
>   void npu2_opencapi_phy_init(struct npu2_dev *dev);
> -void npu2_opencapi_phy_reset(struct npu2_dev *dev);
> +int npu2_opencapi_phy_reset(struct npu2_dev *dev);
>   void npu2_opencapi_phy_prbs31(struct npu2_dev *dev);
>   void npu2_opencapi_bump_ui_lane(struct npu2_dev *dev);
>   int64_t npu2_freeze_status(struct phb *phb __unused,
>
diff mbox series

Patch

diff --git a/hw/npu2-hw-procedures.c b/hw/npu2-hw-procedures.c
index ad1627ae..8379cbbe 100644
--- a/hw/npu2-hw-procedures.c
+++ b/hw/npu2-hw-procedures.c
@@ -1041,10 +1041,17 @@  void npu2_opencapi_phy_init(struct npu2_dev *dev)
 	}
 }
 
-void npu2_opencapi_phy_reset(struct npu2_dev *dev)
+int npu2_opencapi_phy_reset(struct npu2_dev *dev)
 {
-	run_procedure(dev, 4); /* procedure_phy_reset */
-	run_procedure(dev, 6); /* procedure_phy_rx_dccal */
+	int rc;
+
+	rc = run_procedure(dev, 4); /* procedure_phy_reset */
+	if (rc != PROCEDURE_COMPLETE)
+		return -1;
+	rc = run_procedure(dev, 6); /* procedure_phy_rx_dccal */
+	if (rc != PROCEDURE_COMPLETE)
+		return -1;
+	return 0;
 }
 
 void npu2_opencapi_phy_prbs31(struct npu2_dev *dev)
diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
index ead6f5fa..efec162d 100644
--- a/hw/npu2-opencapi.c
+++ b/hw/npu2-opencapi.c
@@ -1187,6 +1187,7 @@  static int64_t npu2_opencapi_freset(struct pci_slot *slot)
 	struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
 	uint32_t chip_id = dev->npu->chip_id;
 	uint8_t presence = 1;
+	int rc;
 
 	switch (slot->state) {
 	case OCAPI_SLOT_NORMAL:
@@ -1216,7 +1217,9 @@  static int64_t npu2_opencapi_freset(struct pci_slot *slot)
 		return pci_slot_set_sm_timeout(slot, msecs_to_tb(5));
 
 	case OCAPI_SLOT_FRESET_ASSERT_DELAY:
-		npu2_opencapi_phy_reset(dev);
+		rc = npu2_opencapi_phy_reset(dev);
+		if (rc)
+			return OPAL_HARDWARE;
 		deassert_odl_reset(chip_id, dev->brick_index);
 		deassert_adapter_reset(dev);
 		pci_slot_set_state(slot,
diff --git a/include/npu2.h b/include/npu2.h
index 6b1063da..6171cd3c 100644
--- a/include/npu2.h
+++ b/include/npu2.h
@@ -234,7 +234,7 @@  void npu2_clear_link_flag(struct npu2_dev *ndev, uint8_t flag);
 uint32_t reset_ntl(struct npu2_dev *ndev);
 extern int nv_zcal_nominal;
 void npu2_opencapi_phy_init(struct npu2_dev *dev);
-void npu2_opencapi_phy_reset(struct npu2_dev *dev);
+int npu2_opencapi_phy_reset(struct npu2_dev *dev);
 void npu2_opencapi_phy_prbs31(struct npu2_dev *dev);
 void npu2_opencapi_bump_ui_lane(struct npu2_dev *dev);
 int64_t npu2_freeze_status(struct phb *phb __unused,