diff mbox series

[10/16] npu2-opencapi: Tweak fundamental reset sequence

Message ID 20190909123151.21944-11-fbarrat@linux.ibm.com
State Superseded
Headers show
Series opencapi: enable card reset and link retraining | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch master (470ffb5f29d741c3bed600f7bb7bf0cbb270e05a)
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot success Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot-dco success Signed-off-by present

Commit Message

Frederic Barrat Sept. 9, 2019, 12:31 p.m. UTC
Slightly modify the ordering of a few steps in our init sequence for
fundamental reset, so that it can be called from the OS when the link
is already up:

- when the card is reset, the link goes down, so we need to fence the
  brick to prevent errors propagating to the NPU and OS
- since fencing and unfencing don't require any delay, let's also
  fence/unfence during the very first reset at boot. It's unnecessary
  there, but it doesn't hurt and keeps the code simpler.
- resetting the PHY must be done a bit later, while fenced and the ODL
  and DLx in reset

Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
---
 hw/npu2-opencapi.c | 48 +++++++++++++++++++++++++---------------------
 include/npu2.h     |  2 --
 2 files changed, 26 insertions(+), 24 deletions(-)

Comments

Christophe Lombard Sept. 17, 2019, 9:45 a.m. UTC | #1
On 09/09/2019 14:31, Frederic Barrat wrote:
> Modify slightly the ordering of a few steps in our init sequence on
> fundamental reset, so that it can be called from the OS, when the link
> is already up:
> 
> - when the card is reset, the link goes down, so we need to fence the
>    brick to prevent errors propagating to the NPU and OS
> - since fencing and unfencing don't require any delay, let's also
>    fence/unfence during the very first reset at boot. It's useless but
>    doesn't hurt and keep the code simpler.
> - resetting the PHY must be done a bit later, while fenced and the ODL
>    and DLx in reset
> 
> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
> ---
>   hw/npu2-opencapi.c | 48 +++++++++++++++++++++++++---------------------
>   include/npu2.h     |  2 --
>   2 files changed, 26 insertions(+), 24 deletions(-)
> 
> diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
> index f7be9f09..619d4be8 100644
> --- a/hw/npu2-opencapi.c
> +++ b/hw/npu2-opencapi.c
> @@ -1039,6 +1039,28 @@ static int64_t npu2_opencapi_get_presence_state(struct pci_slot __unused *slot,
>   	return OPAL_SUCCESS;
>   }
>   
> +static void fence_brick(struct npu2_dev *dev)
> +{
> +	OCAPIDBG(dev, "Fencing brick\n");
> +	set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
> +			  dev->brick_index, 0b11);
> +	/* from 13.2.1, Quiesce Fence State */
> +	npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
> +		   PPC_BIT(dev->brick_index + 6));
> +}
> +
> +static void unfence_brick(struct npu2_dev *dev)
> +{
> +	OCAPIDBG(dev, "Unfencing brick\n");
> +	npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
> +		   PPC_BIT(dev->brick_index));
> +
> +	set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
> +			  dev->brick_index, 0b10);

Does unfencing the brick imply resetting the PowerBus (0b10)?
Hasn't this already been done in fence_brick()?

> +	set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
> +			  dev->brick_index, 0b00);
> +}
> +
>   static enum OpalShpcLinkState get_link_width(uint64_t odl_status)
>   {
>   	uint64_t tx_lanes, rx_lanes, state;
> @@ -1153,7 +1175,7 @@ static int64_t npu2_opencapi_poll_link(struct pci_slot *slot)
>   	return OPAL_HARDWARE;
>   }
>   
> -static int64_t npu2_opencapi_creset(struct pci_slot *slot __unused)
> +static int64_t npu2_opencapi_creset(struct pci_slot *slot)
>   {
>   	struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
>   
> @@ -1183,19 +1205,10 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot)
>   			OCAPIINF(dev, "no card detected\n");
>   			return OPAL_SUCCESS;
>   		}
> -		if (dev->train_need_fence) {
> -			OCAPIDBG(dev, "Fencing OTL during reset\n");
> -			set_fence_control(chip_id, dev->npu->xscom_base,
> -					dev->brick_index, 0b11);
> -			npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
> -				PPC_BIT(dev->brick_index + 6));
> -			dev->train_fenced = true;
> -		}
> -		dev->train_need_fence = true;
>   		slot->link_retries = OCAPI_LINK_TRAINING_RETRIES;
> -		npu2_opencapi_phy_reset(dev);
>   		/* fall-through */
>   	case OCAPI_SLOT_FRESET_INIT:
> +		fence_brick(dev);
>   		assert_odl_reset(chip_id, dev->brick_index);
>   		assert_adapter_reset(dev);
>   		pci_slot_set_state(slot,
> @@ -1204,6 +1217,7 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot)
>   		return pci_slot_set_sm_timeout(slot, msecs_to_tb(5));
>   
>   	case OCAPI_SLOT_FRESET_ASSERT_DELAY:
> +		npu2_opencapi_phy_reset(dev);
>   		deassert_odl_reset(chip_id, dev->brick_index);
>   		pci_slot_set_state(slot,
>   				OCAPI_SLOT_FRESET_DEASSERT_DELAY);
> @@ -1221,15 +1235,7 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot)
>   		return pci_slot_set_sm_timeout(slot, msecs_to_tb(250));
>   
>   	case OCAPI_SLOT_FRESET_DEASSERT_DELAY2:
> -		if (dev->train_fenced) {
> -			OCAPIDBG(dev, "Unfencing OTL after reset\n");
> -			npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
> -				   PPC_BIT(dev->brick_index));
> -			set_fence_control(chip_id, dev->npu->xscom_base,
> -					  dev->brick_index, 0b00);
> -			dev->train_fenced = false;
> -		}
> -
> +		unfence_brick(dev);
>   		set_init_pattern(chip_id, dev);
>   		pci_slot_set_state(slot,
>   				OCAPI_SLOT_FRESET_INIT_DELAY);
> @@ -1692,8 +1698,6 @@ static void setup_device(struct npu2_dev *dev)
>   
>   	dev->bdfn = 0;
>   	dev->linux_pe = -1;
> -	dev->train_need_fence = false;
> -	dev->train_fenced = false;
>   
>   	/* TODO: Procedure 13.1.3.7 - AFU Memory Range BARs */
>   	/* Procedure 13.1.3.8 - AFU MMIO Range BARs */
> diff --git a/include/npu2.h b/include/npu2.h
> index d2316dc1..6b1063da 100644
> --- a/include/npu2.h
> +++ b/include/npu2.h
> @@ -145,8 +145,6 @@ struct npu2_dev {
>   	/* OpenCAPI */
>   	struct phb		phb_ocapi;
>   	uint64_t		linux_pe;
> -	bool			train_need_fence;
> -	bool			train_fenced;
>   	unsigned long		train_start;
>   	unsigned long		train_timeout;
>   };
>
Frederic Barrat Sept. 17, 2019, 3:12 p.m. UTC | #2
Le 17/09/2019 à 11:45, christophe lombard a écrit :
> On 09/09/2019 14:31, Frederic Barrat wrote:
>> Modify slightly the ordering of a few steps in our init sequence on
>> fundamental reset, so that it can be called from the OS, when the link
>> is already up:
>>
>> - when the card is reset, the link goes down, so we need to fence the
>>    brick to prevent errors propagating to the NPU and OS
>> - since fencing and unfencing don't require any delay, let's also
>>    fence/unfence during the very first reset at boot. It's useless but
>>    doesn't hurt and keep the code simpler.
>> - resetting the PHY must be done a bit later, while fenced and the ODL
>>    and DLx in reset
>>
>> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
>> ---
>>   hw/npu2-opencapi.c | 48 +++++++++++++++++++++++++---------------------
>>   include/npu2.h     |  2 --
>>   2 files changed, 26 insertions(+), 24 deletions(-)
>>
>> diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
>> index f7be9f09..619d4be8 100644
>> --- a/hw/npu2-opencapi.c
>> +++ b/hw/npu2-opencapi.c
>> @@ -1039,6 +1039,28 @@ static int64_t 
>> npu2_opencapi_get_presence_state(struct pci_slot __unused *slot,
>>       return OPAL_SUCCESS;
>>   }
>> +static void fence_brick(struct npu2_dev *dev)
>> +{
>> +    OCAPIDBG(dev, "Fencing brick\n");
>> +    set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
>> +              dev->brick_index, 0b11);
>> +    /* from 13.2.1, Quiesce Fence State */
>> +    npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
>> +           PPC_BIT(dev->brick_index + 6));
>> +}
>> +
>> +static void unfence_brick(struct npu2_dev *dev)
>> +{
>> +    OCAPIDBG(dev, "Unfencing brick\n");
>> +    npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
>> +           PPC_BIT(dev->brick_index));
>> +
>> +    set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
>> +              dev->brick_index, 0b10);
> 
> Unfence the brick implies to reset the PowerBus (0b10)?
> This has been already done in fence_brick() ?


There are two sides to the fence: one on the power bus and one on the
link. To go from fully fenced (0b11) to unfenced (0b00), we need to go
through the transitional 0b10 state (i.e. unfence the link first, a.k.a.
half-fenced). I'm pretty sure it's described as such in the workbook,
though I can't find it now that I'm looking for it. It does show up in
section "13.2.3 Exiting NPU Quiesce state", though. We also already do
the same during NPU init, where we go through a fence->unfence
transition in set_npcq_config().

   Fred



>> +    set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
>> +              dev->brick_index, 0b00);
>> +}
>> +
>>   static enum OpalShpcLinkState get_link_width(uint64_t odl_status)
>>   {
>>       uint64_t tx_lanes, rx_lanes, state;
>> @@ -1153,7 +1175,7 @@ static int64_t npu2_opencapi_poll_link(struct 
>> pci_slot *slot)
>>       return OPAL_HARDWARE;
>>   }
>> -static int64_t npu2_opencapi_creset(struct pci_slot *slot __unused)
>> +static int64_t npu2_opencapi_creset(struct pci_slot *slot)
>>   {
>>       struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
>> @@ -1183,19 +1205,10 @@ static int64_t npu2_opencapi_freset(struct 
>> pci_slot *slot)
>>               OCAPIINF(dev, "no card detected\n");
>>               return OPAL_SUCCESS;
>>           }
>> -        if (dev->train_need_fence) {
>> -            OCAPIDBG(dev, "Fencing OTL during reset\n");
>> -            set_fence_control(chip_id, dev->npu->xscom_base,
>> -                    dev->brick_index, 0b11);
>> -            npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
>> -                PPC_BIT(dev->brick_index + 6));
>> -            dev->train_fenced = true;
>> -        }
>> -        dev->train_need_fence = true;
>>           slot->link_retries = OCAPI_LINK_TRAINING_RETRIES;
>> -        npu2_opencapi_phy_reset(dev);
>>           /* fall-through */
>>       case OCAPI_SLOT_FRESET_INIT:
>> +        fence_brick(dev);
>>           assert_odl_reset(chip_id, dev->brick_index);
>>           assert_adapter_reset(dev);
>>           pci_slot_set_state(slot,
>> @@ -1204,6 +1217,7 @@ static int64_t npu2_opencapi_freset(struct 
>> pci_slot *slot)
>>           return pci_slot_set_sm_timeout(slot, msecs_to_tb(5));
>>       case OCAPI_SLOT_FRESET_ASSERT_DELAY:
>> +        npu2_opencapi_phy_reset(dev);
>>           deassert_odl_reset(chip_id, dev->brick_index);
>>           pci_slot_set_state(slot,
>>                   OCAPI_SLOT_FRESET_DEASSERT_DELAY);
>> @@ -1221,15 +1235,7 @@ static int64_t npu2_opencapi_freset(struct 
>> pci_slot *slot)
>>           return pci_slot_set_sm_timeout(slot, msecs_to_tb(250));
>>       case OCAPI_SLOT_FRESET_DEASSERT_DELAY2:
>> -        if (dev->train_fenced) {
>> -            OCAPIDBG(dev, "Unfencing OTL after reset\n");
>> -            npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
>> -                   PPC_BIT(dev->brick_index));
>> -            set_fence_control(chip_id, dev->npu->xscom_base,
>> -                      dev->brick_index, 0b00);
>> -            dev->train_fenced = false;
>> -        }
>> -
>> +        unfence_brick(dev);
>>           set_init_pattern(chip_id, dev);
>>           pci_slot_set_state(slot,
>>                   OCAPI_SLOT_FRESET_INIT_DELAY);
>> @@ -1692,8 +1698,6 @@ static void setup_device(struct npu2_dev *dev)
>>       dev->bdfn = 0;
>>       dev->linux_pe = -1;
>> -    dev->train_need_fence = false;
>> -    dev->train_fenced = false;
>>       /* TODO: Procedure 13.1.3.7 - AFU Memory Range BARs */
>>       /* Procedure 13.1.3.8 - AFU MMIO Range BARs */
>> diff --git a/include/npu2.h b/include/npu2.h
>> index d2316dc1..6b1063da 100644
>> --- a/include/npu2.h
>> +++ b/include/npu2.h
>> @@ -145,8 +145,6 @@ struct npu2_dev {
>>       /* OpenCAPI */
>>       struct phb        phb_ocapi;
>>       uint64_t        linux_pe;
>> -    bool            train_need_fence;
>> -    bool            train_fenced;
>>       unsigned long        train_start;
>>       unsigned long        train_timeout;
>>   };
>>
>
diff mbox series

Patch

diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
index f7be9f09..619d4be8 100644
--- a/hw/npu2-opencapi.c
+++ b/hw/npu2-opencapi.c
@@ -1039,6 +1039,28 @@  static int64_t npu2_opencapi_get_presence_state(struct pci_slot __unused *slot,
 	return OPAL_SUCCESS;
 }
 
+static void fence_brick(struct npu2_dev *dev)
+{
+	OCAPIDBG(dev, "Fencing brick\n");
+	set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
+			  dev->brick_index, 0b11);
+	/* from 13.2.1, Quiesce Fence State */
+	npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
+		   PPC_BIT(dev->brick_index + 6));
+}
+
+static void unfence_brick(struct npu2_dev *dev)
+{
+	OCAPIDBG(dev, "Unfencing brick\n");
+	npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
+		   PPC_BIT(dev->brick_index));
+
+	set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
+			  dev->brick_index, 0b10);
+	set_fence_control(dev->npu->chip_id, dev->npu->xscom_base,
+			  dev->brick_index, 0b00);
+}
+
 static enum OpalShpcLinkState get_link_width(uint64_t odl_status)
 {
 	uint64_t tx_lanes, rx_lanes, state;
@@ -1153,7 +1175,7 @@  static int64_t npu2_opencapi_poll_link(struct pci_slot *slot)
 	return OPAL_HARDWARE;
 }
 
-static int64_t npu2_opencapi_creset(struct pci_slot *slot __unused)
+static int64_t npu2_opencapi_creset(struct pci_slot *slot)
 {
 	struct npu2_dev *dev = phb_to_npu2_dev_ocapi(slot->phb);
 
@@ -1183,19 +1205,10 @@  static int64_t npu2_opencapi_freset(struct pci_slot *slot)
 			OCAPIINF(dev, "no card detected\n");
 			return OPAL_SUCCESS;
 		}
-		if (dev->train_need_fence) {
-			OCAPIDBG(dev, "Fencing OTL during reset\n");
-			set_fence_control(chip_id, dev->npu->xscom_base,
-					dev->brick_index, 0b11);
-			npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
-				PPC_BIT(dev->brick_index + 6));
-			dev->train_fenced = true;
-		}
-		dev->train_need_fence = true;
 		slot->link_retries = OCAPI_LINK_TRAINING_RETRIES;
-		npu2_opencapi_phy_reset(dev);
 		/* fall-through */
 	case OCAPI_SLOT_FRESET_INIT:
+		fence_brick(dev);
 		assert_odl_reset(chip_id, dev->brick_index);
 		assert_adapter_reset(dev);
 		pci_slot_set_state(slot,
@@ -1204,6 +1217,7 @@  static int64_t npu2_opencapi_freset(struct pci_slot *slot)
 		return pci_slot_set_sm_timeout(slot, msecs_to_tb(5));
 
 	case OCAPI_SLOT_FRESET_ASSERT_DELAY:
+		npu2_opencapi_phy_reset(dev);
 		deassert_odl_reset(chip_id, dev->brick_index);
 		pci_slot_set_state(slot,
 				OCAPI_SLOT_FRESET_DEASSERT_DELAY);
@@ -1221,15 +1235,7 @@  static int64_t npu2_opencapi_freset(struct pci_slot *slot)
 		return pci_slot_set_sm_timeout(slot, msecs_to_tb(250));
 
 	case OCAPI_SLOT_FRESET_DEASSERT_DELAY2:
-		if (dev->train_fenced) {
-			OCAPIDBG(dev, "Unfencing OTL after reset\n");
-			npu2_write(dev->npu, NPU2_MISC_FENCE_STATE,
-				   PPC_BIT(dev->brick_index));
-			set_fence_control(chip_id, dev->npu->xscom_base,
-					  dev->brick_index, 0b00);
-			dev->train_fenced = false;
-		}
-
+		unfence_brick(dev);
 		set_init_pattern(chip_id, dev);
 		pci_slot_set_state(slot,
 				OCAPI_SLOT_FRESET_INIT_DELAY);
@@ -1692,8 +1698,6 @@  static void setup_device(struct npu2_dev *dev)
 
 	dev->bdfn = 0;
 	dev->linux_pe = -1;
-	dev->train_need_fence = false;
-	dev->train_fenced = false;
 
 	/* TODO: Procedure 13.1.3.7 - AFU Memory Range BARs */
 	/* Procedure 13.1.3.8 - AFU MMIO Range BARs */
diff --git a/include/npu2.h b/include/npu2.h
index d2316dc1..6b1063da 100644
--- a/include/npu2.h
+++ b/include/npu2.h
@@ -145,8 +145,6 @@  struct npu2_dev {
 	/* OpenCAPI */
 	struct phb		phb_ocapi;
 	uint64_t		linux_pe;
-	bool			train_need_fence;
-	bool			train_fenced;
 	unsigned long		train_start;
 	unsigned long		train_timeout;
 };