hw/npu2: support creset of npu2 devices

Message ID 20171215025942.3983-1-bsingharora@gmail.com
State Accepted
Headers show
Series
  • hw/npu2: support creset of npu2 devices
Related show

Commit Message

Balbir Singh Dec. 15, 2017, 2:59 a.m.
creset calls in the hw procedure that resets the PHY, we don't
take them out of reset, just put them in reset.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
---
 hw/npu2-hw-procedures.c |  2 +-
 hw/npu2.c               | 21 ++++++++++++++++++++-
 include/npu2.h          |  1 +
 3 files changed, 22 insertions(+), 2 deletions(-)

Comments

Alistair Popple Dec. 15, 2017, 3:38 a.m. | #1
On Friday, 15 December 2017 1:59:42 PM AEDT Balbir Singh wrote:
> creset calls in the hw procedure that resets the PHY, we don't
> take them out of reset, just put them in reset.
> 
> Signed-off-by: Balbir Singh <bsingharora@gmail.com>
> ---
>  hw/npu2-hw-procedures.c |  2 +-
>  hw/npu2.c               | 21 ++++++++++++++++++++-
>  include/npu2.h          |  1 +
>  3 files changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/npu2-hw-procedures.c b/hw/npu2-hw-procedures.c
> index 1318e867..dfff4a2a 100644
> --- a/hw/npu2-hw-procedures.c
> +++ b/hw/npu2-hw-procedures.c
> @@ -233,7 +233,7 @@ static bool poll_fence_status(struct npu2_dev *ndev, uint64_t val)
>  }
>  
>  /* Procedure 1.2.1 - Reset NPU/NDL */
> -static uint32_t reset_ntl(struct npu2_dev *ndev)
> +uint32_t reset_ntl(struct npu2_dev *ndev)
>  {
>  	uint64_t val;
>  
> diff --git a/hw/npu2.c b/hw/npu2.c
> index 7ffb0941..ee75b6fd 100644
> --- a/hw/npu2.c
> +++ b/hw/npu2.c
> @@ -1170,6 +1170,25 @@ static int64_t npu2_freset(struct pci_slot *slot __unused)
>  	return OPAL_SUCCESS;
>  }
>  
> +static int64_t npu2_creset(struct pci_slot *slot)
> +{
> +	struct npu2 *p;
> +	int i;
> +	struct npu2_dev *ndev;
> +
> +	p = phb_to_npu2(slot->phb);
> +	NPU2INF(p, "Creset PHB state\n");
> +
> +	for (i = 0; i < p->total_devices; i++) {

Wont this reset every link on the NPU rather than just the given pci_slot? We
need to make sure this resets just the specified device.

> +		ndev = &p->devices[i];
> +		if (ndev) {
> +			NPU2DEVINF(ndev, "Resetting device\n");
> +			reset_ntl(ndev);

What is the motivation for implementing this? I suspect we should also reset the
DL and PHY.

Regards,

Alistair

> +		}
> +	}
> +	return OPAL_SUCCESS;
> +}
> +
>  static struct pci_slot *npu2_slot_create(struct phb *phb)
>  {
>  	struct pci_slot *slot;
> @@ -1191,7 +1210,7 @@ static struct pci_slot *npu2_slot_create(struct phb *phb)
>  	slot->ops.poll_link           = NULL;
>  	slot->ops.hreset              = npu2_hreset;
>  	slot->ops.freset              = npu2_freset;
> -	slot->ops.creset              = NULL;
> +	slot->ops.creset              = npu2_creset;
>  
>  	return slot;
>  }
> diff --git a/include/npu2.h b/include/npu2.h
> index dae152a6..c1f39616 100644
> --- a/include/npu2.h
> +++ b/include/npu2.h
> @@ -162,5 +162,6 @@ int64_t npu2_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf,
>  void npu2_dev_procedure_reset(struct npu2_dev *dev);
>  void npu2_set_link_flag(struct npu2_dev *ndev, uint8_t flag);
>  void npu2_clear_link_flag(struct npu2_dev *ndev, uint8_t flag);
> +uint32_t reset_ntl(struct npu2_dev *ndev);
>  extern int nv_zcal_nominal;
>  #endif /* __NPU2_H */
>
Balbir Singh Dec. 17, 2017, 10:26 a.m. | #2
On Fri, Dec 15, 2017 at 2:38 PM, Alistair Popple <alistair@popple.id.au> wrote:
> On Friday, 15 December 2017 1:59:42 PM AEDT Balbir Singh wrote:
>> creset calls in the hw procedure that resets the PHY, we don't
>> take them out of reset, just put them in reset.
>>
>> Signed-off-by: Balbir Singh <bsingharora@gmail.com>
>> ---
>>  hw/npu2-hw-procedures.c |  2 +-
>>  hw/npu2.c               | 21 ++++++++++++++++++++-
>>  include/npu2.h          |  1 +
>>  3 files changed, 22 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/npu2-hw-procedures.c b/hw/npu2-hw-procedures.c
>> index 1318e867..dfff4a2a 100644
>> --- a/hw/npu2-hw-procedures.c
>> +++ b/hw/npu2-hw-procedures.c
>> @@ -233,7 +233,7 @@ static bool poll_fence_status(struct npu2_dev *ndev, uint64_t val)
>>  }
>>
>>  /* Procedure 1.2.1 - Reset NPU/NDL */
>> -static uint32_t reset_ntl(struct npu2_dev *ndev)
>> +uint32_t reset_ntl(struct npu2_dev *ndev)
>>  {
>>       uint64_t val;
>>
>> diff --git a/hw/npu2.c b/hw/npu2.c
>> index 7ffb0941..ee75b6fd 100644
>> --- a/hw/npu2.c
>> +++ b/hw/npu2.c
>> @@ -1170,6 +1170,25 @@ static int64_t npu2_freset(struct pci_slot *slot __unused)
>>       return OPAL_SUCCESS;
>>  }
>>
>> +static int64_t npu2_creset(struct pci_slot *slot)
>> +{
>> +     struct npu2 *p;
>> +     int i;
>> +     struct npu2_dev *ndev;
>> +
>> +     p = phb_to_npu2(slot->phb);
>> +     NPU2INF(p, "Creset PHB state\n");
>> +
>> +     for (i = 0; i < p->total_devices; i++) {
>
> Wont this reset every link on the NPU rather than just the given pci_slot? We
> need to make sure this resets just the specified device.

We have a slot for each of the struct npu2 and total_devices are the
total number of devices on that phb. In the tests on a 4 GPU system, I
did not find anything missing or extra. I'll double check

>
>> +             ndev = &p->devices[i];
>> +             if (ndev) {
>> +                     NPU2DEVINF(ndev, "Resetting device\n");
>> +                     reset_ntl(ndev);
>
> What is the motivation for implementing this? I suspect we should also reset the
> DL and PHY.
>

A creset should set the links to reset state, I'm just following the
procedure I am aware of. I do not want to bring these links out of
reset


> Regards,
>
> Alistair
>

Thanks for the review!
Balbir Singh.
>> +             }
>> +     }
>> +     return OPAL_SUCCESS;
>> +}
>> +
>>  static struct pci_slot *npu2_slot_create(struct phb *phb)
>>  {
>>       struct pci_slot *slot;
>> @@ -1191,7 +1210,7 @@ static struct pci_slot *npu2_slot_create(struct phb *phb)
>>       slot->ops.poll_link           = NULL;
>>       slot->ops.hreset              = npu2_hreset;
>>       slot->ops.freset              = npu2_freset;
>> -     slot->ops.creset              = NULL;
>> +     slot->ops.creset              = npu2_creset;
>>
>>       return slot;
>>  }
>> diff --git a/include/npu2.h b/include/npu2.h
>> index dae152a6..c1f39616 100644
>> --- a/include/npu2.h
>> +++ b/include/npu2.h
>> @@ -162,5 +162,6 @@ int64_t npu2_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf,
>>  void npu2_dev_procedure_reset(struct npu2_dev *dev);
>>  void npu2_set_link_flag(struct npu2_dev *ndev, uint8_t flag);
>>  void npu2_clear_link_flag(struct npu2_dev *ndev, uint8_t flag);
>> +uint32_t reset_ntl(struct npu2_dev *ndev);
>>  extern int nv_zcal_nominal;
>>  #endif /* __NPU2_H */
>>
>
>
Balbir Singh Jan. 17, 2018, 4:49 a.m. | #3
On Sun, Dec 17, 2017 at 3:56 PM, Balbir Singh <bsingharora@gmail.com> wrote:
> On Fri, Dec 15, 2017 at 2:38 PM, Alistair Popple <alistair@popple.id.au> wrote:
>> On Friday, 15 December 2017 1:59:42 PM AEDT Balbir Singh wrote:
>>> creset calls in the hw procedure that resets the PHY, we don't
>>> take them out of reset, just put them in reset.
>>>
>>> Signed-off-by: Balbir Singh <bsingharora@gmail.com>
>>> ---
>>>  hw/npu2-hw-procedures.c |  2 +-
>>>  hw/npu2.c               | 21 ++++++++++++++++++++-
>>>  include/npu2.h          |  1 +
>>>  3 files changed, 22 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/hw/npu2-hw-procedures.c b/hw/npu2-hw-procedures.c
>>> index 1318e867..dfff4a2a 100644
>>> --- a/hw/npu2-hw-procedures.c
>>> +++ b/hw/npu2-hw-procedures.c
>>> @@ -233,7 +233,7 @@ static bool poll_fence_status(struct npu2_dev *ndev, uint64_t val)
>>>  }
>>>
>>>  /* Procedure 1.2.1 - Reset NPU/NDL */
>>> -static uint32_t reset_ntl(struct npu2_dev *ndev)
>>> +uint32_t reset_ntl(struct npu2_dev *ndev)
>>>  {
>>>       uint64_t val;
>>>
>>> diff --git a/hw/npu2.c b/hw/npu2.c
>>> index 7ffb0941..ee75b6fd 100644
>>> --- a/hw/npu2.c
>>> +++ b/hw/npu2.c
>>> @@ -1170,6 +1170,25 @@ static int64_t npu2_freset(struct pci_slot *slot __unused)
>>>       return OPAL_SUCCESS;
>>>  }
>>>
>>> +static int64_t npu2_creset(struct pci_slot *slot)
>>> +{
>>> +     struct npu2 *p;
>>> +     int i;
>>> +     struct npu2_dev *ndev;
>>> +
>>> +     p = phb_to_npu2(slot->phb);
>>> +     NPU2INF(p, "Creset PHB state\n");
>>> +
>>> +     for (i = 0; i < p->total_devices; i++) {
>>
>> Wont this reset every link on the NPU rather than just the given pci_slot? We
>> need to make sure this resets just the specified device.
>
> We have a slot for each of the struct npu2 and total_devices are the
> total number of devices on that phb. In the tests on a 4 GPU system, I
> did not find anything missing or extra. I'll double check
>
>>
>>> +             ndev = &p->devices[i];
>>> +             if (ndev) {
>>> +                     NPU2DEVINF(ndev, "Resetting device\n");
>>> +                     reset_ntl(ndev);
>>
>> What is the motivation for implementing this? I suspect we should also reset the
>> DL and PHY.
>>
>
> A creset should set the links to reset state, I'm just following the
> procedure I am aware of. I do not want to bring these links out of
> reset
>
>
>> Regards,
>>
>> Alistair

We discussed this offline as well, did you want to see any more changes?

Balbir
Alistair Popple Feb. 9, 2018, 1:10 a.m. | #4
Acked-by: Alistair Popple <alistair@popple.id.au>

On Wednesday, 17 January 2018 10:19:32 AM AEDT Balbir Singh wrote:
> On Sun, Dec 17, 2017 at 3:56 PM, Balbir Singh <bsingharora@gmail.com> wrote:
> > On Fri, Dec 15, 2017 at 2:38 PM, Alistair Popple <alistair@popple.id.au> wrote:
> >> On Friday, 15 December 2017 1:59:42 PM AEDT Balbir Singh wrote:
> >>> creset calls in the hw procedure that resets the PHY, we don't
> >>> take them out of reset, just put them in reset.
> >>>
> >>> Signed-off-by: Balbir Singh <bsingharora@gmail.com>
> >>> ---
> >>>  hw/npu2-hw-procedures.c |  2 +-
> >>>  hw/npu2.c               | 21 ++++++++++++++++++++-
> >>>  include/npu2.h          |  1 +
> >>>  3 files changed, 22 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/hw/npu2-hw-procedures.c b/hw/npu2-hw-procedures.c
> >>> index 1318e867..dfff4a2a 100644
> >>> --- a/hw/npu2-hw-procedures.c
> >>> +++ b/hw/npu2-hw-procedures.c
> >>> @@ -233,7 +233,7 @@ static bool poll_fence_status(struct npu2_dev *ndev, uint64_t val)
> >>>  }
> >>>
> >>>  /* Procedure 1.2.1 - Reset NPU/NDL */
> >>> -static uint32_t reset_ntl(struct npu2_dev *ndev)
> >>> +uint32_t reset_ntl(struct npu2_dev *ndev)
> >>>  {
> >>>       uint64_t val;
> >>>
> >>> diff --git a/hw/npu2.c b/hw/npu2.c
> >>> index 7ffb0941..ee75b6fd 100644
> >>> --- a/hw/npu2.c
> >>> +++ b/hw/npu2.c
> >>> @@ -1170,6 +1170,25 @@ static int64_t npu2_freset(struct pci_slot *slot __unused)
> >>>       return OPAL_SUCCESS;
> >>>  }
> >>>
> >>> +static int64_t npu2_creset(struct pci_slot *slot)
> >>> +{
> >>> +     struct npu2 *p;
> >>> +     int i;
> >>> +     struct npu2_dev *ndev;
> >>> +
> >>> +     p = phb_to_npu2(slot->phb);
> >>> +     NPU2INF(p, "Creset PHB state\n");
> >>> +
> >>> +     for (i = 0; i < p->total_devices; i++) {
> >>
> >> Wont this reset every link on the NPU rather than just the given pci_slot? We
> >> need to make sure this resets just the specified device.
> >
> > We have a slot for each of the struct npu2 and total_devices are the
> > total number of devices on that phb. In the tests on a 4 GPU system, I
> > did not find anything missing or extra. I'll double check
> >
> >>
> >>> +             ndev = &p->devices[i];
> >>> +             if (ndev) {
> >>> +                     NPU2DEVINF(ndev, "Resetting device\n");
> >>> +                     reset_ntl(ndev);
> >>
> >> What is the motivation for implementing this? I suspect we should also reset the
> >> DL and PHY.
> >>
> >
> > A creset should set the links to reset state, I'm just following the
> > procedure I am aware of. I do not want to bring these links out of
> > reset
> >
> >
> >> Regards,
> >>
> >> Alistair
> 
> We discussed this offline as well, did you want to see any more changes?
> 
> Balbir
>
Stewart Smith Feb. 13, 2018, 8:30 a.m. | #5
Balbir Singh <bsingharora@gmail.com> writes:
> creset calls in the hw procedure that resets the PHY, we don't
> take them out of reset, just put them in reset.
>
> Signed-off-by: Balbir Singh <bsingharora@gmail.com>
> ---
>  hw/npu2-hw-procedures.c |  2 +-
>  hw/npu2.c               | 21 ++++++++++++++++++++-
>  include/npu2.h          |  1 +
>  3 files changed, 22 insertions(+), 2 deletions(-)

Thanks! (and sorry for the delay merging).

Merged to master as of 4133e973b4f2bf61d387ac4528bed3acd12b13d9

Patch

diff --git a/hw/npu2-hw-procedures.c b/hw/npu2-hw-procedures.c
index 1318e867..dfff4a2a 100644
--- a/hw/npu2-hw-procedures.c
+++ b/hw/npu2-hw-procedures.c
@@ -233,7 +233,7 @@  static bool poll_fence_status(struct npu2_dev *ndev, uint64_t val)
 }
 
 /* Procedure 1.2.1 - Reset NPU/NDL */
-static uint32_t reset_ntl(struct npu2_dev *ndev)
+uint32_t reset_ntl(struct npu2_dev *ndev)
 {
 	uint64_t val;
 
diff --git a/hw/npu2.c b/hw/npu2.c
index 7ffb0941..ee75b6fd 100644
--- a/hw/npu2.c
+++ b/hw/npu2.c
@@ -1170,6 +1170,25 @@  static int64_t npu2_freset(struct pci_slot *slot __unused)
 	return OPAL_SUCCESS;
 }
 
+static int64_t npu2_creset(struct pci_slot *slot)
+{
+	struct npu2 *p;
+	int i;
+	struct npu2_dev *ndev;
+
+	p = phb_to_npu2(slot->phb);
+	NPU2INF(p, "Creset PHB state\n");
+
+	for (i = 0; i < p->total_devices; i++) {
+		ndev = &p->devices[i];
+		if (ndev) {
+			NPU2DEVINF(ndev, "Resetting device\n");
+			reset_ntl(ndev);
+		}
+	}
+	return OPAL_SUCCESS;
+}
+
 static struct pci_slot *npu2_slot_create(struct phb *phb)
 {
 	struct pci_slot *slot;
@@ -1191,7 +1210,7 @@  static struct pci_slot *npu2_slot_create(struct phb *phb)
 	slot->ops.poll_link           = NULL;
 	slot->ops.hreset              = npu2_hreset;
 	slot->ops.freset              = npu2_freset;
-	slot->ops.creset              = NULL;
+	slot->ops.creset              = npu2_creset;
 
 	return slot;
 }
diff --git a/include/npu2.h b/include/npu2.h
index dae152a6..c1f39616 100644
--- a/include/npu2.h
+++ b/include/npu2.h
@@ -162,5 +162,6 @@  int64_t npu2_dev_procedure(void *dev, struct pci_cfg_reg_filter *pcrf,
 void npu2_dev_procedure_reset(struct npu2_dev *dev);
 void npu2_set_link_flag(struct npu2_dev *ndev, uint8_t flag);
 void npu2_clear_link_flag(struct npu2_dev *ndev, uint8_t flag);
+uint32_t reset_ntl(struct npu2_dev *ndev);
 extern int nv_zcal_nominal;
 #endif /* __NPU2_H */