diff mbox series

[1/1] PCI/AER: prevent pcie_do_fatal_recovery from using device after it is removed

Message ID 1534179088-44219-2-git-send-email-thomas.tai@oracle.com
State Not Applicable
Delegated to: Bjorn Helgaas
Headers show
Series PCI/AER: prevent pcie_do_fatal_recovery from using device after it is removed | expand

Commit Message

Thomas Tai Aug. 13, 2018, 4:51 p.m. UTC
To prevent pcie_do_fatal_recovery() from using the device
after it has been removed, store the device's
domain:bus:devfn at the entry of pcie_do_fatal_recovery().
After rescanning the bus, use the stored domain:bus:devfn
to look up the device again and report it via pci_info().
Because the original issue only happens on a non-bridge
device, which is removed during recovery, a local variable
is used to remember whether the device is a bridge instead
of checking the device's header type afterwards.

Signed-off-by: Thomas Tai <thomas.tai@oracle.com>
---
 drivers/pci/pcie/err.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)
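
A minimal sketch of the idea (condensed from the diff quoted in the
replies below; not the literal patch):

	/* capture the address and type before the device can be removed */
	u16 domain = pci_domain_nr(dev->bus);
	u8 bus = dev->bus->number;
	u8 devfn = dev->devfn;
	/* remembered so the bridge check does not touch dev later */
	bool is_bridge_device = (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE);

	/* ... remove subordinates, reset the link, rescan the bus ... */

	/* look the device up again instead of touching the old pointer */
	dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
	if (dev)
		pci_info(dev, "Device recovery from fatal error successful\n");
	else
		pr_err("AER: Can not find pci_dev for %04x:%02x:%02x.%x\n",
		       domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
	pci_dev_put(dev);	/* pci_dev_put() is a no-op on NULL */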

Comments

Oza Pawandeep Aug. 14, 2018, 9:16 a.m. UTC | #1
On 2018-08-13 22:21, Thomas Tai wrote:
> In order to prevent the pcie_do_fatal_recovery() from
> using the device after it is removed, the device's
> domain:bus:devfn is stored at the entry of
> pcie_do_fatal_recovery(). After rescanning the bus, the
> stored domain:bus:devfn is used to find the device and
> uses to report pci_info. The original issue only happens
> on an non-bridge device, a local variable is used instead
> of checking the device's header type.
> 
> Signed-off-by: Thomas Tai <thomas.tai@oracle.com>
> ---
>  drivers/pci/pcie/err.c | 33 +++++++++++++++++++++++----------
>  1 file changed, 23 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
> index f02e334..3414445 100644
> --- a/drivers/pci/pcie/err.c
> +++ b/drivers/pci/pcie/err.c
> @@ -287,15 +287,20 @@ void pcie_do_fatal_recovery(struct pci_dev *dev,
> u32 service)
>  	struct pci_bus *parent;
>  	struct pci_dev *pdev, *temp;
>  	pci_ers_result_t result;
> +	bool is_bridge_device = false;
> +	u16 domain = pci_domain_nr(dev->bus);
> +	u8 bus = dev->bus->number;
> +	u8 devfn = dev->devfn;
> 
> -	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
> +	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
> +		is_bridge_device = true;
>  		udev = dev;
> -	else
> +	} else {
>  		udev = dev->bus->self;
> +	}
> 
>  	parent = udev->subordinate;
>  	pci_lock_rescan_remove();
> -	pci_dev_get(dev);
>  	list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
>  					 bus_list) {
>  		pci_dev_get(pdev);
> @@ -309,27 +314,35 @@ void pcie_do_fatal_recovery(struct pci_dev *dev,
> u32 service)
> 
>  	result = reset_link(udev, service);
> 
> -	if ((service == PCIE_PORT_SERVICE_AER) &&
> -	    (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
> +	if (service == PCIE_PORT_SERVICE_AER && is_bridge_device) {
>  		/*
>  		 * If the error is reported by a bridge, we think this error
>  		 * is related to the downstream link of the bridge, so we
>  		 * do error recovery on all subordinates of the bridge instead
>  		 * of the bridge and clear the error status of the bridge.
>  		 */
> -		pci_cleanup_aer_uncorrect_error_status(dev);
> +		pci_cleanup_aer_uncorrect_error_status(udev);
>  	}
> 
>  	if (result == PCI_ERS_RESULT_RECOVERED) {
>  		if (pcie_wait_for_link(udev, true))
>  			pci_rescan_bus(udev->bus);
> -		pci_info(dev, "Device recovery from fatal error successful\n");
> +		/* find the pci_dev after rescanning the bus */
> +		dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
one of the motivations was to remove and re-enumerate rather than going 
through the driver's recovery sequence; with a hotplug-capable bridge it 
is possible that the device has changed in the meantime,
hence this lookup can fail
> +		if (dev)
> +			pci_info(dev, "Device recovery from fatal error successful\n");
> +		else
> +			pr_err("AER: Can not find pci_dev for %04x:%02x:%02x.%x\n",
> +			       domain, bus,
> +			       PCI_SLOT(devfn), PCI_FUNC(devfn));
> +		pci_dev_put(dev);
>  	} else {
> -		pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
> -		pci_info(dev, "Device recovery from fatal error failed\n");
> +		if (is_bridge_device)
previously there was no condition.. why is this required now?
and it was previously done on "dev" while you are doing it on "udev"
> +			pci_uevent_ers(udev, PCI_ERS_RESULT_DISCONNECT);
> +		pr_err("AER: Device %04x:%02x:%02x.%x recovery from fatal error 
> failed\n",
> +		       domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>  	}
> 
> -	pci_dev_put(dev);
>  	pci_unlock_rescan_remove();
>  }

Regards,
Oza.
Oza Pawandeep Aug. 14, 2018, 9:22 a.m. UTC | #2
On 2018-08-14 14:46, poza@codeaurora.org wrote:
> On 2018-08-13 22:21, Thomas Tai wrote:
>> In order to prevent the pcie_do_fatal_recovery() from
>> using the device after it is removed, the device's
>> domain:bus:devfn is stored at the entry of
>> pcie_do_fatal_recovery(). After rescanning the bus, the
>> stored domain:bus:devfn is used to find the device and
>> uses to report pci_info. The original issue only happens
>> on an non-bridge device, a local variable is used instead
>> of checking the device's header type.
>> 
>> Signed-off-by: Thomas Tai <thomas.tai@oracle.com>
>> ---
>>  drivers/pci/pcie/err.c | 33 +++++++++++++++++++++++----------
>>  1 file changed, 23 insertions(+), 10 deletions(-)
>> 
>> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
>> index f02e334..3414445 100644
>> --- a/drivers/pci/pcie/err.c
>> +++ b/drivers/pci/pcie/err.c
>> @@ -287,15 +287,20 @@ void pcie_do_fatal_recovery(struct pci_dev *dev,
>> u32 service)
>>  	struct pci_bus *parent;
>>  	struct pci_dev *pdev, *temp;
>>  	pci_ers_result_t result;
>> +	bool is_bridge_device = false;
>> +	u16 domain = pci_domain_nr(dev->bus);
>> +	u8 bus = dev->bus->number;
>> +	u8 devfn = dev->devfn;
>> 
>> -	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
>> +	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
>> +		is_bridge_device = true;
>>  		udev = dev;
>> -	else
>> +	} else {
>>  		udev = dev->bus->self;
>> +	}
>> 
>>  	parent = udev->subordinate;
>>  	pci_lock_rescan_remove();
>> -	pci_dev_get(dev);
>>  	list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
>>  					 bus_list) {
>>  		pci_dev_get(pdev);
>> @@ -309,27 +314,35 @@ void pcie_do_fatal_recovery(struct pci_dev *dev,
>> u32 service)
>> 
>>  	result = reset_link(udev, service);
>> 
>> -	if ((service == PCIE_PORT_SERVICE_AER) &&
>> -	    (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
>> +	if (service == PCIE_PORT_SERVICE_AER && is_bridge_device) {
>>  		/*
>>  		 * If the error is reported by a bridge, we think this error
>>  		 * is related to the downstream link of the bridge, so we
>>  		 * do error recovery on all subordinates of the bridge instead
>>  		 * of the bridge and clear the error status of the bridge.
>>  		 */
>> -		pci_cleanup_aer_uncorrect_error_status(dev);
>> +		pci_cleanup_aer_uncorrect_error_status(udev);
>>  	}
>> 
>>  	if (result == PCI_ERS_RESULT_RECOVERED) {
>>  		if (pcie_wait_for_link(udev, true))
>>  			pci_rescan_bus(udev->bus);
>> -		pci_info(dev, "Device recovery from fatal error successful\n");
>> +		/* find the pci_dev after rescanning the bus */
>> +		dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
> one of the motivations was to remove and re-enumerate rather then
> going thorugh driver's recovery sequence
> was; it might be possible that hotplug capable bridge, the device
> might have changed.
> hence this check will fail
>> +		if (dev)
>> +			pci_info(dev, "Device recovery from fatal error successful\n");
>> +		else
>> +			pr_err("AER: Can not find pci_dev for %04x:%02x:%02x.%x\n",
>> +			       domain, bus,
>> +			       PCI_SLOT(devfn), PCI_FUNC(devfn));
>> +		pci_dev_put(dev);
>>  	} else {
>> -		pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
>> -		pci_info(dev, "Device recovery from fatal error failed\n");
>> +		if (is_bridge_device)
> previously there was no condition.. why is this required now ?
> and it was doing on "dev" while you are doing it on "udev"
is that the reason we don't want to use dev after it is removed? instead 
do pci_uevent_ers() on the bridge?
>> +			pci_uevent_ers(udev, PCI_ERS_RESULT_DISCONNECT);
>> +		pr_err("AER: Device %04x:%02x:%02x.%x recovery from fatal error 
>> failed\n",
>> +		       domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>  	}
>> 
>> -	pci_dev_put(dev);
>>  	pci_unlock_rescan_remove();
>>  }
> 
> Regards,
> Oza.
Thomas Tai Aug. 14, 2018, 1:51 p.m. UTC | #3
On 08/14/2018 05:22 AM, poza@codeaurora.org wrote:
> On 2018-08-14 14:46, poza@codeaurora.org wrote:
>> On 2018-08-13 22:21, Thomas Tai wrote:
>>> In order to prevent the pcie_do_fatal_recovery() from
>>> using the device after it is removed, the device's
>>> domain:bus:devfn is stored at the entry of
>>> pcie_do_fatal_recovery(). After rescanning the bus, the
>>> stored domain:bus:devfn is used to find the device and
>>> uses to report pci_info. The original issue only happens
>>> on an non-bridge device, a local variable is used instead
>>> of checking the device's header type.
>>>
>>> Signed-off-by: Thomas Tai <thomas.tai@oracle.com>
>>> ---
>>>  drivers/pci/pcie/err.c | 33 +++++++++++++++++++++++----------
>>>  1 file changed, 23 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
>>> index f02e334..3414445 100644
>>> --- a/drivers/pci/pcie/err.c
>>> +++ b/drivers/pci/pcie/err.c
>>> @@ -287,15 +287,20 @@ void pcie_do_fatal_recovery(struct pci_dev *dev,
>>> u32 service)
>>>      struct pci_bus *parent;
>>>      struct pci_dev *pdev, *temp;
>>>      pci_ers_result_t result;
>>> +    bool is_bridge_device = false;
>>> +    u16 domain = pci_domain_nr(dev->bus);
>>> +    u8 bus = dev->bus->number;
>>> +    u8 devfn = dev->devfn;
>>>
>>> -    if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
>>> +    if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
>>> +        is_bridge_device = true;
>>>          udev = dev;
>>> -    else
>>> +    } else {
>>>          udev = dev->bus->self;
>>> +    }
>>>
>>>      parent = udev->subordinate;
>>>      pci_lock_rescan_remove();
>>> -    pci_dev_get(dev);
>>>      list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
>>>                       bus_list) {
>>>          pci_dev_get(pdev);
>>> @@ -309,27 +314,35 @@ void pcie_do_fatal_recovery(struct pci_dev *dev,
>>> u32 service)
>>>
>>>      result = reset_link(udev, service);
>>>
>>> -    if ((service == PCIE_PORT_SERVICE_AER) &&
>>> -        (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
>>> +    if (service == PCIE_PORT_SERVICE_AER && is_bridge_device) {
>>>          /*
>>>           * If the error is reported by a bridge, we think this error
>>>           * is related to the downstream link of the bridge, so we
>>>           * do error recovery on all subordinates of the bridge instead
>>>           * of the bridge and clear the error status of the bridge.
>>>           */
>>> -        pci_cleanup_aer_uncorrect_error_status(dev);
>>> +        pci_cleanup_aer_uncorrect_error_status(udev);
>>>      }
>>>
>>>      if (result == PCI_ERS_RESULT_RECOVERED) {
>>>          if (pcie_wait_for_link(udev, true))
>>>              pci_rescan_bus(udev->bus);
>>> -        pci_info(dev, "Device recovery from fatal error successful\n");
>>> +        /* find the pci_dev after rescanning the bus */
>>> +        dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
>> one of the motivations was to remove and re-enumerate rather then
>> going thorugh driver's recovery sequence
>> was; it might be possible that hotplug capable bridge, the device
>> might have changed.
>> hence this check will fail

Thanks Oza for your information. Instead of searching for the pci_dev, 
should I just use the stored domain:bus:devfn to print out the info?

>>> +        if (dev)
>>> +            pci_info(dev, "Device recovery from fatal error 
>>> successful\n");
>>> +        else
>>> +            pr_err("AER: Can not find pci_dev for %04x:%02x:%02x.%x\n",
>>> +                   domain, bus,
>>> +                   PCI_SLOT(devfn), PCI_FUNC(devfn));
>>> +        pci_dev_put(dev);

That is, replace the above with:

	pr_info("AER: Device %04x:%02x:%02x.%x recovery from fatal error successful\n",
		domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));


>>>      } else {
>>> -        pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
>>> -        pci_info(dev, "Device recovery from fatal error failed\n");
>>> +        if (is_bridge_device)
>> previously there was no condition.. why is this required now ?
>> and it was doing on "dev" while you are doing it on "udev"
> is that the reason we dont watnt o use dev after it is removed ? instead 
> do pci_uevent_ers on bridge ?


Yes, that's exactly the reason. I should have written down the reason in 
the commit log. Here is the detailed explanation for the benefit of 
others. If the failing device is a bridge device, udev is equal to dev, 
so we can use udev in pci_uevent_ers(). But for a non-bridge device, the 
device has already been removed from the bus and thus we can't send 
pci_uevent_ers().

Thank you,
Thomas

>>> +            pci_uevent_ers(udev, PCI_ERS_RESULT_DISCONNECT);
>>> +        pr_err("AER: Device %04x:%02x:%02x.%x recovery from fatal 
>>> error failed\n",
>>> +               domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>      }
>>>
>>> -    pci_dev_put(dev);
>>>      pci_unlock_rescan_remove();
>>>  }
>>
>> Regards,
>> Oza.
Oza Pawandeep Aug. 15, 2018, 2:57 p.m. UTC | #4
On 2018-08-14 19:21, Thomas Tai wrote:
> On 08/14/2018 05:22 AM, poza@codeaurora.org wrote:
>> On 2018-08-14 14:46, poza@codeaurora.org wrote:
>>> On 2018-08-13 22:21, Thomas Tai wrote:
>>>> In order to prevent the pcie_do_fatal_recovery() from
>>>> using the device after it is removed, the device's
>>>> domain:bus:devfn is stored at the entry of
>>>> pcie_do_fatal_recovery(). After rescanning the bus, the
>>>> stored domain:bus:devfn is used to find the device and
>>>> uses to report pci_info. The original issue only happens
>>>> on an non-bridge device, a local variable is used instead
>>>> of checking the device's header type.
>>>> 
>>>> Signed-off-by: Thomas Tai <thomas.tai@oracle.com>
>>>> ---
>>>>  drivers/pci/pcie/err.c | 33 +++++++++++++++++++++++----------
>>>>  1 file changed, 23 insertions(+), 10 deletions(-)
>>>> 
>>>> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
>>>> index f02e334..3414445 100644
>>>> --- a/drivers/pci/pcie/err.c
>>>> +++ b/drivers/pci/pcie/err.c
>>>> @@ -287,15 +287,20 @@ void pcie_do_fatal_recovery(struct pci_dev 
>>>> *dev,
>>>> u32 service)
>>>>      struct pci_bus *parent;
>>>>      struct pci_dev *pdev, *temp;
>>>>      pci_ers_result_t result;
>>>> +    bool is_bridge_device = false;
>>>> +    u16 domain = pci_domain_nr(dev->bus);
>>>> +    u8 bus = dev->bus->number;
>>>> +    u8 devfn = dev->devfn;
>>>> 
>>>> -    if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
>>>> +    if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
>>>> +        is_bridge_device = true;
>>>>          udev = dev;
>>>> -    else
>>>> +    } else {
>>>>          udev = dev->bus->self;
>>>> +    }
>>>> 
>>>>      parent = udev->subordinate;
>>>>      pci_lock_rescan_remove();
>>>> -    pci_dev_get(dev);
>>>>      list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
>>>>                       bus_list) {
>>>>          pci_dev_get(pdev);
>>>> @@ -309,27 +314,35 @@ void pcie_do_fatal_recovery(struct pci_dev 
>>>> *dev,
>>>> u32 service)
>>>> 
>>>>      result = reset_link(udev, service);
>>>> 
>>>> -    if ((service == PCIE_PORT_SERVICE_AER) &&
>>>> -        (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
>>>> +    if (service == PCIE_PORT_SERVICE_AER && is_bridge_device) {
>>>>          /*
>>>>           * If the error is reported by a bridge, we think this 
>>>> error
>>>>           * is related to the downstream link of the bridge, so we
>>>>           * do error recovery on all subordinates of the bridge 
>>>> instead
>>>>           * of the bridge and clear the error status of the bridge.
>>>>           */
>>>> -        pci_cleanup_aer_uncorrect_error_status(dev);
>>>> +        pci_cleanup_aer_uncorrect_error_status(udev);
>>>>      }
>>>> 
>>>>      if (result == PCI_ERS_RESULT_RECOVERED) {
>>>>          if (pcie_wait_for_link(udev, true))
>>>>              pci_rescan_bus(udev->bus);
>>>> -        pci_info(dev, "Device recovery from fatal error 
>>>> successful\n");
>>>> +        /* find the pci_dev after rescanning the bus */
>>>> +        dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
>>> one of the motivations was to remove and re-enumerate rather then
>>> going thorugh driver's recovery sequence
>>> was; it might be possible that hotplug capable bridge, the device
>>> might have changed.
>>> hence this check will fail
> 
> Thanks Oza for your information. Instead of searching for the dev_pci,
> should I just use the stored domain:bus:devfn to printout the info?
> 
>>>> +        if (dev)
>>>> +            pci_info(dev, "Device recovery from fatal error 
>>>> successful\n");
>>>> +        else
>>>> +            pr_err("AER: Can not find pci_dev for 
>>>> %04x:%02x:%02x.%x\n",
>>>> +                   domain, bus,
>>>> +                   PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>> +        pci_dev_put(dev);
> 
> That is, replace above with:
> 
> 	pr_info("AER: Device %04x:%02x:%02x.%x recover from fatal error
> successful\n", domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
> 
is this not sufficient to print? it is there already:
pci_info(dev, "Device recovery from fatal error successful\n");

> 
>>>>      } else {
>>>> -        pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
>>>> -        pci_info(dev, "Device recovery from fatal error failed\n");
>>>> +        if (is_bridge_device)
>>> previously there was no condition.. why is this required now ?
>>> and it was doing on "dev" while you are doing it on "udev"
>> is that the reason we dont watnt o use dev after it is removed ? 
>> instead do pci_uevent_ers on bridge ?
> 
> 
> Yes, that's exactly the reason. I should have written down the reason
> in the commit log. Here is the details explanation for the benefit of
> others. If the failing device is a bridge device, udev is equal to
> dev. So we can use udev in the pci_uevent_ers. But for non bridge
> device the device is already removed from the bus and thus can't send
> pci_uevent_ers.
> 
> Thank you,
> Thomas
> 
>>>> +            pci_uevent_ers(udev, PCI_ERS_RESULT_DISCONNECT);
>>>> +        pr_err("AER: Device %04x:%02x:%02x.%x recovery from fatal 
>>>> error failed\n",
>>>> +               domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>      }
>>>> 
>>>> -    pci_dev_put(dev);
>>>>      pci_unlock_rescan_remove();
>>>>  }
>>> 
>>> Regards,
>>> Oza.
Thomas Tai Aug. 15, 2018, 3:02 p.m. UTC | #5
On 08/15/2018 10:57 AM, poza@codeaurora.org wrote:
> On 2018-08-14 19:21, Thomas Tai wrote:
>> On 08/14/2018 05:22 AM, poza@codeaurora.org wrote:
>>> On 2018-08-14 14:46, poza@codeaurora.org wrote:
>>>> On 2018-08-13 22:21, Thomas Tai wrote:
>>>>> In order to prevent the pcie_do_fatal_recovery() from
>>>>> using the device after it is removed, the device's
>>>>> domain:bus:devfn is stored at the entry of
>>>>> pcie_do_fatal_recovery(). After rescanning the bus, the
>>>>> stored domain:bus:devfn is used to find the device and
>>>>> uses to report pci_info. The original issue only happens
>>>>> on an non-bridge device, a local variable is used instead
>>>>> of checking the device's header type.
>>>>>
>>>>> Signed-off-by: Thomas Tai <thomas.tai@oracle.com>
>>>>> ---
>>>>>  drivers/pci/pcie/err.c | 33 +++++++++++++++++++++++----------
>>>>>  1 file changed, 23 insertions(+), 10 deletions(-)
>>>>>
>>>>> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
>>>>> index f02e334..3414445 100644
>>>>> --- a/drivers/pci/pcie/err.c
>>>>> +++ b/drivers/pci/pcie/err.c
>>>>> @@ -287,15 +287,20 @@ void pcie_do_fatal_recovery(struct pci_dev *dev,
>>>>> u32 service)
>>>>>      struct pci_bus *parent;
>>>>>      struct pci_dev *pdev, *temp;
>>>>>      pci_ers_result_t result;
>>>>> +    bool is_bridge_device = false;
>>>>> +    u16 domain = pci_domain_nr(dev->bus);
>>>>> +    u8 bus = dev->bus->number;
>>>>> +    u8 devfn = dev->devfn;
>>>>>
>>>>> -    if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
>>>>> +    if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
>>>>> +        is_bridge_device = true;
>>>>>          udev = dev;
>>>>> -    else
>>>>> +    } else {
>>>>>          udev = dev->bus->self;
>>>>> +    }
>>>>>
>>>>>      parent = udev->subordinate;
>>>>>      pci_lock_rescan_remove();
>>>>> -    pci_dev_get(dev);
>>>>>      list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
>>>>>                       bus_list) {
>>>>>          pci_dev_get(pdev);
>>>>> @@ -309,27 +314,35 @@ void pcie_do_fatal_recovery(struct pci_dev *dev,
>>>>> u32 service)
>>>>>
>>>>>      result = reset_link(udev, service);
>>>>>
>>>>> -    if ((service == PCIE_PORT_SERVICE_AER) &&
>>>>> -        (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
>>>>> +    if (service == PCIE_PORT_SERVICE_AER && is_bridge_device) {
>>>>>          /*
>>>>>           * If the error is reported by a bridge, we think this error
>>>>>           * is related to the downstream link of the bridge, so we
>>>>>           * do error recovery on all subordinates of the bridge 
>>>>> instead
>>>>>           * of the bridge and clear the error status of the bridge.
>>>>>           */
>>>>> -        pci_cleanup_aer_uncorrect_error_status(dev);
>>>>> +        pci_cleanup_aer_uncorrect_error_status(udev);
>>>>>      }
>>>>>
>>>>>      if (result == PCI_ERS_RESULT_RECOVERED) {
>>>>>          if (pcie_wait_for_link(udev, true))
>>>>>              pci_rescan_bus(udev->bus);
>>>>> -        pci_info(dev, "Device recovery from fatal error 
>>>>> successful\n");
>>>>> +        /* find the pci_dev after rescanning the bus */
>>>>> +        dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
>>>> one of the motivations was to remove and re-enumerate rather then
>>>> going thorugh driver's recovery sequence
>>>> was; it might be possible that hotplug capable bridge, the device
>>>> might have changed.
>>>> hence this check will fail
>>
>> Thanks Oza for your information. Instead of searching for the dev_pci,
>> should I just use the stored domain:bus:devfn to printout the info?
>>
>>>>> +        if (dev)
>>>>> +            pci_info(dev, "Device recovery from fatal error 
>>>>> successful\n");
>>>>> +        else
>>>>> +            pr_err("AER: Can not find pci_dev for 
>>>>> %04x:%02x:%02x.%x\n",
>>>>> +                   domain, bus,
>>>>> +                   PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>> +        pci_dev_put(dev);
>>
>> That is, replace above with:
>>
>>     pr_info("AER: Device %04x:%02x:%02x.%x recover from fatal error
>> successful\n", domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>
> is this not sufficient to print ?  which is there already.
> pci_info(dev, "Device recovery from fatal error successful\n");

Unfortunately, I can't use "dev" because, as you said, I can't rely on 
pci_get_domain_bus_and_slot() to find it after rescanning.

-T

> 
>>
>>>>>      } else {
>>>>> -        pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
>>>>> -        pci_info(dev, "Device recovery from fatal error failed\n");
>>>>> +        if (is_bridge_device)
>>>> previously there was no condition.. why is this required now ?
>>>> and it was doing on "dev" while you are doing it on "udev"
>>> is that the reason we dont watnt o use dev after it is removed ? 
>>> instead do pci_uevent_ers on bridge ?
>>
>>
>> Yes, that's exactly the reason. I should have written down the reason
>> in the commit log. Here is the details explanation for the benefit of
>> others. If the failing device is a bridge device, udev is equal to
>> dev. So we can use udev in the pci_uevent_ers. But for non bridge
>> device the device is already removed from the bus and thus can't send
>> pci_uevent_ers.
>>
>> Thank you,
>> Thomas
>>
>>>>> +            pci_uevent_ers(udev, PCI_ERS_RESULT_DISCONNECT);
>>>>> +        pr_err("AER: Device %04x:%02x:%02x.%x recovery from fatal 
>>>>> error failed\n",
>>>>> +               domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>>      }
>>>>>
>>>>> -    pci_dev_put(dev);
>>>>>      pci_unlock_rescan_remove();
>>>>>  }
>>>>
>>>> Regards,
>>>> Oza.
Oza Pawandeep Aug. 15, 2018, 3:26 p.m. UTC | #6
On 2018-08-15 20:32, Thomas Tai wrote:
> On 08/15/2018 10:57 AM, poza@codeaurora.org wrote:
>> On 2018-08-14 19:21, Thomas Tai wrote:
>>> On 08/14/2018 05:22 AM, poza@codeaurora.org wrote:
>>>> On 2018-08-14 14:46, poza@codeaurora.org wrote:
>>>>> On 2018-08-13 22:21, Thomas Tai wrote:
>>>>>> In order to prevent the pcie_do_fatal_recovery() from
>>>>>> using the device after it is removed, the device's
>>>>>> domain:bus:devfn is stored at the entry of
>>>>>> pcie_do_fatal_recovery(). After rescanning the bus, the
>>>>>> stored domain:bus:devfn is used to find the device and
>>>>>> uses to report pci_info. The original issue only happens
>>>>>> on an non-bridge device, a local variable is used instead
>>>>>> of checking the device's header type.
>>>>>> 
>>>>>> Signed-off-by: Thomas Tai <thomas.tai@oracle.com>
>>>>>> ---
>>>>>>  drivers/pci/pcie/err.c | 33 +++++++++++++++++++++++----------
>>>>>>  1 file changed, 23 insertions(+), 10 deletions(-)
>>>>>> 
>>>>>> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
>>>>>> index f02e334..3414445 100644
>>>>>> --- a/drivers/pci/pcie/err.c
>>>>>> +++ b/drivers/pci/pcie/err.c
>>>>>> @@ -287,15 +287,20 @@ void pcie_do_fatal_recovery(struct pci_dev 
>>>>>> *dev,
>>>>>> u32 service)
>>>>>>      struct pci_bus *parent;
>>>>>>      struct pci_dev *pdev, *temp;
>>>>>>      pci_ers_result_t result;
>>>>>> +    bool is_bridge_device = false;
>>>>>> +    u16 domain = pci_domain_nr(dev->bus);
>>>>>> +    u8 bus = dev->bus->number;
>>>>>> +    u8 devfn = dev->devfn;
>>>>>> 
>>>>>> -    if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
>>>>>> +    if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
>>>>>> +        is_bridge_device = true;
>>>>>>          udev = dev;
>>>>>> -    else
>>>>>> +    } else {
>>>>>>          udev = dev->bus->self;
>>>>>> +    }
>>>>>> 
>>>>>>      parent = udev->subordinate;
>>>>>>      pci_lock_rescan_remove();
>>>>>> -    pci_dev_get(dev);
>>>>>>      list_for_each_entry_safe_reverse(pdev, temp, 
>>>>>> &parent->devices,
>>>>>>                       bus_list) {
>>>>>>          pci_dev_get(pdev);
>>>>>> @@ -309,27 +314,35 @@ void pcie_do_fatal_recovery(struct pci_dev 
>>>>>> *dev,
>>>>>> u32 service)
>>>>>> 
>>>>>>      result = reset_link(udev, service);
>>>>>> 
>>>>>> -    if ((service == PCIE_PORT_SERVICE_AER) &&
>>>>>> -        (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
>>>>>> +    if (service == PCIE_PORT_SERVICE_AER && is_bridge_device) {
>>>>>>          /*
>>>>>>           * If the error is reported by a bridge, we think this 
>>>>>> error
>>>>>>           * is related to the downstream link of the bridge, so we
>>>>>>           * do error recovery on all subordinates of the bridge 
>>>>>> instead
>>>>>>           * of the bridge and clear the error status of the 
>>>>>> bridge.
>>>>>>           */
>>>>>> -        pci_cleanup_aer_uncorrect_error_status(dev);
>>>>>> +        pci_cleanup_aer_uncorrect_error_status(udev);
>>>>>>      }
>>>>>> 
>>>>>>      if (result == PCI_ERS_RESULT_RECOVERED) {
>>>>>>          if (pcie_wait_for_link(udev, true))
>>>>>>              pci_rescan_bus(udev->bus);
>>>>>> -        pci_info(dev, "Device recovery from fatal error 
>>>>>> successful\n");
>>>>>> +        /* find the pci_dev after rescanning the bus */
>>>>>> +        dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
>>>>> one of the motivations was to remove and re-enumerate rather then
>>>>> going thorugh driver's recovery sequence
>>>>> was; it might be possible that hotplug capable bridge, the device
>>>>> might have changed.
>>>>> hence this check will fail
>>> 
>>> Thanks Oza for your information. Instead of searching for the 
>>> dev_pci,
>>> should I just use the stored domain:bus:devfn to printout the info?
>>> 
>>>>>> +        if (dev)
>>>>>> +            pci_info(dev, "Device recovery from fatal error 
>>>>>> successful\n");
>>>>>> +        else
>>>>>> +            pr_err("AER: Can not find pci_dev for 
>>>>>> %04x:%02x:%02x.%x\n",
>>>>>> +                   domain, bus,
>>>>>> +                   PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>>> +        pci_dev_put(dev);
>>> 
>>> That is, replace above with:
>>> 
>>>     pr_info("AER: Device %04x:%02x:%02x.%x recover from fatal error
>>> successful\n", domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>> 
>> is this not sufficient to print ?  which is there already.
>> pci_info(dev, "Device recovery from fatal error successful\n");
> 
> Unfortunately, I can't use "dev" because as you said I can't
> pci_get_domain_slot to search for it after rescanning.
> 
> -T

If I understood the patch correctly,
1) it restructures pci_dev_put(dev);
2) we now send the uevent only if it is a bridge;
3) the other logic, where you store and compare the BDF, is not required, 
although we have to fix the following:
pci_info(dev, "Device recovery from fatal error successful\n");


Probably 1) and 2) could be a separate patch.

By the way,
in my testing, even after removing the device, the call pci_uevent_ers(dev, 
PCI_ERS_RESULT_DISCONNECT); did not create any problem.
> 

> 
>> 
>>> 
>>>>>>      } else {
>>>>>> -        pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
>>>>>> -        pci_info(dev, "Device recovery from fatal error 
>>>>>> failed\n");
>>>>>> +        if (is_bridge_device)
>>>>> previously there was no condition.. why is this required now ?
>>>>> and it was doing on "dev" while you are doing it on "udev"
>>>> is that the reason we dont watnt o use dev after it is removed ? 
>>>> instead do pci_uevent_ers on bridge ?
>>> 
>>> 
>>> Yes, that's exactly the reason. I should have written down the reason
>>> in the commit log. Here is the details explanation for the benefit of
>>> others. If the failing device is a bridge device, udev is equal to
>>> dev. So we can use udev in the pci_uevent_ers. But for non bridge
>>> device the device is already removed from the bus and thus can't send
>>> pci_uevent_ers.
>>> 
>>> Thank you,
>>> Thomas
>>> 
>>>>>> +            pci_uevent_ers(udev, PCI_ERS_RESULT_DISCONNECT);
>>>>>> +        pr_err("AER: Device %04x:%02x:%02x.%x recovery from fatal 
>>>>>> error failed\n",
>>>>>> +               domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>>>      }
>>>>>> 
>>>>>> -    pci_dev_put(dev);
>>>>>>      pci_unlock_rescan_remove();
>>>>>>  }
>>>>> 
>>>>> Regards,
>>>>> Oza.
Thomas Tai Aug. 15, 2018, 3:43 p.m. UTC | #7
[ ... ]>>>>>>> +        if (dev)
>>>>>>> +            pci_info(dev, "Device recovery from fatal error 
>>>>>>> successful\n");
>>>>>>> +        else
>>>>>>> +            pr_err("AER: Can not find pci_dev for 
>>>>>>> %04x:%02x:%02x.%x\n",
>>>>>>> +                   domain, bus,
>>>>>>> +                   PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>>>> +        pci_dev_put(dev);
>>>>
>>>> That is, replace above with:
>>>>
>>>>     pr_info("AER: Device %04x:%02x:%02x.%x recover from fatal error
>>>> successful\n", domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>
>>> is this not sufficient to print ?  which is there already.
>>> pci_info(dev, "Device recovery from fatal error successful\n");
>>
>> Unfortunately, I can't use "dev" because as you said I can't
>> pci_get_domain_slot to search for it after rescanning.
>>
>> -T
> 
> If I understood the patch correctly,
> 1) it is restructuring of pci_dev_put(dev);
> 2) now we are sending uevent only if it is bridge.
> 3) the other logic where you store and compare bdf is not required, 
> although we have to fix following.
> pci_info(dev, "Device recovery from fatal error successful\n");
> 
> 
> probably 1) and 2) could be a separate patch.

Yes Oza. I will separate 1) and 2) and fix the pci_info(dev, "...") to 
use domain:bus:devfn

> 
> by the way,
> In my testing even after removing device the call pci_uevent_ers(dev, 
> PCI_ERS_RESULT_DISCONNECT); did not create any problem.

I think that is dangerous if we restructure pci_dev_put(dev), i.e. revert 
commit bd91b56cb3b27492963caeb5fccefe20a986ca8d. The dev structure is 
freed and we shouldn't really use it to send pci_uevent_ers(). Although 
pci_uevent_ers() may be smart enough to avoid a crash, we should only 
use it while the dev has not been removed.
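
As a rough illustration of the hazard (sketched from the remove loop that 
pcie_do_fatal_recovery() already contains; simplified, not the literal code):

	list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
					 bus_list) {
		pci_dev_get(pdev);
		/* ... mark disconnected, handle any subordinate buses ... */
		pci_stop_and_remove_bus_device(pdev);
		pci_dev_put(pdev);	/* may drop the last reference */
	}
	/* from here on, the original "dev" pointer may refer to freed memory */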

Thanks,
Thomas

>>
> 
>>
>>>
>>>>
>>>>>>>      } else {
>>>>>>> -        pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
>>>>>>> -        pci_info(dev, "Device recovery from fatal error failed\n");
>>>>>>> +        if (is_bridge_device)
>>>>>> previously there was no condition.. why is this required now ?
>>>>>> and it was doing on "dev" while you are doing it on "udev"
>>>>> is that the reason we dont watnt o use dev after it is removed ? 
>>>>> instead do pci_uevent_ers on bridge ?
>>>>
>>>>
>>>> Yes, that's exactly the reason. I should have written down the reason
>>>> in the commit log. Here is the details explanation for the benefit of
>>>> others. If the failing device is a bridge device, udev is equal to
>>>> dev. So we can use udev in the pci_uevent_ers. But for non bridge
>>>> device the device is already removed from the bus and thus can't send
>>>> pci_uevent_ers.
>>>>
>>>> Thank you,
>>>> Thomas
>>>>
>>>>>>> +            pci_uevent_ers(udev, PCI_ERS_RESULT_DISCONNECT);
>>>>>>> +        pr_err("AER: Device %04x:%02x:%02x.%x recovery from 
>>>>>>> fatal error failed\n",
>>>>>>> +               domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>>>>      }
>>>>>>>
>>>>>>> -    pci_dev_put(dev);
>>>>>>>      pci_unlock_rescan_remove();
>>>>>>>  }
>>>>>>
>>>>>> Regards,
>>>>>> Oza.
Oza Pawandeep Aug. 15, 2018, 3:59 p.m. UTC | #8
On 2018-08-15 21:13, Thomas Tai wrote:
> [ ... ]>>>>>>> +        if (dev)
>>>>>>>> +            pci_info(dev, "Device recovery from fatal error 
>>>>>>>> successful\n");
>>>>>>>> +        else
>>>>>>>> +            pr_err("AER: Can not find pci_dev for 
>>>>>>>> %04x:%02x:%02x.%x\n",
>>>>>>>> +                   domain, bus,
>>>>>>>> +                   PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>>>>> +        pci_dev_put(dev);
>>>>> 
>>>>> That is, replace above with:
>>>>> 
>>>>>     pr_info("AER: Device %04x:%02x:%02x.%x recover from fatal error
>>>>> successful\n", domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>> 
>>>> is this not sufficient to print ?  which is there already.
>>>> pci_info(dev, "Device recovery from fatal error successful\n");
>>> 
>>> Unfortunately, I can't use "dev" because as you said I can't
>>> pci_get_domain_slot to search for it after rescanning.
>>> 
>>> -T
>> 
>> If I understood the patch correctly,
>> 1) it is restructuring of pci_dev_put(dev);
>> 2) now we are sending uevent only if it is bridge.
>> 3) the other logic where you store and compare bdf is not required, 
>> although we have to fix following.
>> pci_info(dev, "Device recovery from fatal error successful\n");
>> 
>> 
>> probably 1) and 2) could be a separate patch.
> 
> Yes Oza. I will separate 1) and 2) and fix the pci_info(dev, "...") to
> use domain:bus:devfn

Thanks Thomas,
although I am not sure about 3), i.e. doing the following:

  dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
  if (dev)
          pci_info(dev, "Device recovery from fatal error successful\n");

It may also be the case that the device was removed and there is no new 
device that matches the BDF; in any case recovery has happened as far as 
the PCIe link is concerned.

Surprisingly, in my case the following also went fine:
pci_info(dev, "Device recovery from fatal error successful\n");
I guess that after re-enumeration I got the same dev back
(since it enumerated the same device).

How about something like this:

  dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
  if (dev)
          pci_info(dev, "Device recovery from fatal error successful\n");
  else
          pr_info("AER: Device recovery from fatal error successful\n");

Regards,
Oza.

> 
>> 
>> by the way,
>> In my testing even after removing device the call pci_uevent_ers(dev, 
>> PCI_ERS_RESULT_DISCONNECT); did not create any problem.
> 
> I think that is dangerous, if we restructure pci_dev_put(dev), ie
> revert commit bd91b56cb3b27492963caeb5fccefe20a986ca8d. The dev
> structure is freed and we shouldn't really use it to send
> pci_uevent_ers(). Although pci_uevent_ers() may be smart enough to
> figure out to avoid crash, we should only use it when the dev is not
> removed.
> 

I Agree.

> Thanks,
> Thomas
> 
>>> 
>> 
>>> 
>>>> 
>>>>> 
>>>>>>>>      } else {
>>>>>>>> -        pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
>>>>>>>> -        pci_info(dev, "Device recovery from fatal error 
>>>>>>>> failed\n");
>>>>>>>> +        if (is_bridge_device)
>>>>>>> previously there was no condition.. why is this required now ?
>>>>>>> and it was doing on "dev" while you are doing it on "udev"
>>>>>> is that the reason we dont watnt o use dev after it is removed ? 
>>>>>> instead do pci_uevent_ers on bridge ?
>>>>> 
>>>>> 
>>>>> Yes, that's exactly the reason. I should have written down the 
>>>>> reason
>>>>> in the commit log. Here is the details explanation for the benefit 
>>>>> of
>>>>> others. If the failing device is a bridge device, udev is equal to
>>>>> dev. So we can use udev in the pci_uevent_ers. But for non bridge
>>>>> device the device is already removed from the bus and thus can't 
>>>>> send
>>>>> pci_uevent_ers.
>>>>> 
>>>>> Thank you,
>>>>> Thomas
>>>>> 
>>>>>>>> +            pci_uevent_ers(udev, PCI_ERS_RESULT_DISCONNECT);
>>>>>>>> +        pr_err("AER: Device %04x:%02x:%02x.%x recovery from 
>>>>>>>> fatal error failed\n",
>>>>>>>> +               domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>>>>>      }
>>>>>>>> 
>>>>>>>> -    pci_dev_put(dev);
>>>>>>>>      pci_unlock_rescan_remove();
>>>>>>>>  }
>>>>>>> 
>>>>>>> Regards,
>>>>>>> Oza.
Thomas Tai Aug. 15, 2018, 4:04 p.m. UTC | #9
On 08/15/2018 11:59 AM, poza@codeaurora.org wrote:
> On 2018-08-15 21:13, Thomas Tai wrote:
>> [ ... ]>>>>>>> +        if (dev)
>>>>>>>>> +            pci_info(dev, "Device recovery from fatal error 
>>>>>>>>> successful\n");
>>>>>>>>> +        else
>>>>>>>>> +            pr_err("AER: Can not find pci_dev for 
>>>>>>>>> %04x:%02x:%02x.%x\n",
>>>>>>>>> +                   domain, bus,
>>>>>>>>> +                   PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>>>>>> +        pci_dev_put(dev);
>>>>>>
>>>>>> That is, replace above with:
>>>>>>
>>>>>>     pr_info("AER: Device %04x:%02x:%02x.%x recover from fatal error
>>>>>> successful\n", domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>>>
>>>>> is this not sufficient to print ?  which is there already.
>>>>> pci_info(dev, "Device recovery from fatal error successful\n");
>>>>
>>>> Unfortunately, I can't use "dev" because as you said I can't
>>>> pci_get_domain_slot to search for it after rescanning.
>>>>
>>>> -T
>>>
>>> If I understood the patch correctly,
>>> 1) it is restructuring of pci_dev_put(dev);
>>> 2) now we are sending uevent only if it is bridge.
>>> 3) the other logic where you store and compare bdf is not required, 
>>> although we have to fix following.
>>> pci_info(dev, "Device recovery from fatal error successful\n");
>>>
>>>
>>> probably 1) and 2) could be a separate patch.
>>
>> Yes Oza. I will separate 1) and 2) and fix the pci_info(dev, "...") to
>> use domain:bus:devfn
> 
> Thanks Thomas,
> although I am not sure on 3)  by doing following.
> 
>   dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
>       if (dev)
>          pci_info(dev, "Device recovery from fatal error successful\n");
> 
> but also it may be the case that device is removed and there is no new 
> device which would match BDF
> so in any case recovery has happened as far as PCIe link is concerned.
> 
> surprisingly in my case followin also went fine
> pci_info(dev, "Device recovery from fatal error successful\n");
> I guess probably after re-enumeration, I got the sane dev back !! (since 
> it enumerated the same device)
> 
> how about something like this
> 
> dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
>       if (dev)
>          pci_info(dev, "Device recovery from fatal error successful\n");
>        else
>           pr_ifo ("AER: Device recovery from fatal error successful")

That is cool! Thank you for your suggestion, I will do so in V2.

Thomas

> 
> Regards,
> Oza.
> 
>>
>>>
>>> by the way,
>>> In my testing even after removing device the call pci_uevent_ers(dev, 
>>> PCI_ERS_RESULT_DISCONNECT); did not create any problem.
>>
>> I think that is dangerous, if we restructure pci_dev_put(dev), ie
>> revert commit bd91b56cb3b27492963caeb5fccefe20a986ca8d. The dev
>> structure is freed and we shouldn't really use it to send
>> pci_uevent_ers(). Although pci_uevent_ers() may be smart enough to
>> figure out to avoid crash, we should only use it when the dev is not
>> removed.
>>
> 
> I Agree.
> 
>> Thanks,
>> Thomas
>>
>>>>
>>>
>>>>
>>>>>
>>>>>>
>>>>>>>>>      } else {
>>>>>>>>> -        pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
>>>>>>>>> -        pci_info(dev, "Device recovery from fatal error 
>>>>>>>>> failed\n");
>>>>>>>>> +        if (is_bridge_device)
>>>>>>>> previously there was no condition.. why is this required now ?
>>>>>>>> and it was doing on "dev" while you are doing it on "udev"
>>>>>>> is that the reason we dont watnt o use dev after it is removed ? 
>>>>>>> instead do pci_uevent_ers on bridge ?
>>>>>>
>>>>>>
>>>>>> Yes, that's exactly the reason. I should have written down the reason
>>>>>> in the commit log. Here is the details explanation for the benefit of
>>>>>> others. If the failing device is a bridge device, udev is equal to
>>>>>> dev. So we can use udev in the pci_uevent_ers. But for non bridge
>>>>>> device the device is already removed from the bus and thus can't send
>>>>>> pci_uevent_ers.
>>>>>>
>>>>>> Thank you,
>>>>>> Thomas
>>>>>>
>>>>>>>>> +            pci_uevent_ers(udev, PCI_ERS_RESULT_DISCONNECT);
>>>>>>>>> +        pr_err("AER: Device %04x:%02x:%02x.%x recovery from 
>>>>>>>>> fatal error failed\n",
>>>>>>>>> +               domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
>>>>>>>>>      }
>>>>>>>>>
>>>>>>>>> -    pci_dev_put(dev);
>>>>>>>>>      pci_unlock_rescan_remove();
>>>>>>>>>  }
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Oza.
Benjamin Herrenschmidt Aug. 15, 2018, 9:55 p.m. UTC | #10
On Tue, 2018-08-14 at 14:52 +0530, poza@codeaurora.org wrote:
> > >       if (result == PCI_ERS_RESULT_RECOVERED) {
> > >               if (pcie_wait_for_link(udev, true))
> > >                       pci_rescan_bus(udev->bus);
> > > -            pci_info(dev, "Device recovery from fatal error successful\n");
> > > +            /* find the pci_dev after rescanning the bus */
> > > +            dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
> > 
> > one of the motivations was to remove and re-enumerate rather then
> > going thorugh driver's recovery sequence
> > was; it might be possible that hotplug capable bridge, the device
> > might have changed.
> > hence this check will fail

Under what circumstances do you actually "unplug" the device ? We are
trying to clean up/fix some of the PowerPC EEH code, which is in a way
similar to AER, and we found that this unplug/replug, which we do only
if the driver doesn't have recovery callbacks, is causing more
problems than it solves.

We are moving toward instead unbinding the driver, resetting the
device, then re-binding the driver instead of unplug/replug.

Also, why would you ever bypass the driver callbacks if the driver has
some ? The whole point is to keep the driver bound while resetting the
device (provided it has the right callbacks) so we don't lose the
linkage between storage devices and mounted filesystems.

Cheers,
Ben.
Benjamin Herrenschmidt Aug. 15, 2018, 9:56 p.m. UTC | #11
(resent using my proper email address)

On Tue, 2018-08-14 at 14:52 +0530, poza@codeaurora.org wrote:
> > >       if (result == PCI_ERS_RESULT_RECOVERED) {
> > >               if (pcie_wait_for_link(udev, true))
> > >                       pci_rescan_bus(udev->bus);
> > > -            pci_info(dev, "Device recovery from fatal error successful\n");
> > > +            /* find the pci_dev after rescanning the bus */
> > > +            dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
> > 
> > one of the motivations was to remove and re-enumerate rather then
> > going thorugh driver's recovery sequence
> > was; it might be possible that hotplug capable bridge, the device
> > might have changed.
> > hence this check will fail

Under what circumstances do you actually "unplug" the device ? We are
trying to clean up/fix some of the PowerPC EEH code, which is in a way
similar to AER, and we found that this unplug/replug, which we do only
if the driver doesn't have recovery callbacks, is causing more
problems than it solves.

We are moving toward instead unbinding the driver, resetting the
device, then re-binding the driver instead of unplug/replug.

Also, why would you ever bypass the driver callbacks if the driver has
some ? The whole point is to keep the driver bound while resetting the
device (provided it has the right callbacks) so we don't lose the
linkage between storage devices and mounted filesystems.

Cheers,
Ben.
Oza Pawandeep Aug. 16, 2018, 6:36 a.m. UTC | #12
On 2018-08-16 03:26, Benjamin Herrenschmidt wrote:
> (resent using my proper email address)
> 
> On Tue, 2018-08-14 at 14:52 +0530, poza@codeaurora.org wrote:
>> > >       if (result == PCI_ERS_RESULT_RECOVERED) {
>> > >               if (pcie_wait_for_link(udev, true))
>> > >                       pci_rescan_bus(udev->bus);
>> > > -            pci_info(dev, "Device recovery from fatal error successful\n");
>> > > +            /* find the pci_dev after rescanning the bus */
>> > > +            dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
>> >
>> > one of the motivations was to remove and re-enumerate rather then
>> > going thorugh driver's recovery sequence
>> > was; it might be possible that hotplug capable bridge, the device
>> > might have changed.
>> > hence this check will fail
> 
> Under what circumstances do you actually "unplug" the device ? We are
> trying to cleanup/fix some of the PowerPC EEH code which is in a way
> similar to AER, and we found that this unplug/replug, which we do if
> the driver doesn't have recovery callbacks only, is causing more
> problems than it solves.
> 
> We are moving toward instead unbinding the driver, resetting the
> device, then re-binding the driver instead of unplug/replug.
> 
> Also why would you ever bypass the driver callbacks if the driver has
> some ? The whole point is to keep the driver bound while resetting the
> device (provided it has the right callbacks) so we don't lose the
> linkage between stroage devices and mounted filesystems.
> 

there are two different paths we are taking, one for ERR_FATAL and the 
other for ERR_NONFATAL.
While AER and DPC converge on the ERR_FATAL path, the ERR_NONFATAL path 
is the one where AER and the driver callbacks come into the picture.

Since DPC does HW recovery of the link, and handles only ERR_FATAL, we 
do remove the devices and re-enumerate them.
But if the error is not fatal, we fall back to the driver callbacks to 
initiate the link error handling and recovery process.
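
Very roughly, the split being described (a simplified sketch, not the 
literal kernel code):

	if (severity == AER_FATAL)
		/* reset the link, remove and re-enumerate the devices */
		pcie_do_fatal_recovery(dev, PCIE_PORT_SERVICE_AER);
	else
		/* keep the devices and walk the driver callbacks:
		   error_detected -> (slot_reset) -> resume */
		pcie_do_nonfatal_recovery(dev);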

Under normal circumstances I do not expect that downstream devices 
would have been replaced in the case of FATAL errors, but the code 
might not want to assume that the same device is there, since we are 
removing the downstream devices completely. So taking the old BDF and 
keeping it around for later reference is not a good idea.
Although I think some parts of pcie_do_fatal_recovery() need a fix, 
which Thomas is fixing anyway.

So yes, I agree with you and we are talking about the same thing, e.g.
"
  We are moving toward instead unbinding the driver, resetting the
  device, then re-binding the driver instead of unplug/replug.
"

driver callbacks are very much there, please have a look at 
pcie_do_nonfatal_recovery().

Referring to spec section 6.2.2.2.1 "Fatal Errors", where the link is 
unreliable and may need an AER-style reset of the link or DPC-style HW 
recovery: in both cases the shutdown callbacks are expected to be called.
Some drivers handle ERR_NONFATAL and ERR_FATAL in similar ways, e.g.
ioat_pcie_error_detected() calls ioat_shutdown() in the case of 
ERR_NONFATAL, and otherwise ioat_shutdown() in the case of ERR_FATAL.


> Cheers,
> Ben.
Benjamin Herrenschmidt Aug. 16, 2018, 6:51 a.m. UTC | #13
On Thu, 2018-08-16 at 12:06 +0530, poza@codeaurora.org wrote:
> 
> > Under what circumstances do you actually "unplug" the device ? We are
> > trying to cleanup/fix some of the PowerPC EEH code which is in a way
> > similar to AER, and we found that this unplug/replug, which we do if
> > the driver doesn't have recovery callbacks only, is causing more
> > problems than it solves.
> > 
> > We are moving toward instead unbinding the driver, resetting the
> > device, then re-binding the driver instead of unplug/replug.
> > 
> > Also why would you ever bypass the driver callbacks if the driver has
> > some ? The whole point is to keep the driver bound while resetting the
> > device (provided it has the right callbacks) so we don't lose the
> > linkage between stroage devices and mounted filesystems.
> > 
> 
> there are two different paths we are taking, one is for ERR_FATAL and 
> the other is for ERR_NONFATAL.

Ok. With EEH we have a much finer granularity but fundamentally we
don't care as long as we reset the device.

> while AER and DPC converge into ERR_FATAL path; ERR_NONFATAL path is the 
> one where AER and driver callbacks would come into picture.

I disagree. See below.

> since DPC does hw recovery of the link, and handles only ERR_FATAL, we 
> do remove devices and re-enumerate them.

What is "DPC" ?

> but if the error is no fatal, we will fall back to driver callbacks to 
> initiate link error handling and recovery process.

Hrm ... the callbacks were never about link errors, they were about
device errors of any kind and in fact predate the PCIe concept of links.

They include iommu errors, HW errors, SW bugs etc...

> under normal circumstances I do not see the some downstream devices 
> would have been replaced in case of FATAL errors, but code might not 
> want to assume that
> same device is there, since we are removing downstream devices 
> completely. so taking reference of old BDF and keeping it for later 
> reference is not good idea.
> although I think there are some parts of pcie_do_fatal_recovery need a 
> fix, which Thomas is fixing is anyway.

I think you shouldn't be unplugging/replugging.

> so yes, I agree with you and we are talking about same thing e.g.
> "
>   We are moving toward instead unbinding the driver, resetting the
>   device, then re-binding the driver instead of unplug/replug.
> "
> 
> driver callbacks are very much there, please have a look at 
> pcie_do_nonfatal_recovery().

Even for fatal. Why on earth would you skip the driver callbacks ? Most
errors are fatal anyway.

For us, we don't really differentiate because, in order to avoid any
form of propagation of bad data, our HW will freeze/isolate a device on
any error, whether it's fatal or non-fatal.

So we treat them both as needing a full recovery which usually implies
a reset of the device.

However, we do that through the driver callbacks because otherwise you
will lose the linkage between a storage driver and the mounted file
systems, which cannot be recovered, so you are killing your system.

> refering Sepc: 6.2.2.2.1 Fatal Errors, where link is unreliable and it 
> might need AER style reset of link or DPC style HW recovery
> In both cases, the shutdown callbacks are expected to be called,

No, this is wrong and not the intent of the error handling.

You seem to be applying PCIe specific concepts brain-farted at Intel
that are way way away from what we care about in practice and in Linux.

> e.g.  some driver handle errors ERR_NONFATAL or FATAL in similar ways
> e.g.
> ioat_pcie_error_detected();  calls  ioat_shutdown(); in case of 
> ERR_NONFATAL
> otherwise ioat_shutdown() in case of ERR_FATAL.

Since when do the error handling callbacks even have the concept of FATAL
vs. non-fatal ? This doesn't appear anywhere in the prototype of
struct pci_error_handlers and shouldn't.
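
For reference, the prototypes in question look roughly like this in 
include/linux/pci.h of this era; the only severity the driver ever sees 
is the pci_channel_state, not FATAL vs. NONFATAL:

	struct pci_error_handlers {
		/* PCI bus error detected on this device */
		pci_ers_result_t (*error_detected)(struct pci_dev *dev,
						   enum pci_channel_state error);
		/* MMIO has been re-enabled, but not DMA */
		pci_ers_result_t (*mmio_enabled)(struct pci_dev *dev);
		/* PCI slot has been reset */
		pci_ers_result_t (*slot_reset)(struct pci_dev *dev);
		/* PCI function reset prepare or completed */
		void (*reset_prepare)(struct pci_dev *dev);
		void (*reset_done)(struct pci_dev *dev);
		/* Device driver may resume normal operations */
		void (*resume)(struct pci_dev *dev);
	};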

Ben.

> 
> > Cheers,
> > Ben.
Benjamin Herrenschmidt Aug. 16, 2018, 6:59 a.m. UTC | #14
On Thu, 2018-08-16 at 16:51 +1000, Benjamin Herrenschmidt wrote:
> > refering Sepc: 6.2.2.2.1 Fatal Errors, where link is unreliable and it 
> > might need AER style reset of link or DPC style HW recovery
> > In both cases, the shutdown callbacks are expected to be called,
> 
> No, this is wrong and not the intent of the error handling.
> 
> You seem to be applying PCIe specific concepts brain-farted at Intel
> that are way way away from what we care about in practice and in Linux.
> 
> > e.g.  some driver handle errors ERR_NONFATAL or FATAL in similar ways
> > e.g.
> > ioat_pcie_error_detected();  calls  ioat_shutdown(); in case of 
> > ERR_NONFATAL
> > otherwise ioat_shutdown() in case of ERR_FATAL.
> 
> Since when the error handling callbacks even have the concept of FATAL
> vs. non-fatal ? This doesn't appear anyhwhere in the prototype of the
> struct pci_error_handlers and shouldn't.

In fact looking at pcie_do_nonfatal_recovery() it's indeed completely
broken. It tells the driver that the slot was reset without actually
resetting anything... Ugh.

Ben.
Benjamin Herrenschmidt Aug. 16, 2018, 7:05 a.m. UTC | #15
On Thu, 2018-08-16 at 16:51 +1000, Benjamin Herrenschmidt wrote:
> No, this is wrong and not the intent of the error handling.
> 
> You seem to be applying PCIe specific concepts brain-farted at Intel
> that are way way away from what we care about in practice and in Linux.
> 
> > e.g.  some driver handle errors ERR_NONFATAL or FATAL in similar ways
> > e.g.
> > ioat_pcie_error_detected();  calls  ioat_shutdown(); in case of 
> > ERR_NONFATAL
> > otherwise ioat_shutdown() in case of ERR_FATAL.
> 
> Since when the error handling callbacks even have the concept of FATAL
> vs. non-fatal ? This doesn't appear anyhwhere in the prototype of the
> struct pci_error_handlers and shouldn't.

Ugh... I just saw the changes you did to Documentation/PCI/pci-error-
recovery.txt and I would very much like to revert those !

Bjorn, you shouldn't let changes to the PCI error handling through
without acks from us, it looks like we didn't notice (we can't possibly
monitor all lists).

We wrote that in the first place and our EEH infrastructure relies
heavily on it.

Poza, you seem to have not understood the intent of the code and are
now changing the rules in ways that are broken in our opinion. This is
bad.

Bjorn, please revert all of those changes.

There was NEVER an intent to separate fatal from non-fatal at that
level. We could pass the information to the driver if we wished but the
recovery sequence is NOT intended to be different.

Especially we specifically do NOT want to unplug and replug the device
for fatal errors at all. This is not going to work with drivers that
cannot re-link with their various kernel services, such as storage
devices re-connecting with mounted file systems etc...

Those changes are utterly broken.

The basic premise of the design is that we would do that unplug/replug
trick if and ONLY IF the driver doesn't have the appropriate callbacks.

We are also now looking at replacing this with an unbind/re-bind because
in practice the unplugging is causing us all sorts of problems. Sam
(CC) can elaborate.
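
Roughly the distinction, in driver-core terms (an illustration only, not 
the actual EEH or AER code; here "pdev" stands for the failing device and 
"bridge" for its upstream port):

	/* unplug/replug: the struct pci_dev is torn down and re-created */
	pci_stop_and_remove_bus_device(pdev);
	pci_rescan_bus(bridge->subordinate);

	/* unbind/re-bind: the same struct pci_dev survives the reset */
	device_release_driver(&pdev->dev);
	/* ... reset the device / recover the link ... */
	device_attach(&pdev->dev);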

Bjorn, we are the main authors of that spec (Linas wrote it under my
supervision) and created those callbacks for EEH. AER picked them up
only later. Those changes must be at the very least acked by us before
going upstream.

Ben.
Benjamin Herrenschmidt Aug. 16, 2018, 7:15 a.m. UTC | #16
On Thu, 2018-08-16 at 17:05 +1000, Benjamin Herrenschmidt wrote:
> Those changes are utterly broken.
> 
> The basic premise of the design that we woudl do that unplug/replug
> trick if and ONLY IF the driver doesn't have the appropriate callbacks.
> 
> We are also now looking at replacing this with an ubind/re-bind because
> in practice, the unplugging is causing us all sort of problems. Sam
> (CC) can elaborate.
> 
> Bjorn, we are the main authors of that spec (Linas wrote it under my
> supervision) and created those callbacks for EEH. AER picked them up
> only later. Those changes must be at the very least acked by us before
> going upstream.

Also I had a quick look at the DPC spec, it looks like a subset of our
EEH capability.

Again, the code in Linux was written without consultation with us; it
misunderstands/misuses the driver callbacks and unplugs devices when it
shouldn't, etc...

It's all very broken.

Please, at the very least revert the spec changes. They are utterly
wrong.

The driver MUST remain active during the recovery process *including*
fatal errors.

Only if the recovery fails and the driver gives up may you choose to
unplug the device (though there is little point).

What you have designed will work fine for network drivers but will not
work for storage.

Ben.
Oza Pawandeep Aug. 16, 2018, 7:56 a.m. UTC | #17
On 2018-08-16 12:45, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 17:05 +1000, Benjamin Herrenschmidt wrote:
>> Those changes are utterly broken.
>> 
>> The basic premise of the design was that we would do that unplug/replug
>> trick if and ONLY IF the driver doesn't have the appropriate 
>> callbacks.
>> 
>> We are also now looking at replacing this with an unbind/re-bind 
>> because
>> in practice, the unplugging is causing us all sorts of problems. Sam
>> (CC) can elaborate.
>> 
>> Bjorn, we are the main authors of that spec (Linas wrote it under my
>> supervision) and created those callbacks for EEH. AER picked them up
>> only later. Those changes must be at the very least acked by us before
>> going upstream.
> 
> Also I had a quick look at the DPC spec, it looks like a subset of our
> EEH capability.
> 
> Again, the code in Linux was written without concertation with us,
> misunderstands/misuses the driver callbacks, does unplugs when it
> shouldn't etc...
> 
> It's all very broken.
> 
> Please, at the very least revert the spec changes. They are utterly
> wrong.
> 
> The driver MUST remain active during the recovery process *including*
> fatal errors.
> 
> > Only if the recovery fails and the driver gives up may you choose to
> unplug the device (though there is little point).
> 
> What you have designed will work fine for network drivers but will not
> work for storage.
> 
> Ben.

Hi Ben,

If you look at the original DPC driver, it was doing exactly the same as
it is doing now,
so with or without the new changes the DPC driver has had the same
behavior from the beginning.

commit 26e515713342b6f7c553aa3c66b21c6ab7cf82af

Regards,
Oza.
Oza Pawandeep Aug. 16, 2018, 8:05 a.m. UTC | #18
On 2018-08-16 12:35, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 16:51 +1000, Benjamin Herrenschmidt wrote:
>> No, this is wrong and not the intent of the error handling.
>> 
>> You seem to be applying PCIe specific concepts brain-farted at Intel
>> that are way way away from what we care about in practice and in 
>> Linux.
>> 
>> > e.g.  some driver handle errors ERR_NONFATAL or FATAL in similar ways
>> > e.g.
>> > ioat_pcie_error_detected();  calls  ioat_shutdown(); in case of
>> > ERR_NONFATAL
>> > otherwise ioat_shutdown() in case of ERR_FATAL.
>> 
>> Since when the error handling callbacks even have the concept of FATAL
>> vs. non-fatal ? This doesn't appear anywhere in the prototype of the
>> struct pci_error_handlers and shouldn't.
> 
> Ugh... I just saw the changes you did to Documentation/PCI/pci-error-
> recovery.txt and I would very much like to revert those !
> 
> Bjorn, you shouldn't let changes to the PCI error handling through
> without acks from us, it looks like we didn't notice (we can't possibly
> monitor all lists).
> 
> We wrote that in the first place and our EEH infrastructure relies
> heavily on it.
> 
> Poza, you seem to have not understood the intent of the code and are
> now changing the rules in ways that are broken in our opinion. This is
> bad.
> 
> Bjorn, please revert all of those changes.
> 
> There was NEVER an intent to separate fatal from non-fatal at that
> level. We could pass the information to the driver if we wished but the
> recovery sequence is NOT intended to be different.
> 
> Especially we specifically do NOT want to unplug and replug the device
> for fatal errors at all. This is not going to work with drivers that
> cannot re-link with their various kernel services, such as storage
> devices re-connecting with mounted file systems etc...
> 
> Those changes are utterly broken.
> 
> The basic premise of the design was that we would do that unplug/replug
> trick if and ONLY IF the driver doesn't have the appropriate callbacks.
> 
> We are also now looking at replacing this with an unbind/re-bind because
> in practice, the unplugging is causing us all sorts of problems. Sam
> (CC) can elaborate.
> 
> Bjorn, we are the main authors of that spec (Linas wrote it under my
> supervision) and created those callbacks for EEH. AER picked them up
> only later. Those changes must be at the very least acked by us before
> going upstream.
> 
> Ben.


+ Sinan

This patch set was on the mailing list for nearly 17 to 18 revisions 
over 7 months.
Besides, the intent was to bring DPC and AER into the same well-defined 
way of error handling.
The way DPC used to behave in 2016 is still the same; it involved 
removing and re-enumerating the devices.

Regards,
Oza.
Oza Pawandeep Aug. 16, 2018, 8:07 a.m. UTC | #19
On 2018-08-16 12:29, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 16:51 +1000, Benjamin Herrenschmidt wrote:
>> > refering Sepc: 6.2.2.2.1 Fatal Errors, where link is unreliable and it
>> > might need AER style reset of link or DPC style HW recovery
>> > In both cases, the shutdown callbacks are expected to be called,
>> 
>> No, this is wrong and not the intent of the error handling.
>> 
>> You seem to be applying PCIe specific concepts brain-farted at Intel
>> that are way way away from what we care about in practice and in 
>> Linux.
>> 
>> > e.g.  some driver handle errors ERR_NONFATAL or FATAL in similar ways
>> > e.g.
>> > ioat_pcie_error_detected();  calls  ioat_shutdown(); in case of
>> > ERR_NONFATAL
>> > otherwise ioat_shutdown() in case of ERR_FATAL.
>> 
>> Since when the error handling callbacks even have the concept of FATAL
>> vs. non-fatal ? This doesn't appear anywhere in the prototype of the
>> struct pci_error_handlers and shouldn't.
> 
> In fact looking at pcie_do_nonfatal_recovery() it's indeed completely
> broken. It tells the driver that the slot was reset without actually
> resetting anything... Ugh.
> 
> Ben.

pcie_do_nonfatal_recovery() exhibits the same behavior with or without 
the patch series.
In short, there was no functional change brought into 
pcie_do_nonfatal_recovery().

Regards,
Oza.
Benjamin Herrenschmidt Aug. 16, 2018, 8:10 a.m. UTC | #20
On Thu, 2018-08-16 at 13:26 +0530, poza@codeaurora.org wrote:
> > Please, at the very least revert the spec changes. They are utterly
> > wrong.
> > 
> > The driver MUST remain active during the recovery process *including*
> > fatal errors.
> > 
> > Only if the recovery fails and the driver gives up may you choose to
> > unplug the device (though there is little point).
> > 
> > What you have designed will work fine for network drivers but will not
> > work for storage.
> > 
> > Ben.
> 
> Hi Ben,
> 
> If you look at original DPC driver it was doing exactly the same as it 
> is doing now.
> so with or without new changes DPC driver had the same behavior from 
> beginning.

Yes and that behaviour is utterly wrong.

> commit 26e515713342b6f7c553aa3c66b21c6ab7cf82af

Yup, I noticed. We don't use DPC, all of that is subsumed into our EEH
infrastructure, so we didn't notice.

But now that the wrongness is spreading it's starting to show,
especially the changes to the spec.

We shouldn't be too focused on ERR_FATAL vs NON_FATAL. HW has a knack
for misreporting these things anyway.

All we really care about is whether the device was isolated and data
was lost or whether it's just an informational corrected error. The
latter should just be reported without impact. The former should lead
to a recovery procedure.

The design of the error callbacks was specifically so that you *could*
reset the link (or even PERST the whole device & retrain) *without*
having to unbind the driver or, worse, soft-unplug the device.
Specifically because once you do the latter, you lose the links to the
mounted filesystems and those cannot be re-established.

So the spec needs to be reverted, DPC fixed and AER as well to do the
right thing here.

In fact it would be nice if we could make some of that logic common
with EEH. Sam (CC) has been cleaning up some of our EEH code which is,
to be honest, a bloody mess, so we might be able to comprehend and
refactor it more easily.

The basic idea is that regardless of the type of error, you first
notify the driver that an error occurred. At that point, the I/O channel
might have been frozen already, so the driver cannot assume
that the device is still reachable, which matches what DPC does, I
believe.

The driver can ask for re-enabling the channel if it thinks it can
attempt a recovery, but the core doesn't have to honor it and can choose
to just reset the device.

Similarly, in case of re-enabling, the driver can decide that this
didn't work out and request an escalation to a reset.

So typically, a non-fatal error could go through the re-enable phase (in
the case where DPC didn't actually block the channel at all), while a
fatal one would always go to reset.

The only case where we would "unplug" on the linux side is when the
driver doesn't have error handling callbacks.
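
As a rough sketch of that premise (illustration only, not the current
err.c code; the function names below are made up):

#include <linux/pci.h>

/*
 * Sketch: fall back to remove/re-enumerate only when the bound driver
 * offers no error handling callbacks; otherwise keep the driver bound
 * and walk the recovery sequence described above.
 */
static bool sketch_has_err_handlers(struct pci_dev *pdev)
{
	return pdev->driver && pdev->driver->err_handler &&
	       pdev->driver->err_handler->error_detected;
}

static void sketch_handle_uncorrected_error(struct pci_dev *pdev)
{
	if (!sketch_has_err_handlers(pdev)) {
		/*
		 * Last resort: no callbacks, so remove the device
		 * (real code would do this under pci_lock_rescan_remove()).
		 */
		pci_stop_and_remove_bus_device(pdev);
		return;
	}

	/*
	 * Driver participates: error_detected() -> (mmio_enabled()) ->
	 * optional reset -> slot_reset() -> resume().
	 */
}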

Cheers,
Ben.
Benjamin Herrenschmidt Aug. 16, 2018, 8:12 a.m. UTC | #21
On Thu, 2018-08-16 at 13:37 +0530, poza@codeaurora.org wrote:
> > 
> > In fact looking at pcie_do_nonfatal_recovery() it's indeed completely
> > broken. It tells the driver that the slot was reset without actually
> > resetting anything... Ugh.
> > 
> > Ben.
> 
> pcie_do_nonfatal_recovery() exhibit the same behavior with or without 
> the patch-series.
> in short, there was no functional change brought in to 
> pcie_do_nonfatal_recovery()

Yes, I know, I'm just saying what it does is broken :-)

Keep in mind that those callbacks were designed originally for EEH
(which predates AER), and so was the spec written.

We don't actually use the AER code on POWER today, so we didn't notice
how broken the implementation was :-)

We should fix that.

Either we can sort all that out by email, or we should plan some kind
of catch-up, either at Plumbers (provided I can go there) or maybe a
webex call.

Cheers,
Ben.
Benjamin Herrenschmidt Aug. 16, 2018, 8:15 a.m. UTC | #22
On Thu, 2018-08-16 at 13:35 +0530, poza@codeaurora.org wrote:
> > 
> > Bjorn, we are the main authors of that spec (Linas wrote it under my
> > supervision) and created those callbacks for EEH. AER picked them up
> > only later. Those changes must be at the very least acked by us before
> > going upstream.
> > 
> > Ben.
> 
> 
> + Sinan
> 
> This patch set was there in mailing list for nearly 17 to 18 revisions 
> for 7 months.

Right and sadly the guy doing EEH on our side left and I didn't notice
what was going on in the list.

But Bjorn should know better :-)

> besides the intent was to bring DPC and AER into the same well defined 
> way of error handling.

That's a good idea, but we need to fix DPC and AER understanding of the
intent of those callbacks, not change the spec to match the broken
implementation.

> The way DPC used to behave in 2016, is still the same; which involved 
> removing and re-enumerating the devices.

Which is mostly useless for anything that isn't a network device.

We've been doing EEH for something like 15 to 20 years, so we have a
long experience with what it takes to get PCI(e) devices to recover on
enterprise systems.

Removing and re-enumerating is one of the very worst things you can do 
in that area.

Cheers,
Ben.
Oza Pawandeep Aug. 16, 2018, 8:22 a.m. UTC | #23
On 2018-08-16 13:45, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 13:35 +0530, poza@codeaurora.org wrote:
>> >
>> > Bjorn, we are the main authors of that spec (Linas wrote it under my
>> > supervision) and created those callbacks for EEH. AER picked them up
>> > only later. Those changes must be at the very least acked by us before
>> > going upstream.
>> >
>> > Ben.
>> 
>> 
>> + Sinan
>> 
>> This patch set was there in mailing list for nearly 17 to 18 revisions
>> for 7 months.
> 
> Right and sadly the guy doing EEH on our side left and I didn't notice
> what was going on in the list.
> 
> But Bjorn should know better :-)
> 
>> besides the intent was to bring DPC and AER into the same well defined
>> way of error handling.
> 
> That's a good idea, but we need to fix DPC and AER understanding of the
> intent of those callbacks, not change the spec to match the broken
> implementation.
> 

OK, let's start with what we have rather than going back, because 
reverting the changes is not going to solve anything.
As I mentioned, the behavior of some of the functions and of DPC was the 
same before and now.
But the good thing that happened because of the patches is that there is 
a common framework defined in err.c,
and DPC and AER both act on similar rules (the rules being our 
understanding of the spec).

All we have to do is discuss and evolve it or change it.
We can catch up on webex (Sinan is going to be at the Plumbers 
conference; I might not be able to join there, as we have a bring-up 
coming).


>> The way DPC used to behave in 2016, is still the same; which involved
>> removing and re-enumerating the devices.
> 
> Which is mostly useless for anything that isn't a network device.
> 
> We've been doing EEH for something like 15 to 20 years, so we have a
> long experience with what it takes to get PCI(e) devices to recover on
> enterprise systems.
> 
> Removing and re-enumerating is one of the very worst thing you can do
> in that area.
> 
> Cheers,
> Ben.
Benjamin Herrenschmidt Aug. 16, 2018, 8:28 a.m. UTC | #24
On Thu, 2018-08-16 at 13:52 +0530, poza@codeaurora.org wrote:
> 
> ok lets start with what we have rather than going back, because 
> reverting the changes is not going to solve anything

I'm not saying you should revert the DPC/AER changes, but I want to
revert the *spec* changes, they are wrong and they make EEH now not
match the spec.

IE. Documentation/PCI/pci-error-recovery.txt

> as I mentioned the behavior of some of the functions and DPC (was the 
> same before and now)
> but the good thing happened because of the patches is; there is a common 
> framework defined in err.c
> and DPC and AER both act on similar rules (the rule is what we define 
> understanding of SPEC)

Depends what you call spec. I'm talking about the Linux error recovery
specification.

Let's not muddle it with the PCIe spec itself, or I'll start quoting
Linus about the general usefulness of specs :-)

What we need to decide is how we want Linux and drivers to behave for error
recovery, more or less *regardless* of what the PCIe spec says, because
HW specs are invariably wrong and HW implementations ignore them more
often than not anyway.

> and all we have to do is discuss and evolve it or change it
> we can catch up on webex, (Sinan is going to be there in Plumber's 
> conference, I might not be able to join there, as we have bring-up 
> coming)

Ok, I'll try to get there. Let's plan at least a BOF or two if not a
microconf.

To setup a webex let's first list who needs to attend and respective
timezones so we can figure out a time. I'm in Australia east coast.

> > > The way DPC used to behave in 2016, is still the same; which involved
> > > removing and re-enumerating the devices.
> > 
> > Which is mostly useless for anything that isn't a network device.
> > 
> > We've been doing EEH for something like 15 to 20 years, so we have a
> > long experience with what it takes to get PCI(e) devices to recover on
> > enterprise systems.
> > 
> > Removing and re-enumerating is one of the very worst thing you can do
> > in that area.
> > 
> > Cheers,
> > Ben.
Oza Pawandeep Aug. 16, 2018, 9:03 a.m. UTC | #25
On 2018-08-16 13:42, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 13:37 +0530, poza@codeaurora.org wrote:
>> >
>> > In fact looking at pcie_do_nonfatal_recovery() it's indeed completely
>> > broken. It tells the driver that the slot was reset without actually
>> > resetting anything... Ugh.
>> >
>> > Ben.
>> 
>> pcie_do_nonfatal_recovery() exhibit the same behavior with or without
>> the patch-series.
>> in short, there was no functional change brought in to
>> pcie_do_nonfatal_recovery()
> 
> Yes, I know, I'm just saying what it does is broken :-)


When I said spec, I meant the PCIe spec.
At least the spec distinguishes fatal and non-fatal:

"
Non-fatal errors are uncorrectable errors which cause a particular 
transaction to be unreliable but the Link is otherwise fully functional.
Isolating Non-fatal from Fatal errors provides Requester/Receiver logic 
in a device or system management software the opportunity to recover
from the error without resetting the components on the Link and 
disturbing other transactions in progress.
"

Here the basic assumption is that the link is fully functional, hence we 
do not initiate a link reset (while in the FATAL case we do initiate a 
Secondary Bus Reset).


Okay, so here is what the current pcie_do_nonfatal_recovery() does:

pcie_do_nonfatal_recovery
     report_error_detected()    >> calls driver callbacks
     report_mmio_enabled()
     report_slot_reset()  >> if PCI_ERS_RESULT_NEED_RESET
     report_resume()

If you can explain how it is broken, it will help me to understand.
You might want to point out which calls need to be added, removed, or 
handled differently, especially from a storage point of view.

Regards,
Oza.

> 
> Keep in mind that those callbacks were designed originally for EEH
> (which predates AER), and so was the spec written.
> 
> We don't actually use the AER code on POWER today, so we didn't notice
> how broken the implementation was :-)
> 
> We should fix that.
> 
> Either we can sort all that out by email, or we should plan some kind
> of catch-up, either at Plumbers (provided I can go there) or maybe a
> webex call.
> 
> Cheers,
> Ben.
Benjamin Herrenschmidt Aug. 16, 2018, 10:07 a.m. UTC | #26
On Thu, 2018-08-16 at 14:33 +0530, poza@codeaurora.org wrote:
> On 2018-08-16 13:42, Benjamin Herrenschmidt wrote:
> > 
> when I meant spec, i meant PCIe Spec.
> At least spec distinguish fatal and non-fatal

Yes, I'm well aware of that, however the policy implemented by EEH is
stricter in that *any* uncorrectable error will trigger an immediate
freeze of the endpoint in order to prevent bad data propagation.

However, you don't have to implement it that way for AER. You can
implement a policy where you don't enforce a reset of the device and
link unless the driver requests it.

However, if/when the driver does, then you should honor the driver's wish
and do it, which isn't currently the case in the AER code.

Note that the current error callbacks have no way to convey the fatal
vs. non-fatal information to the driver that I can see; we might want
to change the prototype here with a tree-wide patch if you think that
drivers might care.
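
For reference, a simplified sketch of the callback set being discussed
(roughly what include/linux/pci.h had around v4.18, with
reset_prepare()/reset_done() omitted); the only error context a driver
gets is the channel state passed to error_detected():

struct pci_error_handlers {
	/* PCI bus error detected on this device */
	pci_ers_result_t (*error_detected)(struct pci_dev *dev,
					   pci_channel_state_t state);

	/* MMIO has been re-enabled, but not DMA */
	pci_ers_result_t (*mmio_enabled)(struct pci_dev *dev);

	/* PCI slot has been reset */
	pci_ers_result_t (*slot_reset)(struct pci_dev *dev);

	/* Device driver may resume normal operations */
	void (*resume)(struct pci_dev *dev);
};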

> Non-fatal errors are uncorrectable errors which cause a particular 
> transaction to be unreliable but the Link is otherwise fully functional.
> Isolating Non-fatal from Fatal errors provides Requester/Receiver logic 
> in a device or system management software the opportunity to recover
> from the error without resetting the components on the Link and 
> disturbing other transactions in progress.
> "
> 
> Here the basic assumption is link is fully functional, hence we do not 
> initiate link reset. (while in case FATAL we do initiate Secondary Bus 
> Reset)

See above.
> 
> okay, so here is what current pcie_do_nonfatal_recovery() doe.
> 
> pcie_do_nonfatal_recovery
>      report_error_detected()    >> calls driver callbacks
>      report_mmio_enabled()
>      report_slot_reset()  >> if PCI_ERS_RESULT_NEED_RESET

Above if the driver returned "NEED RESET" we should not just "report" a
slot reset, we should *perform* one :-) Unless the AER code does it in
a place I missed...

Also we should do a hot reset at least, not just a link reset.
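
A minimal sketch of "perform rather than report", assuming the
v4.18-era pci_reset_bridge_secondary_bus() helper can be used at that
point (sketch_reset_and_notify() is a made-up name, not existing code):

static pci_ers_result_t sketch_reset_and_notify(struct pci_dev *dev)
{
	struct pci_dev *bridge = dev->bus->self;

	/* Actually perform a hot reset (secondary bus reset) first */
	if (bridge)
		pci_reset_bridge_secondary_bus(bridge);

	/* Only then tell the driver that the slot has been reset */
	if (dev->driver && dev->driver->err_handler &&
	    dev->driver->err_handler->slot_reset)
		return dev->driver->err_handler->slot_reset(dev);

	return PCI_ERS_RESULT_NONE;
}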

>      report_resume()
> 
> If you suggest how it is broken, it will help me to understand.
> probably you might want to point out what are the calls need to be added 
> or removed or differently handled, specially storage point of view.


> Regards,
> Oza.
> 
> > 
> > Keep in mind that those callbacks were designed originally for EEH
> > (which predates AER), and so was the spec written.
> > 
> > We don't actually use the AER code on POWER today, so we didn't notice
> > how broken the implementation was :-)
> > 
> > We should fix that.
> > 
> > Either we can sort all that out by email, or we should plan some kind
> > of catch-up, either at Plumbers (provided I can go there) or maybe a
> > webex call.
> > 
> > Cheers,
> > Ben.
Thomas Tai Aug. 16, 2018, 1:30 p.m. UTC | #27
[ ... ]>
>> and all we have to do is discuss and evolve it or change it
>> we can catch up on webex, (Sinan is going to be there in Plumber's
>> conference, I might not be able to join there, as we have bring-up
>> coming)
> 
> Ok, I'll try to get there. Let's plan at least a BOF or two if not a
> microconf.
> 
> To setup a webex let's first list who needs to attend and respective
> timezones so we can figure out a time. I'm in Australia east coast.

Hi guys,
I see that there is a lot of discussion regarding the error handling. May 
I join the webex? Hopefully I can be helpful. I'm on the Canadian east 
coast (Ottawa, Ontario, Canada).

Thank you,
Thomas

> 
>>>> The way DPC used to behave in 2016, is still the same; which involved
>>>> removing and re-enumerating the devices.
>>>
>>> Which is mostly useless for anything that isn't a network device.
>>>
>>> We've been doing EEH for something like 15 to 20 years, so we have a
>>> long experience with what it takes to get PCI(e) devices to recover on
>>> enterprise systems.
>>>
>>> Removing and re-enumerating is one of the very worst thing you can do
>>> in that area.
>>>
>>> Cheers,
>>> Ben.
>
Sinan Kaya Aug. 16, 2018, 1:46 p.m. UTC | #28
On 8/16/2018 4:22 AM, poza@codeaurora.org wrote:
> we can catch up on webex, (Sinan is going to be there in Plumber's 
> conference, I might not be able to join there, as we have bring-up coming)

My plumbers conference attendance is not confirmed yet. I'm trying to
combine it with a business trip. I did not get an ACK yet from my boss.
Oza Pawandeep Aug. 16, 2018, 2:11 p.m. UTC | #29
On 2018-08-16 15:37, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 14:33 +0530, poza@codeaurora.org wrote:
>> On 2018-08-16 13:42, Benjamin Herrenschmidt wrote:
>> >
>> when I meant spec, i meant PCIe Spec.
>> At least spec distinguish fatal and non-fatal
> 
> Yes, I'm well aware of that, however the policy implemented by EEH is
> stricter in that *any* uncorrectable error will trigger an immediate
> freeze of the endpoint in order to prevent bad data propagation.
> 
> However, you don't have to implement it that way for AER. You can
> implement a policy where you don't enforce a reset of the device and
> link unless the driver requests it.
> 
> However if/when the driver does, then you should honor the driver wish
> and do it which isn't currently the case in the AER code.
> 
> Note that the current error callbacks have no way to convey the fatal
> vs. non-fatal information to the device that I can see, we might want
> to change the prototype here with a tree-wide patch if you think that
> drivers might care.
> 
>> Non-fatal errors are uncorrectable errors which cause a particular
>> transaction to be unreliable but the Link is otherwise fully 
>> functional.
>> Isolating Non-fatal from Fatal errors provides Requester/Receiver 
>> logic
>> in a device or system management software the opportunity to recover
>> from the error without resetting the components on the Link and
>> disturbing other transactions in progress.
>> "
>> 
>> Here the basic assumption is link is fully functional, hence we do not
>> initiate link reset. (while in case FATAL we do initiate Secondary Bus
>> Reset)
> 
> See above.
>> 
>> okay, so here is what current pcie_do_nonfatal_recovery() doe.
>> 
>> pcie_do_nonfatal_recovery
>>      report_error_detected()    >> calls driver callbacks
>>      report_mmio_enabled()
>>      report_slot_reset()  >> if PCI_ERS_RESULT_NEED_RESET
> 
> Above if the driver returned "NEED RESET" we should not just "report" a
> slot reset, we should *perform* one :-) Unless the AER code does it in
> a place I missed...

I am willing to work on this if Bjorn agrees,
but I am still trying to figure out the missing piece.

so Ben,
you are suggesting

ERR_NONFATAL handling
pcie_do_nonfatal_recovery
      report_error_detected()    >> calls driver callbacks
       report_mmio_enabled()
       report_slot_reset()  >> if PCI_ERS_RESULT_NEED_RESET
Here, along with calling slot_reset, you are suggesting to do a Secondary 
Bus Reset?

But this is ERR_NONFATAL, and by definition the link is still 
good, so that transactions on the PCIe link can happen.
So my question is: why do we want to reset the link?

Although
I see the following note in the code as well:
                 /*
		 * TODO: Should call platform-specific
		 * functions to reset slot before calling
		 * drivers' slot_reset callbacks?
		 */

Regards,
Oza.

> 
> Also we should do a hot reset at least, not just a link reset.
> 
>>      report_resume()
>> 
>> If you suggest how it is broken, it will help me to understand.
>> probably you might want to point out what are the calls need to be 
>> added
>> or removed or differently handled, specially storage point of view.
> 
> 
>> Regards,
>> Oza.
>> 
>> >
>> > Keep in mind that those callbacks were designed originally for EEH
>> > (which predates AER), and so was the spec written.
>> >
>> > We don't actually use the AER code on POWER today, so we didn't notice
>> > how broken the implementation was :-)
>> >
>> > We should fix that.
>> >
>> > Either we can sort all that out by email, or we should plan some kind
>> > of catch-up, either at Plumbers (provided I can go there) or maybe a
>> > webex call.
>> >
>> > Cheers,
>> > Ben.
Benjamin Herrenschmidt Aug. 16, 2018, 11:27 p.m. UTC | #30
On Thu, 2018-08-16 at 09:46 -0400, Sinan Kaya wrote:
> On 8/16/2018 4:22 AM, poza@codeaurora.org wrote:
> > we can catch up on webex, (Sinan is going to be there in Plumber's 
> > conference, I might not be able to join there, as we have bring-up coming)
> 
> My plumbers conference attendance is not confirmed yet. I'm trying to
> combine it with a business trip. I did not get an ACK yet from my boss.

Ok, I'll collect the names and will propose a time/date early next
week, I'm a bit caught up until monday.

Bjorn, are you available as well ?

Cheers,
Ben.
Benjamin Herrenschmidt Aug. 16, 2018, 11:30 p.m. UTC | #31
On Thu, 2018-08-16 at 19:41 +0530, poza@codeaurora.org wrote:
> 
> > See above.
> > > 
> > > okay, so here is what current pcie_do_nonfatal_recovery() doe.
> > > 
> > > pcie_do_nonfatal_recovery
> > >      report_error_detected()    >> calls driver callbacks
> > >      report_mmio_enabled()
> > >      report_slot_reset()  >> if PCI_ERS_RESULT_NEED_RESET
> > 
> > Above if the driver returned "NEED RESET" we should not just "report" a
> > slot reset, we should *perform* one :-) Unless the AER code does it in
> > a place I missed...
> 
> I am willing work on this if Bjorn agrees.
> but I am still trying to figure out missing piece.
> 
> so Ben,
> you are suggesting
> 
> ERR_NONFATAL handling
> pcie_do_nonfatal_recovery
>       report_error_detected()    >> calls driver callbacks
>        report_mmio_enabled()
>        report_slot_reset()  >> if PCI_ERS_RESULT_NEED_RESET
> Here along with calling slot_reset, you are suggesting to do Secondary 
> Bus Reset ?
> 
> but this is ERR_NONFATAL and according to definition the link is still 
> good, so that the transcriptions on PCIe link can happen.
> so my question is why do we want to reset the link ?

The driver responded to you from error_detected() or mmio_enabled()
that it wants the slot to be reset. So you should do so. This comes
from the driver deciding that whatever happens, it doesn't trust the
device state anymore.

report_slot_reset() iirc is about telling the driver that the reset has
been performed and it can reconfigure the device.

To be frank I haven't looked at this in a while, so we should refer to
the document before you patched it :-) But the basic design we created
back then was precisely that the driver would have the ultimate say in
the matter.

Also because multiple devices, at least on power, can share a domain, 
we can get into a situation where one device requires a reset, so all
will get reset, and their respective drivers need to be notified.

Note: it's not so much about resetting the *link* than about resetting
the *device*.

> although
> I see following note in the code as well.
>                  /*
> 		 * TODO: Should call platform-specific
> 		 * functions to reset slot before calling
> 		 * drivers' slot_reset callbacks?
> 		 */

Yup :-)

Cheers,
Ben.

> Regards,
> Oza.
> 
> > 
> > Also we should do a hot reset at least, not just a link reset.
> > 
> > >      report_resume()
> > > 
> > > If you suggest how it is broken, it will help me to understand.
> > > probably you might want to point out what are the calls need to be 
> > > added
> > > or removed or differently handled, specially storage point of view.
> > 
> > 
> > > Regards,
> > > Oza.
> > > 
> > > > 
> > > > Keep in mind that those callbacks were designed originally for EEH
> > > > (which predates AER), and so was the spec written.
> > > > 
> > > > We don't actually use the AER code on POWER today, so we didn't notice
> > > > how broken the implementation was :-)
> > > > 
> > > > We should fix that.
> > > > 
> > > > Either we can sort all that out by email, or we should plan some kind
> > > > of catch-up, either at Plumbers (provided I can go there) or maybe a
> > > > webex call.
> > > > 
> > > > Cheers,
> > > > Ben.
Oza Pawandeep Aug. 17, 2018, 6:35 a.m. UTC | #32
On 2018-08-17 04:57, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 09:46 -0400, Sinan Kaya wrote:
>> On 8/16/2018 4:22 AM, poza@codeaurora.org wrote:
>> > we can catch up on webex, (Sinan is going to be there in Plumber's
>> > conference, I might not be able to join there, as we have bring-up coming)
>> 
>> My plumbers conference attendance is not confirmed yet. I'm trying to
>> combine it with a business trip. I did not get an ACK yet from my 
>> boss.
> 
> Ok, I'll collect the names and will propose a time/date early next
> week, I'm a bit caught up until monday.
> 
> Bjorn, are you available as well ?

I might be able to make it to Plumbers conference.

Regards,
Oza.

> 
> Cheers,
> Ben.
Oza Pawandeep Aug. 17, 2018, 10:29 a.m. UTC | #33
On 2018-08-17 05:00, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 19:41 +0530, poza@codeaurora.org wrote:
>> 
>> > See above.
>> > >
>> > > okay, so here is what current pcie_do_nonfatal_recovery() doe.
>> > >
>> > > pcie_do_nonfatal_recovery
>> > >      report_error_detected()    >> calls driver callbacks
>> > >      report_mmio_enabled()
>> > >      report_slot_reset()  >> if PCI_ERS_RESULT_NEED_RESET
>> >
>> > Above if the driver returned "NEED RESET" we should not just "report" a
>> > slot reset, we should *perform* one :-) Unless the AER code does it in
>> > a place I missed...
>> 
>> I am willing work on this if Bjorn agrees.
>> but I am still trying to figure out missing piece.
>> 
>> so Ben,
>> you are suggesting
>> 
>> ERR_NONFATAL handling
>> pcie_do_nonfatal_recovery
>>       report_error_detected()    >> calls driver callbacks
>>        report_mmio_enabled()
>>        report_slot_reset()  >> if PCI_ERS_RESULT_NEED_RESET
>> Here along with calling slot_reset, you are suggesting to do Secondary
>> Bus Reset ?
>> 
>> but this is ERR_NONFATAL and according to definition the link is still
>> good, so that the transcriptions on PCIe link can happen.
>> so my question is why do we want to reset the link ?
> 
> The driver responded to you from error_detected() or mmio_enabled()
> that it wants the slot to be reset. So you should do so. This comes
> from the driver deciding that whatever happens, it doesn't trust the
> device state anymore.
> 
> report_slot_reset() iirc is about telling the driver that the reset has
> been performed and it can reconfigure the device.
> 
> To be frank I haven't looked at this in a while, so we should refer to
> the document before ou patched it :-) But the basic design we created
> back then was precisely that the driver would have the ultimate say in
> the matter.

The existing design is the framework dictating to drivers what they 
should do rather than the driver deciding.
Let me explain.

aer_isr
     aer_isr_one_error
          get_device_error_info   >> this checks
               {
                 /* Link is still healthy for IO reads */
                 >> Bjorn, this looks like a very strange thing to me: if link is
                 not healthy we are not even going for pcie_do_fatal_recovery()
		pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS,
			&info->status);
		pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK,
			&info->mask);
		if (!(info->status & ~info->mask))
			return 0;

               }
         handle_error_source
         pcie_do_nonfatal_recovery or pcie_do_fatal_recovery

Now it does not seem to me that the driver has any way of knowing whether 
it has to return PCI_ERS_RESULT_CAN_RECOVER or PCI_ERS_RESULT_NEED_RESET.

Let us take a storage driver as an example, e.g. NVMe:
nvme_error_detected() relies heavily on the pci_channel_state_t which is 
passed by the error framework (err.c).

I understand your point of view and the original intention of the design, 
which lets the driver dictate whether it wants slot_reset.
But the driver relies on the framework to pass that information in order 
to take that decision.

In the case of pci_channel_io_frozen (which is ERR_FATAL), most of the 
drivers demand a link reset.

But in the case of ERR_NONFATAL, the link is supposed to be functional and 
there is no need for a link reset,
and accordingly the PCI-based drivers return 
PCI_ERS_RESULT_CAN_RECOVER, which is valid.
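
For illustration, a sketch of how such a driver's error_detected()
typically maps the channel state (loosely modeled on
nvme_error_detected(), not a verbatim copy of it):

static pci_ers_result_t sketch_error_detected(struct pci_dev *pdev,
					      pci_channel_state_t state)
{
	switch (state) {
	case pci_channel_io_normal:
		/* ERR_NONFATAL: link still usable, try to recover in place */
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		/* ERR_FATAL: quiesce the device and ask for a reset */
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Device is gone for good */
		return PCI_ERS_RESULT_DISCONNECT;
	}
	return PCI_ERS_RESULT_NEED_RESET;
}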


So I think, to conclude, your concerns are directed more towards
pcie_do_fatal_recovery(),


which does

> remove_devices from the downstream link
> reset_link()   (Secondary bus reset)
> re-enumerate.

and this is what DPC defined, and AER is just following that.

Regards,
Oza.

> 
> Also because multiple devices, at least on power, can share a domain,
> we can get into a situation where one device requires a reset, so all
> will get reset, and their respective drivers need to be notified.
> 
> Note: it's not so much about resetting the *link* than about resetting
> the *device*.
> 
>> although
>> I see following note in the code as well.
>>                  /*
>> 		 * TODO: Should call platform-specific
>> 		 * functions to reset slot before calling
>> 		 * drivers' slot_reset callbacks?
>> 		 */
> 
> Yup :-)
> 
> Cheers,
> Ben.
> 
>> Regards,
>> Oza.
>> 
>> >
>> > Also we should do a hot reset at least, not just a link reset.
>> >
>> > >      report_resume()
>> > >
>> > > If you suggest how it is broken, it will help me to understand.
>> > > probably you might want to point out what are the calls need to be
>> > > added
>> > > or removed or differently handled, specially storage point of view.
>> >
>> >
>> > > Regards,
>> > > Oza.
>> > >
>> > > >
>> > > > Keep in mind that those callbacks were designed originally for EEH
>> > > > (which predates AER), and so was the spec written.
>> > > >
>> > > > We don't actually use the AER code on POWER today, so we didn't notice
>> > > > how broken the implementation was :-)
>> > > >
>> > > > We should fix that.
>> > > >
>> > > > Either we can sort all that out by email, or we should plan some kind
>> > > > of catch-up, either at Plumbers (provided I can go there) or maybe a
>> > > > webex call.
>> > > >
>> > > > Cheers,
>> > > > Ben.
Oza Pawandeep Aug. 17, 2018, 10:44 a.m. UTC | #34
On 2018-08-17 15:59, poza@codeaurora.org wrote:
> On 2018-08-17 05:00, Benjamin Herrenschmidt wrote:
>> On Thu, 2018-08-16 at 19:41 +0530, poza@codeaurora.org wrote:
>>> 
>>> > See above.
>>> > >
>>> > > okay, so here is what current pcie_do_nonfatal_recovery() doe.
>>> > >
>>> > > pcie_do_nonfatal_recovery
>>> > >      report_error_detected()    >> calls driver callbacks
>>> > >      report_mmio_enabled()
>>> > >      report_slot_reset()  >> if PCI_ERS_RESULT_NEED_RESET
>>> >
>>> > Above if the driver returned "NEED RESET" we should not just "report" a
>>> > slot reset, we should *perform* one :-) Unless the AER code does it in
>>> > a place I missed...
>>> 
>>> I am willing work on this if Bjorn agrees.
>>> but I am still trying to figure out missing piece.
>>> 
>>> so Ben,
>>> you are suggesting
>>> 
>>> ERR_NONFATAL handling
>>> pcie_do_nonfatal_recovery
>>>       report_error_detected()    >> calls driver callbacks
>>>        report_mmio_enabled()
>>>        report_slot_reset()  >> if PCI_ERS_RESULT_NEED_RESET
>>> Here along with calling slot_reset, you are suggesting to do 
>>> Secondary
>>> Bus Reset ?
>>> 
>>> but this is ERR_NONFATAL and according to definition the link is 
>>> still
>>> good, so that the transcriptions on PCIe link can happen.
>>> so my question is why do we want to reset the link ?
>> 
>> The driver responded to you from error_detected() or mmio_enabled()
>> that it wants the slot to be reset. So you should do so. This comes
>> from the driver deciding that whatever happens, it doesn't trust the
>> device state anymore.
>> 
>> report_slot_reset() iirc is about telling the driver that the reset 
>> has
>> been performed and it can reconfigure the device.
>> 
>> To be frank I haven't looked at this in a while, so we should refer to
>> the document before ou patched it :-) But the basic design we created
>> back then was precisely that the driver would have the ultimate say in
>> the matter.
> 
> The existing design is; framework dictating drivers what it should do
> rather than driver deciding.
> let me explain.
> 
> aer_isr
>     aer_isr_one_error
>          get_device_error_info   >> this checks
>               {
>                 /* Link is still healthy for IO reads */    >> Bjorn
> this looks like a very strange thing to me, if link is not healthy we
> are not even going for pcie_do_fatal_recovery()

I misread the mask as severity; this code is OK, it just does not process 
masked errors.

> 		pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS,
> 			&info->status);
> 		pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK,
> 			&info->mask);
> 		if (!(info->status & ~info->mask))
> 			return 0;
> 
>               {
>         handle_error_source
>         pcie_do_nonfatal_recovery or pcie_do_fatal_recovery
> 
> now it does not seem to me that driver has some way of knowing if it
> has to return PCI_ERS_RESULT_CAN_RECOVER ? or
> PCI_ERS_RESULT_NEED_RESET?
> 
> let us take an example of storage driver here e.g. NVME
> nvme_error_detected()  relies heavily on pci_channel_state_t which is
> passed by error framework  (err.c)
> 
> I understand your point of view and original intention of design that
> let driver dictate whether it wants slot_reset ?
> but driver relies on framework to pass that information in order to
> take decision.
> 
> In case of pci_channel_io_frozen  (which is ERR_FATAL), most of the
> drivers demand link reset.
> 
> but in case of ERR_NONFATAL, the link is supposed to be functional and
> there is no need for link reset.
> and exactly all the PCI based drivers returns
> PCI_ERS_RESULT_CAN_RECOVER, which is valid.
> 
> 
> so I think to conclude, you are referring more of your views towards
> pcie_do_fatal_recovery()
> 
> 
> which does
> 
>> remove_devices from the downstream link
>> reset_link()   (Secondary bus reset)
>> re-enumerate.
> 
> and this is what DPC defined, and AER is just following that.
> 
> Regards,
> Oza.
> 
>> 
>> Also because multiple devices, at least on power, can share a domain,
>> we can get into a situation where one device requires a reset, so all
>> will get reset, and their respective drivers need to be notified.
>> 
>> Note: it's not so much about resetting the *link* than about resetting
>> the *device*.
>> 
>>> although
>>> I see following note in the code as well.
>>>                  /*
>>> 		 * TODO: Should call platform-specific
>>> 		 * functions to reset slot before calling
>>> 		 * drivers' slot_reset callbacks?
>>> 		 */
>> 
>> Yup :-)
>> 
>> Cheers,
>> Ben.
>> 
>>> Regards,
>>> Oza.
>>> 
>>> >
>>> > Also we should do a hot reset at least, not just a link reset.
>>> >
>>> > >      report_resume()
>>> > >
>>> > > If you suggest how it is broken, it will help me to understand.
>>> > > probably you might want to point out what are the calls need to be
>>> > > added
>>> > > or removed or differently handled, specially storage point of view.
>>> >
>>> >
>>> > > Regards,
>>> > > Oza.
>>> > >
>>> > > >
>>> > > > Keep in mind that those callbacks were designed originally for EEH
>>> > > > (which predates AER), and so was the spec written.
>>> > > >
>>> > > > We don't actually use the AER code on POWER today, so we didn't notice
>>> > > > how broken the implementation was :-)
>>> > > >
>>> > > > We should fix that.
>>> > > >
>>> > > > Either we can sort all that out by email, or we should plan some kind
>>> > > > of catch-up, either at Plumbers (provided I can go there) or maybe a
>>> > > > webex call.
>>> > > >
>>> > > > Cheers,
>>> > > > Ben.
Benjamin Herrenschmidt Aug. 18, 2018, 7:38 a.m. UTC | #35
On Fri, 2018-08-17 at 15:59 +0530, poza@codeaurora.org wrote:
> 
> The existing design is; framework dictating drivers what it should do 
> rather than driver deciding.
> let me explain.

The AER code design might be, but it's not what was documented in
Documentation/PCI/pci-error-recovery.txt before you changed it!

And it's not what about 20 years of doing PCI error recovery before
there even was such a thing as PCIe or AER taught us on powerpc.

The basic idea is that the "action" to be taken to recover is the
"deepest" of all the ones requested by the framework and all the
involved drivers (more than one device could be part of an error
handling "domain" under some circumstances, for example when the
failure originates from a bridge).
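
That "deepest wins" merging can be sketched roughly as below (loosely
modeled on the merge_result() helper in drivers/pci/pcie/err.c; an
approximation, not a quote of the actual code):

static pci_ers_result_t sketch_merge_result(pci_ers_result_t orig,
					    pci_ers_result_t new)
{
	if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
		return PCI_ERS_RESULT_NO_AER_DRIVER;
	if (new == PCI_ERS_RESULT_NONE)
		return orig;

	switch (orig) {
	case PCI_ERS_RESULT_CAN_RECOVER:
	case PCI_ERS_RESULT_RECOVERED:
		/* Any stronger request from a driver wins */
		orig = new;
		break;
	case PCI_ERS_RESULT_DISCONNECT:
		if (new == PCI_ERS_RESULT_NEED_RESET)
			orig = PCI_ERS_RESULT_NEED_RESET;
		break;
	default:
		break;
	}
	return orig;
}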

What the PCIe spec says is rather uninteresting and can be trusted, as
far as policy is concerned, about as much as you can trust HW to be
compliant with it, that is, about zilch.

The only interesting thing to take from the spec is that in the case of
a FATAL error, you *must* at least reset the link. That's it. It
doesn't mean that you shouldn't reset more (PERST for example) and it
doesn't mean that you shouldn't reset the device or link for a
NON_FATAL error either. And even if it did we should ignore it.

There are many reasons why a driver might request a reset. For example,
the error, while non-fatal to the link itself, might still have
triggered a non-recoverable event in the device, either in HW or SW,
and the only safe way out might well be a reset.

On POWER systems, with EEH HW support, we have additional rules that
say that *any* non-corrected error (and, under some circumstances,
corrected errors crossing a certain threshold) will isolate the device
completely, and the typical recovery from that is a reset (it doesn't
*have* to be, but that's what most drivers chose).

These rules are designed to prevent propagation of potentially corrupted
data, which is considered way more important than QoS of the device.

We should see if we can make some of the AER and EEH code more common,
the first thing would be to take things out of pcie/ since EEH isn't
PCIe specific, and abstract the recovery process from the underlying
architecture hooks to perform things like reset etc...
 
> aer_isr
>      aer_isr_one_error
>           get_device_error_info   >> this checks
>                {
>                  /* Link is still healthy for IO reads */    >> Bjorn 
> this looks like a very strange thing to me, if link is not healthy we 
> are not even going for pcie_do_fatal_recovery()
> 		pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS,
> 			&info->status);
> 		pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK,
> 			&info->mask);
> 		if (!(info->status & ~info->mask))
> 			return 0;
> 
>                {
>          handle_error_source
>          pcie_do_nonfatal_recovery or pcie_do_fatal_recovery
> 
> now it does not seem to me that driver has some way of knowing if it has 
> to return PCI_ERS_RESULT_CAN_RECOVER ? or PCI_ERS_RESULT_NEED_RESET?

You don't know that. A driver can look at the device state at the
mmio_enabled phase and decide that the device is sufficiently sane that
it can continue... or not. A driver could have a policy of always
resetting. Etc...

> let us take an example of storage driver here e.g. NVME
> nvme_error_detected()  relies heavily on pci_channel_state_t which is 
> passed by error framework  (err.c)

So what ? This is one example... the IPR SCSI driver is much more
complex in its error recovery afaik.

> I understand your point of view and original intention of design that 
> let driver dictate whether it wants slot_reset ?

It's both the driver *and* the framework, which ever wants the
"harshest" solution wins. If either wants a reset we do a reset.

> but driver relies on framework to pass that information in order to take 
> decision.

No. Some drivers might. This is by no means the absolute rule. There
are also all other types of errors that aren't covered by the AER
categories that fit in that framework such as iommu errors (at least
for us on powerpc).

> In case of pci_channel_io_frozen  (which is ERR_FATAL), most of the 
> drivers demand link reset.

No, they demand a *device* reset. ie, hot reset or PERST. It's not
about the link. It's about the device.

> but in case of ERR_NONFATAL, the link is supposed to be functional and 
> there is no need for link reset.

It depends whether the error was corrected or not. If it was not
corrected, then something bad did happen and the framework has no way
to know what is the appropriate remediation for a given device.

> and exactly all the PCI based drivers returns 
> PCI_ERS_RESULT_CAN_RECOVER, which is valid.

You are mixing the AER terminology of FATAL/NON_FATAL which is AER
specific and the framework defined channel states, which are
"functional", "frozen" and "permanently dead". These don't necessarily
match 1:1.
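
(For reference, the three channel states, roughly as defined in
include/linux/pci.h:)

enum pci_channel_state {
	/* I/O channel is in normal state */
	pci_channel_io_normal = (__force pci_channel_state_t) 1,

	/* I/O to channel is blocked */
	pci_channel_io_frozen = (__force pci_channel_state_t) 2,

	/* PCI card is dead */
	pci_channel_io_perm_failure = (__force pci_channel_state_t) 3,
};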

> so I think to conclude, you are referring more of your views towards
> pcie_do_fatal_recovery()

Both. But yes, pcie_do_fatal_recovery() should definitely NOT do an
unplug/replug blindly. The whole point of the error handlers was to
allow drivers to recover from these things without losing the links to
their respective consumers (such as filesystems for block devices).

> which does
> 
> > remove_devices from the downstream link
> > reset_link()   (Secondary bus reset)
> > re-enumerate.
> 
> and this is what DPC defined, and AER is just following that.

And is completely and utterly broken and useless as a policy.

Again, somebody cared too much about the spec rather than what makes
actual sense in practice.

You should *only* remove devices if the driver doesn't have the
necessary callbacks.

In fact, removing and re-plugging the device is probably the worst thing
you can do. I don't remember all the details, but it's been causing us
endless issues with EEH, which is why, as I mentioned, we are looking
more toward an unbind and rebind of the driver, but that's a detail.

If the driver has the error callbacks it should participate in the
recovery and NOT be unbound or the device unplugged. That's completely
useless for Linux to do that since it means you cannot recover from an
error on a storage driver.

Ben.

> Regards,
> Oza.
> 
> > 
> > Also because multiple devices, at least on power, can share a domain,
> > we can get into a situation where one device requires a reset, so all
> > will get reset, and their respective drivers need to be notified.
> > 
> > Note: it's not so much about resetting the *link* than about resetting
> > the *device*.
> > 
> > > although
> > > I see following note in the code as well.
> > >                  /*
> > > 		 * TODO: Should call platform-specific
> > > 		 * functions to reset slot before calling
> > > 		 * drivers' slot_reset callbacks?
> > > 		 */
> > 
> > Yup :-)
> > 
> > Cheers,
> > Ben.
> > 
> > > Regards,
> > > Oza.
> > > 
> > > > 
> > > > Also we should do a hot reset at least, not just a link reset.
> > > > 
> > > > >      report_resume()
> > > > > 
> > > > > If you suggest how it is broken, it will help me to understand.
> > > > > probably you might want to point out what are the calls need to be
> > > > > added
> > > > > or removed or differently handled, specially storage point of view.
> > > > 
> > > > 
> > > > > Regards,
> > > > > Oza.
> > > > > 
> > > > > > 
> > > > > > Keep in mind that those callbacks were designed originally for EEH
> > > > > > (which predates AER), and so was the spec written.
> > > > > > 
> > > > > > We don't actually use the AER code on POWER today, so we didn't notice
> > > > > > how broken the implementation was :-)
> > > > > > 
> > > > > > We should fix that.
> > > > > > 
> > > > > > Either we can sort all that out by email, or we should plan some kind
> > > > > > of catch-up, either at Plumbers (provided I can go there) or maybe a
> > > > > > webex call.
> > > > > > 
> > > > > > Cheers,
> > > > > > Ben.
Bjorn Helgaas Aug. 19, 2018, 2:19 a.m. UTC | #36
On Thu, Aug 16, 2018 at 05:05:30PM +1000, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 16:51 +1000, Benjamin Herrenschmidt wrote:
> > No, this is wrong and not the intent of the error handling.
> > 
> > You seem to be applying PCIe specific concepts brain-farted at Intel
> > that are way way away from what we care about in practice and in Linux.
> > 
> > > e.g.  some driver handle errors ERR_NONFATAL or FATAL in similar ways
> > > e.g.
> > > ioat_pcie_error_detected();  calls  ioat_shutdown(); in case of 
> > > ERR_NONFATAL
> > > otherwise ioat_shutdown() in case of ERR_FATAL.
> > 
> > Since when the error handling callbacks even have the concept of FATAL
> > vs. non-fatal ? This doesn't appear anywhere in the prototype of the
> > struct pci_error_handlers and shouldn't.
> 
> Ugh... I just saw the changes you did to Documentation/PCI/pci-error-
> recovery.txt and I would very much like to revert those !
> 
> Bjorn, you shouldn't let changes to the PCI error handling through
> without acks from us, it looks like we didn't notice (we can't possibly
> monitor all lists).

Sorry, they were certainly very visible on linux-pci, but I should
have added you to cc if you weren't there already.  Please update
MAINTAINERS if it's incomplete or out of date (I can't possibly know
who might be interested in every change).

> Bjorn, please revert all of those changes.

Please send the appropriate patches and we'll go from there.
Bjorn Helgaas Aug. 19, 2018, 2:24 a.m. UTC | #37
On Fri, Aug 17, 2018 at 09:27:03AM +1000, Benjamin Herrenschmidt wrote:
> On Thu, 2018-08-16 at 09:46 -0400, Sinan Kaya wrote:
> > On 8/16/2018 4:22 AM, poza@codeaurora.org wrote:
> > > we can catch up on webex, (Sinan is going to be there in Plumber's 
> > > conference, I might not be able to join there, as we have bring-up coming)
> > 
> > My plumbers conference attendance is not confirmed yet. I'm trying to
> > combine it with a business trip. I did not get an ACK yet from my boss.
> 
> Ok, I'll collect the names and will propose a time/date early next
> week, I'm a bit caught up until monday.
> 
> Bjorn, are you available as well ?

I'm in CDT (UTC-5).  Possible times for me are 6am-10pm CDT.
Sinan Kaya Aug. 19, 2018, 9:41 p.m. UTC | #38
On 8/18/2018 10:19 PM, Bjorn Helgaas wrote:
>> Bjorn, please revert all of those changes.
> Please send the appropriate patches and we'll go from there.
> 

I'm also catching up on this thread.

I don't think revert is the way to go. There is certainly value in Oza's
code to make error handling common.

We started by following the existing error handling scheme and then
moved on to the stop/remove behavior based on Bjorn's feedback.

The right thing is for Oza to rework the code to go back to original
error handling callback mechanism. That should be a trivial change.
Benjamin Herrenschmidt Aug. 20, 2018, 2:03 a.m. UTC | #39
On Sun, 2018-08-19 at 17:41 -0400, Sinan Kaya wrote:
> On 8/18/2018 10:19 PM, Bjorn Helgaas wrote:
> > > Bjorn, please revert all of those changes.
> > 
> > Please send the appropriate patches and we'll go from there.
> > 
> 
> I'm also catching up on this thread.
> 
> I don't think revert is the way to go. There is certainly value in Oza's
> code to make error handling common.

The revert of the Documentation change must happen though. It's
completely wrong. The documentation documents what EEH implements so by
making it match what I argue is a broken implementation in AER, you are
in fact breaking us.

> We started by following the existing error handling scheme and then
> moved onto the stop/remove behavior based on Bjorn's feedback.

Which is utterly wrong.

> The right thing is for Oza to rework the code to go back to original
> error handling callback mechanism. That should be a trivial change.

At this stage I'm only asking to revert the documentation update.
I'll send a patch to that effect.

As for figuring out where to go from there, I agree we should discuss
this further, I would love to be able to make more of the code common
with EEH as well.

Cheers,
Ben.
Benjamin Herrenschmidt Aug. 20, 2018, 4:37 a.m. UTC | #40
On Sat, 2018-08-18 at 21:19 -0500, Bjorn Helgaas wrote:
> 
> > Ugh... I just saw the changes you did to Documentation/PCI/pci-error-
> > recovery.txt and I would very much like to revert those !
> > 
> > Bjorn, you shouldn't let changes to the PCI error handling through
> > without acks from us, it looks like we didn't notice (we can't possibly
> > monitor all lists).
> 
> Sorry, they were certainly very visible on linux-pci, but I should
> have added you to cc if you weren't there already.  Please update
> MAINTAINERS if it's incomplete or out of date (I can't possibly know
> who might be interested in every change).

Right but we designed the error handling (we = powerpc). Due to churn
with people I think nobody's been actively monitoring linux-pci and I
was myself caught up with a bunch of other things.

> > Bjorn, please revert all of those changes.
> 
> Please send the appropriate patches and we'll go from there.
Oza Pawandeep Aug. 20, 2018, 5:09 a.m. UTC | #41
On 2018-08-19 07:54, Bjorn Helgaas wrote:
> On Fri, Aug 17, 2018 at 09:27:03AM +1000, Benjamin Herrenschmidt wrote:
>> On Thu, 2018-08-16 at 09:46 -0400, Sinan Kaya wrote:
>> > On 8/16/2018 4:22 AM, poza@codeaurora.org wrote:
>> > > we can catch up on webex, (Sinan is going to be there in Plumber's
>> > > conference, I might not be able to join there, as we have bring-up coming)
>> >
>> > My plumbers conference attendance is not confirmed yet. I'm trying to
>> > combine it with a business trip. I did not get an ACK yet from my boss.
>> 
>> Ok, I'll collect the names and will propose a time/date early next
>> week, I'm a bit caught up until monday.
>> 
>> Bjorn, are you available as well ?
> 
> I'm in CDT (UTC-5).  Possible times for me are 6am-10pm CDT.

I am in India timezone.

Regards,
Oza.
Benjamin Herrenschmidt Aug. 20, 2018, 5:15 a.m. UTC | #42
On Mon, 2018-08-20 at 10:39 +0530, poza@codeaurora.org wrote:
> On 2018-08-19 07:54, Bjorn Helgaas wrote:
> > On Fri, Aug 17, 2018 at 09:27:03AM +1000, Benjamin Herrenschmidt wrote:
> > > On Thu, 2018-08-16 at 09:46 -0400, Sinan Kaya wrote:
> > > > On 8/16/2018 4:22 AM, poza@codeaurora.org wrote:
> > > > > we can catch up on webex, (Sinan is going to be there in Plumber's
> > > > > conference, I might not be able to join there, as we have bring-up coming)
> > > > 
> > > > My plumbers conference attendance is not confirmed yet. I'm trying to
> > > > combine it with a business trip. I did not get an ACK yet from my boss.
> > > 
> > > Ok, I'll collect the names and will propose a time/date early next
> > > week, I'm a bit caught up until monday.
> > > 
> > > Bjorn, are you available as well ?
> > 
> > I'm in CDT (UTC-5).  Possible times for me are 6am-10pm CDT.
> 
> I am in India timezone.

Hrm ... this is going to be difficult to reconcile. Let's first see
what we can do by email this week, shall we?

Cheers,
Ben.
Oza Pawandeep Aug. 20, 2018, 5:19 a.m. UTC | #43
On 2018-08-20 07:33, Benjamin Herrenschmidt wrote:
> On Sun, 2018-08-19 at 17:41 -0400, Sinan Kaya wrote:
>> On 8/18/2018 10:19 PM, Bjorn Helgaas wrote:
>> > > Bjorn, please revert all of those changes.
>> >
>> > Please send the appropriate patches and we'll go from there.
>> >
>> 
>> I'm also catching up on this thread.
>> 
>> I don't think revert is the way to go. There is certainly value in 
>> Oza's
>> code to make error handling common.
> 
> The revert of the Documentation change must happen though. It's
> completely wrong. The documentation documents what EEH implements so by
> making it match what I argue is a broken implementation in AER, you are
> in fact breaking us.
> 
>> We started by following the existing error handling scheme and then
>> moved onto the stop/remove behavior based on Bjorn's feedback.
> 
> Which is utterly wrong.
> 
>> The right thing is for Oza to rework the code to go back to original
>> error handling callback mechanism. That should be a trivial change.
> 
> At this stage I'm only asking to revert the documentation update.
> I'll send a patch to that effect.
> 
> As for figuring out where to go from there, I agree we should discuss
> this further, I would love to be able to make more of the code common
> with EEH as well.
> 
> Cheers,
> Ben.


Reverting the spec/Documentation change is fine by me.

But the good thing that has happened now is that we can have a very clear
definition for the framework going forward, e.g. how the errors have to
be handled.

Because of those patches, the whole error framework is under a common code
base and has become independent of the service, e.g. AER, DPC etc.
That enables us to define or extend policies in a more clearly defined way,
irrespective of which services are running.

Now we just have to change err.c and walk away with the policies we want
to enforce.

Let me know how this sounds, Ben.

Regards,
Oza.
Benjamin Herrenschmidt Aug. 20, 2018, 5:33 a.m. UTC | #44
On Mon, 2018-08-20 at 10:49 +0530, poza@codeaurora.org wrote:
> 
> Reverting spec/Documentation which is fine by me.
> 
> But the good thing has happened now is; we can have very clear 
> definition for the framework to go forward.
> e.g. how the errors have to be handled.
> 
> Because of those patches, the whole error framework is under common code 
> base and now has become independent of service e.g. AER, DPC etc..

Well, EEH isn't yet :-) But then the EEH code is a real mess buried in
arch/powerpc. Sam (CC) is trying to improve that situation and I might
step in as well to help if we think we can make things more common, it
would definitely help.

> That enables us to define or extend policies in more clearly defined way 
> irrespective of what services are running.
> 
> Now it is just that we have to change in err.c and walk away with the 
> policies what we want to enforce.
> 
> let me know how this sounds Ben.

So for now, I've sent a revert patch for the Documentation/ bit to
Bjorn, and I have no (not yet at least) beef in what you do in
drivers/pci/* ... however, that said, I think it would be great to move
EEH toward having a bulk of the policy use common code as well.

It will be a long road, in part due to the historical crappiness of our
EEH code, so my thinking is we should:

 - First agree on what we want the policy to be. I need to read a bit
more about DPC since that's new to me, it seems to be similar to what
our EEH does, with slightly less granularity (we can freeze access to
individual functions for example).

 - Rework err.c to implement that policy with the existing AER and DPC
code.

 - Figure out what hooks might be needed to be able to plumb EEH into
it, possibly removing a bunch of crap in arch/powerpc (yay !)

I don't think having a webex will be that practical with the timezones
involved. I'm trying to get approval to go to Plumbers in which case we
could setup a BOF but I have no guarantee at this point that I can make
it happen.

So let's try using email as much possible for now.

Cheers,
Ben.
Oza Pawandeep Aug. 20, 2018, 7:56 a.m. UTC | #45
On 2018-08-20 11:03, Benjamin Herrenschmidt wrote:
> On Mon, 2018-08-20 at 10:49 +0530, poza@codeaurora.org wrote:
>> 
>> Reverting spec/Documentation which is fine by me.
>> 
>> But the good thing has happened now is; we can have very clear
>> definition for the framework to go forward.
>> e.g. how the errors have to be handled.
>> 
>> Because of those patches, the whole error framework is under common 
>> code
>> base and now has become independent of service e.g. AER, DPC etc..
> 
> Well, EEH isn't yet :-) But then the EEH code is a real mess buried in
> arch/powerpc. Sam (CC) is trying to improve that situation and I might
> step in as well to help if we think we can make things more common, it
> would definitely help.
> 
>> That enables us to define or extend policies in more clearly defined 
>> way
>> irrespective of what services are running.
>> 
>> Now it is just that we have to change in err.c and walk away with the
>> policies what we want to enforce.
>> 
>> let me know how this sounds Ben.
> 
> So for now, I've sent a revert patch for the Documentation/ bit to
> Bjorn, and I have no (not yet at least) beef in what you do in
> drivers/pci/* ... however, that said, I think it would be great to move
> EEH toward having a bulk of the policy use common code as well.
> 

In my opinion, the revert patch can wait, because we might need to
change it again according to the improvements we bring.

> It will be long road, in part due to the historical crappyness of our
> EEH code, so my thinking is we should:
> 
>  - First agree on what we want the policy to be. I need to read a bit
> more about DPC since that's new to me, it seems to be similar to what
> our EEH does, with slighty less granularity (we can freeze access to
> individual functions for example).
> 
>  - Rework err.c to implement that policy with the existing AER and DPC
> code.
> 

I went through eeh-pci-error-recovery.txt.
If I pick up the following:

"
If the OS or device driver suspects that a PCI slot has been 
EEH-isolated, there is a firmware call it can make to determine if this 
is the case. If so, then the device driver should put itself into a 
consistent state (given that it won't be able to complete any pending 
work) and start recovery of the card.  Recovery normally would consist 
of resetting the PCI device (holding the PCI #RST line high for two 
seconds), followed by setting up the device config space (the base 
address registers (BAR's), latency timer, cache line size, interrupt 
line, and so on).  This is followed by a reinitialization of the device 
driver.  In a worst-case scenario, the power to the card can be toggled, 
at least on hot-plug-capable slots.  In principle, layers far above the 
device driver probably do not need to know that the PCI card has been 
"rebooted" in this way; ideally, there should be at most a pause in 
Ethernet/disk/USB I/O while the card is being reset.

If the card cannot be recovered after three or four resets, the 
kernel/device driver should assume the worst-case scenario, that the 
card has died completely, and report this error to the sysadmin. In 
addition, error messages are reported through RTAS and also through 
syslogd (/var/log/messages) to alert the sysadmin of PCI resets. The 
correct way to deal with failed adapters is to use the standard PCI 
hotplug tools to remove and replace the dead card.
"

Some differences I find:
The current framework does not attempt recovery 3-4 times.

Besides, if I grasp eeh-pci-error-recovery.txt correctly (although I
could easily be wrong), the main difference is in how we reset the device.

Linux uses SBR, while EEH seems to be using #PERST.
The PERST signal implementation might be board specific, e.g. we have an
io-port-expander sitting on i2c to drive #PERST.
We rely on SBR, and SBR should also be a good way to hot reset the device:
"All Lanes in the configured Link transmit TS1 Ordered Sets with the Hot
Reset bit 15 asserted and the configured Link and Lane numbers."
Although I am not sure whether there are any subtle differences between
#PERST and SBR, such as some sticky bits not being affected by SBR,
so far SBR has served us well.
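
For illustration, the SBR I mean here is just a toggle of the Secondary
Bus Reset bit in the bridge's Bridge Control register; a minimal sketch
(not the actual kernel implementation, and the delays are only
illustrative):

#include <linux/delay.h>
#include <linux/pci.h>

/* Sketch only: Secondary Bus Reset by toggling the Bridge Control bit. */
static void example_secondary_bus_reset(struct pci_dev *bridge)
{
	u16 ctrl;

	pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, &ctrl);
	pci_write_config_word(bridge, PCI_BRIDGE_CONTROL,
			      ctrl | PCI_BRIDGE_CTL_BUS_RESET);
	msleep(2);		/* keep the reset asserted briefly */

	pci_write_config_word(bridge, PCI_BRIDGE_CONTROL, ctrl);
	msleep(1000);		/* give the device time to come back up */
}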

Also I see that eeh-pci-error-recovery.txt does not seem to distinguish
between fatal and nonfatal errors; is the behavior the same for both types
of errors in EEH?

I would like to mention that there should not be any need to reset
the card in case of NONFATAL, because by definition the link is functional
and only the particular transaction had a problem. So I am not sure how
EEH deals with ERR_NONFATAL.

Also I am very much interested in knowing the original intention of the
DPC driver to unplug/plug devices. All I remember from some conversation
was: a hotplug-capable bridge might have seen the devices changed, so it
is safer to remove/unplug the devices, during which the driver's .shutdown
methods are called, in case of ERR_FATAL.
Although DPC is HW recovery while AER is SW recovery, both should
fundamentally act in the same way as far as device driver callbacks are
concerned.
(Again, I really don't know the real motivation behind this.)


Regards,
Oza.

>  - Figure out what hooks might be needed to be able to plumb EEH into
> it, possibly removing a bunch of crap in arch/powerpc (yay !)
> 
> I don't think having a webex will be that practical with the timezones
> involved. I'm trying to get approval to go to Plumbers in which case we
> could setup a BOF but I have no guarantee at this point that I can make
> it happen.
> 
> So let's try using email as much possible for now.
> 
> Cheers,
> Ben.
Benjamin Herrenschmidt Aug. 20, 2018, 11:22 a.m. UTC | #46
On Mon, 2018-08-20 at 13:26 +0530, poza@codeaurora.org wrote:
>  now, I've sent a revert patch for the Documentation/ bit to
> > Bjorn, and I have no (not yet at least) beef in what you do in
> > drivers/pci/* ... however, that said, I think it would be great to move
> > EEH toward having a bulk of the policy use common code as well.
> > 
> 
> In my opinion, I think revert patch can wait, because again we might 
> need to change it according to improvements we bring.

I doubt it will differ much from what's there ... anyway...

> > It will be long road, in part due to the historical crappyness of our
> > EEH code, so my thinking is we should:
> > 
> >  - First agree on what we want the policy to be. I need to read a bit
> > more about DPC since that's new to me, it seems to be similar to what
> > our EEH does, with slighty less granularity (we can freeze access to
> > individual functions for example).
> > 
> >  - Rework err.c to implement that policy with the existing AER and DPC
> > code.
> > 
> 
> I went through eeh-pci-error-recovery.txt
> If I pick up folowing
> 
> "
> If the OS or device driver suspects that a PCI slot has been 
> EEH-isolated, there is a firmware call it can make to determine if this 
> is the case. If so, then the device driver should put itself into a 
> consistent state (given that it won't be able to complete any pending 
> work) and start recovery of the card.  Recovery normally would consist 
> of resetting the PCI device (holding the PCI #RST line high for two 
> seconds), followed by setting up the device config space (the base 
> address registers (BAR's), latency timer, cache line size, interrupt 
> line, and so on).  This is followed by a reinitialization of the device 
> driver.  In a worst-case scenario, the power to the card can be toggled, 
> at least on hot-plug-capable slots.  In principle, layers far above the 
> device driver probably do not need to know that the PCI card has been 
> "rebooted" in this way; ideally, there should be at most a pause in 
> Ethernet/disk/USB I/O while the card is being reset.

This verbiage is a bit old :-) A bunch of it predates PCIe.

> If the card cannot be recovered after three or four resets, the 
> kernel/device driver should assume the worst-case scenario, that the 
> card has died completely, and report this error to the sysadmin. In 
> addition, error messages are reported through RTAS and also through 
> syslogd (/var/log/messages) to alert the sysadmin of PCI resets. The 
> correct way to deal with failed adapters is to use the standard PCI 
> hotplug tools to remove and replace the dead card.
> "
> 
> some differences: I find is:
> Current framework does not attempt recovery 3-4 times.
> 
> Besides If I grasp eeh-pci-error-recovery.txt correctly, (although I 
> could be easily wrong),
> mainly the difference is coming how we reset the device.

See below

> Linux uses SBR, while EEH seems to be using #PERST.
> PERST signal implementation might be board specific, e.g. we have 
> io-port-expander sitting on i2c to drive #PERST.
> we rely on SBR. and SBR also should be a good way to bring hotreset of 
> device.
> "All Lanes in the configured Link transmit TS1 Ordered Sets with the Hot 
> Reset bit 15 asserted and the configured Link and Lane numbers."
> although I am not sure between #PERST and SBR is there is any subtle 
> differences such as some sticky bits might not be affected by SBR.
> but so far SBR has served well for us.
> 
> Also I see that eeh-pci-error-recovery.txt does not seem to distinguish 
> between fatal and nonfatal errors, is the behavior same for both type of 
> errors in EEH ?

Yeah so...

> I would like to mention is that there should nto be any need to reset 
> the card in case of NONFATAL because by definition link is functional, 
> only the particular transaction
> had a problem. so not sure how EEH deals with ERR_NONFATAL.

As I believe I said earlier... NONFATAL means the link is still usable.
It doesn't mean *anything* as far as the state of the device itself is
concerned. The device microcode could have crashed for example etc...

That is why it's important to honor a driver requesting a reset.

Ignoring the verbiage in the documents for a minute, think of the big
picture... the idea is that on error, there could be many actors
involved in the recovery. The bridge, the link itself (NONFATAL vs.
FATAL matters here), the device, the driver .... 

So the design is that it's always permitted to perform a "deeper"
action than strictly necessary. IE: If the link is ok, the device might
still require a reset. If the device is ok, the platform might still
enforce a reset.... 

There can be multiple devices or drivers involved in a recovery
operation (for example if the error came from a switch).

The difference between FATAL and NON_FATAL is PCIe specific and only
really matters from the perspective that if it's FATAL the core will
require a reset. It doesn't mean some other agent can't request one too
when NON_FATAL errors occur.
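
To put that in Linux terms, a rough sketch (not a quote of err.c) of how
the severity typically surfaces to drivers as the channel state:

#include <linux/pci.h>

/* Sketch only: ERR_FATAL freezes the channel, ERR_NONFATAL leaves it
 * usable; the driver sees this as the state argument of error_detected().
 */
static enum pci_channel_state example_channel_state(bool fatal)
{
	return fatal ? pci_channel_io_frozen : pci_channel_io_normal;
}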

As for EEH, *any* error that isn't purely informational results in the
adapter being isolated physically by some dedicated HW. It's possible
to selectively un-isolate but the policy has generally been to just go
through the reset sequence as it was generally deemed safer.

It's hard enough to properly test and validate the recovery path; it's
almost impossible to test and verify every way the HW can recover from
all types of non-fatal errors.

Thus, enforcing that on any error we simply reset the adapter provides
a slower but much more robust recovery mechanism that is also a lot
more testable.

> Also I am very much interested in knowing original intention of DPC 
> driver to unplug/plug devices,
> all I remember in some conversation was:
> hotplug capable bridge might have see devices changed, so it is safer to 
> remove/unplug the devices and during which .shutdown methods of driver 
> is called, in case of ERR_FATAL.

The device being "swapped" during an error recovery operation is ...
unlikely. Do the bridges have a way to latch that the presence detect
changed ?

> although DPC is HW recovery while AER is sw recovery both should 
> fundamentally act in the same way as far as device drivers callbacks are 
> concerned.
> (again I really dont know real motivation behind this)

The main problem with unplug/replug (as I mentioned earlier) is that it
just does NOT work for storage controllers (or similar types of
devices). The links between the storage controller and the mounted
filesystems are lost permanently; you'll most likely have to reboot the
machine.

With our current EEH implementation we can successfully recover from
fatal errors with the storage controller by resetting it.

Finally as for PERST vs. Hot Reset, this is an implementation detail.
Not all our platforms can control PERST on all devices either, we
generally prefer PERST as it provides a more reliable reset mechanism
in practice (talking from experience) but will fall back to hot reset.

Cheers,
Ben.

> 
> Regards,
> Oza.
> 
> >  - Figure out what hooks might be needed to be able to plumb EEH into
> > it, possibly removing a bunch of crap in arch/powerpc (yay !)
> > 
> > I don't think having a webex will be that practical with the timezones
> > involved. I'm trying to get approval to go to Plumbers in which case we
> > could setup a BOF but I have no guarantee at this point that I can make
> > it happen.
> > 
> > So let's try using email as much possible for now.
> > 
> > Cheers,
> > Ben.
Thomas Tai Aug. 20, 2018, 1:02 p.m. UTC | #47
On 08/20/2018 01:15 AM, Benjamin Herrenschmidt wrote:
> On Mon, 2018-08-20 at 10:39 +0530, poza@codeaurora.org wrote:
>> On 2018-08-19 07:54, Bjorn Helgaas wrote:
>>> On Fri, Aug 17, 2018 at 09:27:03AM +1000, Benjamin Herrenschmidt wrote:
>>>> On Thu, 2018-08-16 at 09:46 -0400, Sinan Kaya wrote:
>>>>> On 8/16/2018 4:22 AM, poza@codeaurora.org wrote:
>>>>>> we can catch up on webex, (Sinan is going to be there in Plumber's
>>>>>> conference, I might not be able to join there, as we have bring-up coming)
>>>>>
>>>>> My plumbers conference attendance is not confirmed yet. I'm trying to
>>>>> combine it with a business trip. I did not get an ACK yet from my boss.
>>>>
>>>> Ok, I'll collect the names and will propose a time/date early next
>>>> week, I'm a bit caught up until monday.
>>>>
>>>> Bjorn, are you available as well ?
>>>
>>> I'm in CDT (UTC-5).  Possible times for me are 6am-10pm CDT.
>>
>> I am in India timezone.
> 
> Hrm ... this is going to be difficult to reconcile. Let's first see
> what we can do by email this week shall we ?

OK, I will catch up on email to see the details. I suppose I should hold
off on the original "prevent pcie_do_fatal_recovery from using device
after it is removed" fix, right?

Thank you,
Thomas

> 
> Cheers,
> Ben.
> 
>
Oza Pawandeep Aug. 20, 2018, 1:26 p.m. UTC | #48
On 2018-08-20 16:52, Benjamin Herrenschmidt wrote:
> On Mon, 2018-08-20 at 13:26 +0530, poza@codeaurora.org wrote:
>>  now, I've sent a revert patch for the Documentation/ bit to
>> > Bjorn, and I have no (not yet at least) beef in what you do in
>> > drivers/pci/* ... however, that said, I think it would be great to move
>> > EEH toward having a bulk of the policy use common code as well.
>> >
>> 
>> In my opinion, I think revert patch can wait, because again we might
>> need to change it according to improvements we bring.
> 
> I doubt it will differ much from what's there ... anyway...
> 
>> > It will be long road, in part due to the historical crappyness of our
>> > EEH code, so my thinking is we should:
>> >
>> >  - First agree on what we want the policy to be. I need to read a bit
>> > more about DPC since that's new to me, it seems to be similar to what
>> > our EEH does, with slighty less granularity (we can freeze access to
>> > individual functions for example).
>> >
>> >  - Rework err.c to implement that policy with the existing AER and DPC
>> > code.
>> >
>> 
>> I went through eeh-pci-error-recovery.txt
>> If I pick up folowing
>> 
>> "
>> If the OS or device driver suspects that a PCI slot has been
>> EEH-isolated, there is a firmware call it can make to determine if 
>> this
>> is the case. If so, then the device driver should put itself into a
>> consistent state (given that it won't be able to complete any pending
>> work) and start recovery of the card.  Recovery normally would consist
>> of resetting the PCI device (holding the PCI #RST line high for two
>> seconds), followed by setting up the device config space (the base
>> address registers (BAR's), latency timer, cache line size, interrupt
>> line, and so on).  This is followed by a reinitialization of the 
>> device
>> driver.  In a worst-case scenario, the power to the card can be 
>> toggled,
>> at least on hot-plug-capable slots.  In principle, layers far above 
>> the
>> device driver probably do not need to know that the PCI card has been
>> "rebooted" in this way; ideally, there should be at most a pause in
>> Ethernet/disk/USB I/O while the card is being reset.
> 
> This verbiage is a bit old :-) A bunch of it predates PCIe.
> 
>> If the card cannot be recovered after three or four resets, the
>> kernel/device driver should assume the worst-case scenario, that the
>> card has died completely, and report this error to the sysadmin. In
>> addition, error messages are reported through RTAS and also through
>> syslogd (/var/log/messages) to alert the sysadmin of PCI resets. The
>> correct way to deal with failed adapters is to use the standard PCI
>> hotplug tools to remove and replace the dead card.
>> "
>> 
>> some differences: I find is:
>> Current framework does not attempt recovery 3-4 times.
>> 
>> Besides If I grasp eeh-pci-error-recovery.txt correctly, (although I
>> could be easily wrong),
>> mainly the difference is coming how we reset the device.
> 
> See below
> 
>> Linux uses SBR, while EEH seems to be using #PERST.
>> PERST signal implementation might be board specific, e.g. we have
>> io-port-expander sitting on i2c to drive #PERST.
>> we rely on SBR. and SBR also should be a good way to bring hotreset of
>> device.
>> "All Lanes in the configured Link transmit TS1 Ordered Sets with the 
>> Hot
>> Reset bit 15 asserted and the configured Link and Lane numbers."
>> although I am not sure between #PERST and SBR is there is any subtle
>> differences such as some sticky bits might not be affected by SBR.
>> but so far SBR has served well for us.
>> 
>> Also I see that eeh-pci-error-recovery.txt does not seem to 
>> distinguish
>> between fatal and nonfatal errors, is the behavior same for both type 
>> of
>> errors in EEH ?
> 
> Yeah so...
> 
>> I would like to mention is that there should nto be any need to reset
>> the card in case of NONFATAL because by definition link is functional,
>> only the particular transaction
>> had a problem. so not sure how EEH deals with ERR_NONFATAL.
> 
> As I believe I said earlier... NONFATAL means the link is still usable.
> It doesn't mean *anything* as far as the state of the device itself is
> concerned. The device microcode could have crashed for example etc...
> 
> That is why it's important to honor a driver requesting a reset.
> 
> Ignoring the verbiage in the documents for a minute, think of the big
> picture... the idea is that on error, there could be many actors
> involved in the recovery. The brigde, the link itself (NONFATAL vs.
> FATAL matters here), the device, the driver ....
> 
> So the design is that it's always permitted to perform a "deeper"
> action than strictly necessary. IE: If the link is ok, the device might
> still require a reset. If the device is ok, the platform might still
> enforce a reset....
> 
> There can be multiple devices or drivers involved in a recovery
> operation (for example if the error came from a switch).
> 
> The difference between FATAL and NON_FATAL is PCIe specific and only
> really matters from the perspective that if it's FATAL the core will
> require a reset. It doens't mean some other agent can't request one too
> when NON_FATAL errors occur.
> 
> As for EEH, *any* error that isn't purely informational results in the
> adapter being isolated physcially by some dedicated HW. It's possible
> to selectively un-isolate but the policy has generally been to just go
> through the reset sequence as it was generally deemed safer.
> 
> It's hard enough to properly test and validate recovery path, it's
> almost impossible to test and verify every way the HW can recover from
> a all type of non-fatal errors.
> 
> Thus by enforcing that on any error we simply reset the adapter provide
> a slower but much more robust recovery mechanism that is also a lot
> more testable.

Hi Ben,

I get the idea and your points, and when I started the implementation of
this, I did ask these questions myself.
But then we all aligned with what DPC was doing, because that's how
the driver was dealing with ERR_FATAL.

I can put together a patch adhering to the idea and modify err.c;
let me know if you are okay with that.
I have both DPC and AER capable root ports, so I can easily validate the
changes as we improve upon this.

Besides, we have to come to some common ground on policy:
1) if the device driver requests the reset, we should agree to do SBR
(irrespective of the type of error);
2) under what circumstances the framework will impose the reset, even if
the device driver did not!

The changes will probably be fairly simple, although they will require
exercising some test cases for both DPC and AER.
Let me know if there is anything I missed here, but let me attempt to make
the patch and we can go from there.

Let me know your opinion.

Regards,
Oza.

> 
>> Also I am very much interested in knowing original intention of DPC
>> driver to unplug/plug devices,
>> all I remember in some conversation was:
>> hotplug capable bridge might have see devices changed, so it is safer 
>> to
>> remove/unplug the devices and during which .shutdown methods of driver
>> is called, in case of ERR_FATAL.
> 
> The device being "swapped" during an error recovery operation is ...
> unlikely. Do the bridges have a way to latch that the presence detect
> changed ?
> 
>> although DPC is HW recovery while AER is sw recovery both should
>> fundamentally act in the same way as far as device drivers callbacks 
>> are
>> concerned.
>> (again I really dont know real motivation behind this)
> 
> The main problem with unplug/replug (as I mentioned earlier) is that it
> just does NOT work for storage controllers (or similar type of
> devices). The links between the storage controller and the mounted
> filesystems is lost permanently, you'll most likely have to reboot the
> machine.
> 
> With our current EEH implementation we can successfully recover from
> fatal errors with the storage controller by resetting it.
> 
> Finally as for PERST vs. Hot Reset, this is an implementation detail.
> Not all our platforms can control PERST on all devices either, we
> generally prefer PERST as it provides a more reliable reset mechanism
> in practice (talking from experience) but will fallback to hot reset.
> 
> Cheers,
> Ben.
> 
>> 
>> Regards,
>> Oza.
>> 
>> >  - Figure out what hooks might be needed to be able to plumb EEH into
>> > it, possibly removing a bunch of crap in arch/powerpc (yay !)
>> >
>> > I don't think having a webex will be that practical with the timezones
>> > involved. I'm trying to get approval to go to Plumbers in which case we
>> > could setup a BOF but I have no guarantee at this point that I can make
>> > it happen.
>> >
>> > So let's try using email as much possible for now.
>> >
>> > Cheers,
>> > Ben.
Benjamin Herrenschmidt Aug. 20, 2018, 1:27 p.m. UTC | #49
On Mon, 2018-08-20 at 09:02 -0400, Thomas Tai wrote:
> 
> On 08/20/2018 01:15 AM, Benjamin Herrenschmidt wrote:
> > On Mon, 2018-08-20 at 10:39 +0530, poza@codeaurora.org wrote:
> > > On 2018-08-19 07:54, Bjorn Helgaas wrote:
> > > > On Fri, Aug 17, 2018 at 09:27:03AM +1000, Benjamin Herrenschmidt wrote:
> > > > > On Thu, 2018-08-16 at 09:46 -0400, Sinan Kaya wrote:
> > > > > > On 8/16/2018 4:22 AM, poza@codeaurora.org wrote:
> > > > > > > we can catch up on webex, (Sinan is going to be there in Plumber's
> > > > > > > conference, I might not be able to join there, as we have bring-up coming)
> > > > > > 
> > > > > > My plumbers conference attendance is not confirmed yet. I'm trying to
> > > > > > combine it with a business trip. I did not get an ACK yet from my boss.
> > > > > 
> > > > > Ok, I'll collect the names and will propose a time/date early next
> > > > > week, I'm a bit caught up until monday.
> > > > > 
> > > > > Bjorn, are you available as well ?
> > > > 
> > > > I'm in CDT (UTC-5).  Possible times for me are 6am-10pm CDT.
> > > 
> > > I am in India timezone.
> > 
> > Hrm ... this is going to be difficult to reconcile. Let's first see
> > what we can do by email this week shall we ?
> 
> OK, I will catch up on email to see the details. I suppose I should hold 
> off the original patch to fix the "prevent pcie_do_fatal_recovery from 
> using device after it is removed" patch, right?

I wouldn't hold off on bug fixes for now. We might refactor all of this
but it might take a kernel version or two...

Cheers,
Ben.

> 
> Thank you,
> Thomas
> 
> > 
> > Cheers,
> > Ben.
> > 
> >
Keith Busch Aug. 20, 2018, 3:53 p.m. UTC | #50
On Mon, Aug 20, 2018 at 09:22:27PM +1000, Benjamin Herrenschmidt wrote:
> The main problem with unplug/replug (as I mentioned earlier) is that it
> just does NOT work for storage controllers (or similar type of
> devices). The links between the storage controller and the mounted
> filesystems is lost permanently, you'll most likely have to reboot the
> machine.

You probably shouldn't mount raw storage devices if they can be hot
added/removed. There are device mappers for that! :)

And you can't just change DPC device removal. A DPC event triggers
the link down, and that will trigger pciehp to disconnect the subtree
anyway. Having DPC do it too just means you get the same behavior with
or without enabling SLTCTL.DLLSCE.
Oza Pawandeep Aug. 20, 2018, 4:13 p.m. UTC | #51
On 2018-08-20 21:23, Keith Busch wrote:
> On Mon, Aug 20, 2018 at 09:22:27PM +1000, Benjamin Herrenschmidt wrote:
>> The main problem with unplug/replug (as I mentioned earlier) is that 
>> it
>> just does NOT work for storage controllers (or similar type of
>> devices). The links between the storage controller and the mounted
>> filesystems is lost permanently, you'll most likely have to reboot the
>> machine.
> 
> You probably shouldn't mount raw storage devices if they can be hot
> added/removed. There are device mappers for that! :)
> 
> And you can't just change DPC device removal. A DPC event triggers
> the link down, and that will trigger pciehp to disconnect the subtree
> anyway. Having DPC do it too just means you get the same behavior with
> or without enabling STLCTL.DLLSC.

Hi Keith,

what about the bridges which are not hotplug capable ?

Besides, the following patch is trying to ignore link events if there is
a fatal error pending:
[PATCH v8 1/2] PCI: pciehp: Ignore link events when there is a fatal
error pending

Regards,
Oza.
Keith Busch Aug. 20, 2018, 4:32 p.m. UTC | #52
On Mon, Aug 20, 2018 at 09:43:48PM +0530, poza@codeaurora.org wrote:
> On 2018-08-20 21:23, Keith Busch wrote:
> > On Mon, Aug 20, 2018 at 09:22:27PM +1000, Benjamin Herrenschmidt wrote:
> > > The main problem with unplug/replug (as I mentioned earlier) is that
> > > it
> > > just does NOT work for storage controllers (or similar type of
> > > devices). The links between the storage controller and the mounted
> > > filesystems is lost permanently, you'll most likely have to reboot the
> > > machine.
> > 
> > You probably shouldn't mount raw storage devices if they can be hot
> > added/removed. There are device mappers for that! :)
> > 
> > And you can't just change DPC device removal. A DPC event triggers
> > the link down, and that will trigger pciehp to disconnect the subtree
> > anyway. Having DPC do it too just means you get the same behavior with
> > or without enabling STLCTL.DLLSC.
> 
> Hi Keith,
> 
> what about the bridges which are not hotplug capable ?

That would have to mean SLTCTL.DLLSCE isn't enabled. The point was to
make DPC control behave the same regardless of slot control.
Benjamin Herrenschmidt Aug. 20, 2018, 9:02 p.m. UTC | #53
On Mon, 2018-08-20 at 18:56 +0530, poza@codeaurora.org wrote:
> Hi Ben,
> 
> I get the idea and get your points. and when I started implementation of 
> this, I did ask these questions to myself.
> but then we all fell aligned to what DPC was doing, because thats how 
> the driver was dealing with ERR_FATAL.
> 
> I can put together a patch adhering to the idea, and modify err.c.
> let me know if you are okay with that.
> I have both DPC and AER capable root port, so I can easily validate the 
> changes as we improve upon this.
> 
> besides We have to come to some common ground on policy.
> 1) if device driver dictates the reset we should agree to do SBR. 
> (irrespective of the type of error)

It's not just "the driver dictates the reset". It's a combination of
all agents. Either the driver or the framework, *both* can request a
reset. Also if you have multiple devices involved (for example multiple
functions), then any of the drivers can request the reset.

> 2) under what circumstances framework will impose the reset, even if 
> device driver did not !

Well, ERR_FATAL typically would be one. Make it an argument to the
framework; it's possible that EEH might always do so, I have to look into
it.

> probably changes might be simpler; although it will require to exercise 
> some tests cases for both DPC and AER.
> let me know if anything I missed here, but let me attempt to make patch 
> and we can go form there.
> 
> Let me know you opinion.

I agree. Hopefully Sam will have a chance to chime in (he's caught up
with something else at the moment) as well.

Cheers,
Ben.

> Regards,
> Oza.
> 
> > 
> > > Also I am very much interested in knowing original intention of DPC
> > > driver to unplug/plug devices,
> > > all I remember in some conversation was:
> > > hotplug capable bridge might have see devices changed, so it is safer 
> > > to
> > > remove/unplug the devices and during which .shutdown methods of driver
> > > is called, in case of ERR_FATAL.
> > 
> > The device being "swapped" during an error recovery operation is ...
> > unlikely. Do the bridges have a way to latch that the presence detect
> > changed ?
> > 
> > > although DPC is HW recovery while AER is sw recovery both should
> > > fundamentally act in the same way as far as device drivers callbacks 
> > > are
> > > concerned.
> > > (again I really dont know real motivation behind this)
> > 
> > The main problem with unplug/replug (as I mentioned earlier) is that it
> > just does NOT work for storage controllers (or similar type of
> > devices). The links between the storage controller and the mounted
> > filesystems is lost permanently, you'll most likely have to reboot the
> > machine.
> > 
> > With our current EEH implementation we can successfully recover from
> > fatal errors with the storage controller by resetting it.
> > 
> > Finally as for PERST vs. Hot Reset, this is an implementation detail.
> > Not all our platforms can control PERST on all devices either, we
> > generally prefer PERST as it provides a more reliable reset mechanism
> > in practice (talking from experience) but will fallback to hot reset.
> > 
> > Cheers,
> > Ben.
> > 
> > > 
> > > Regards,
> > > Oza.
> > > 
> > > >  - Figure out what hooks might be needed to be able to plumb EEH into
> > > > it, possibly removing a bunch of crap in arch/powerpc (yay !)
> > > > 
> > > > I don't think having a webex will be that practical with the timezones
> > > > involved. I'm trying to get approval to go to Plumbers in which case we
> > > > could setup a BOF but I have no guarantee at this point that I can make
> > > > it happen.
> > > > 
> > > > So let's try using email as much possible for now.
> > > > 
> > > > Cheers,
> > > > Ben.
Benjamin Herrenschmidt Aug. 20, 2018, 9:05 p.m. UTC | #54
On Mon, 2018-08-20 at 09:53 -0600, Keith Busch wrote:
> On Mon, Aug 20, 2018 at 09:22:27PM +1000, Benjamin Herrenschmidt wrote:
> > The main problem with unplug/replug (as I mentioned earlier) is that it
> > just does NOT work for storage controllers (or similar type of
> > devices). The links between the storage controller and the mounted
> > filesystems is lost permanently, you'll most likely have to reboot the
> > machine.
> 
> You probably shouldn't mount raw storage devices if they can be hot
> added/removed. There are device mappers for that! :)

This is not about hot adding/removing, it's about error recovery.

> And you can't just change DPC device removal. A DPC event triggers
> the link down, and that will trigger pciehp to disconnect the subtree
> anyway. Having DPC do it too just means you get the same behavior with
> or without enabling STLCTL.DLLSC.

This is wrong. EEH can trigger a link down too, and we don't remove the
subtree in that case. We allow the drivers to recover.

Ben.
Sinan Kaya Aug. 20, 2018, 9:21 p.m. UTC | #55
On 8/20/2018 5:05 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2018-08-20 at 09:53 -0600, Keith Busch wrote:
>> On Mon, Aug 20, 2018 at 09:22:27PM +1000, Benjamin Herrenschmidt wrote:
>>> The main problem with unplug/replug (as I mentioned earlier) is that it
>>> just does NOT work for storage controllers (or similar type of
>>> devices). The links between the storage controller and the mounted
>>> filesystems is lost permanently, you'll most likely have to reboot the
>>> machine.
>>
>> You probably shouldn't mount raw storage devices if they can be hot
>> added/removed. There are device mappers for that! :)
> 
> This is not about hot adding/removing, it's about error recovery.
> 
>> And you can't just change DPC device removal. A DPC event triggers
>> the link down, and that will trigger pciehp to disconnect the subtree
>> anyway. Having DPC do it too just means you get the same behavior with
>> or without enabling STLCTL.DLLSC.
> 
> This is wrong. EEH can trigger a link down to and we don't remove the
> subtree in that case. We allow the drivers to recover.
> 

I have a patch to solve this issue.

https://lkml.org/lkml/2018/8/19/124

Hotplug driver removes the devices on link down events and re-enumerates
on insertion.

I am trying to separate fatal error handling from hotplug.

> Ben.
> 
> 
>
Keith Busch Aug. 20, 2018, 9:35 p.m. UTC | #56
On Mon, Aug 20, 2018 at 05:21:30PM -0400, Sinan Kaya wrote:
> On 8/20/2018 5:05 PM, Benjamin Herrenschmidt wrote:
> > On Mon, 2018-08-20 at 09:53 -0600, Keith Busch wrote:
> > > On Mon, Aug 20, 2018 at 09:22:27PM +1000, Benjamin Herrenschmidt wrote:
> > > > The main problem with unplug/replug (as I mentioned earlier) is that it
> > > > just does NOT work for storage controllers (or similar type of
> > > > devices). The links between the storage controller and the mounted
> > > > filesystems is lost permanently, you'll most likely have to reboot the
> > > > machine.
> > > 
> > > You probably shouldn't mount raw storage devices if they can be hot
> > > added/removed. There are device mappers for that! :)
> > 
> > This is not about hot adding/removing, it's about error recovery.
> > 
> > > And you can't just change DPC device removal. A DPC event triggers
> > > the link down, and that will trigger pciehp to disconnect the subtree
> > > anyway. Having DPC do it too just means you get the same behavior with
> > > or without enabling STLCTL.DLLSC.
> > 
> > This is wrong. EEH can trigger a link down to and we don't remove the
> > subtree in that case. We allow the drivers to recover.
> > 
> 
> I have a patch to solve this issue.
> 
> https://lkml.org/lkml/2018/8/19/124
> 
> Hotplug driver removes the devices on link down events and re-enumerates
> on insertion.
> 
> I am trying to separate fatal error handling from hotplug.

I'll try to take a look. We can't always count on pciehp to do the
removal when a removal occurs, though. The PCIe specification contains
an implementation note that DPC may be used in place of surprise hotplug.
Benjamin Herrenschmidt Aug. 20, 2018, 9:53 p.m. UTC | #57
On Mon, 2018-08-20 at 15:35 -0600, Keith Busch wrote:
> On
> > I have a patch to solve this issue.
> > 
> > https://lkml.org/lkml/2018/8/19/124
> > 
> > Hotplug driver removes the devices on link down events and re-enumerates
> > on insertion.
> > 
> > I am trying to separate fatal error handling from hotplug.
> 
> I'll try to take a look. We can't always count on pciehp to do the
> removal when a removal occurs, though. The PCIe specification contains
> an implementation note that DPC may be used in place of hotplug surprise.

Can't you use the presence detect to differentiate?

Also, I don't have the specs at hand right now, but does the hotplug
bridge have a way to "latch" the change in presence detect so we can
see if it has transitioned even if it's back on ?

Ben.
Sinan Kaya Aug. 20, 2018, 10:02 p.m. UTC | #58
On 8/20/2018 5:53 PM, Benjamin Herrenschmidt wrote:
>>> Hotplug driver removes the devices on link down events and re-enumerates
>>> on insertion.
>>>
>>> I am trying to separate fatal error handling from hotplug.
>> I'll try to take a look. We can't always count on pciehp to do the
>> removal when a removal occurs, though. The PCIe specification contains
>> an implementation note that DPC may be used in place of hotplug surprise.
> Can't you use the presence detect to differenciate ?
> 
> Also, I don't have the specs at hand right now, but does the hotplug
> brigde have a way to "latch' the change in presence detect so we can
> see if it has transitioned even if it's back on ?

There is only presence detect change and link layer change. No actual
state information.
Benjamin Herrenschmidt Aug. 20, 2018, 10:04 p.m. UTC | #59
On Mon, 2018-08-20 at 18:02 -0400, Sinan Kaya wrote:
> On 8/20/2018 5:53 PM, Benjamin Herrenschmidt wrote:
> > > > Hotplug driver removes the devices on link down events and re-enumerates
> > > > on insertion.
> > > > 
> > > > I am trying to separate fatal error handling from hotplug.
> > > 
> > > I'll try to take a look. We can't always count on pciehp to do the
> > > removal when a removal occurs, though. The PCIe specification contains
> > > an implementation note that DPC may be used in place of hotplug surprise.
> > 
> > Can't you use the presence detect to differenciate ?
> > 
> > Also, I don't have the specs at hand right now, but does the hotplug
> > brigde have a way to "latch' the change in presence detect so we can
> > see if it has transitioned even if it's back on ?
> 
> There is only presence detect change and link layer change. No actual
> state information.

It does latch that it has changed though, right? So if presence detect
hasn't changed, we can assume it's an error and not an unplug?

We could discriminate that way to reduce the risk of doing a recovery
without unbind on something that was actually removed and replaced.

Cheers,
Ben.
Sinan Kaya Aug. 20, 2018, 10:13 p.m. UTC | #60
On 8/20/2018 6:04 PM, Benjamin Herrenschmidt wrote:
> On Mon, 2018-08-20 at 18:02 -0400, Sinan Kaya wrote:
>> On 8/20/2018 5:53 PM, Benjamin Herrenschmidt wrote:
>>>>> Hotplug driver removes the devices on link down events and re-enumerates
>>>>> on insertion.
>>>>>
>>>>> I am trying to separate fatal error handling from hotplug.
>>>>
>>>> I'll try to take a look. We can't always count on pciehp to do the
>>>> removal when a removal occurs, though. The PCIe specification contains
>>>> an implementation note that DPC may be used in place of hotplug surprise.
>>>
>>> Can't you use the presence detect to differenciate ?
>>>
>>> Also, I don't have the specs at hand right now, but does the hotplug
>>> brigde have a way to "latch' the change in presence detect so we can
>>> see if it has transitioned even if it's back on ?
>>
>> There is only presence detect change and link layer change. No actual
>> state information.
> 
> It does latch that it has changed tho right ? So if presence detect
> hasn't changed, we can assume it's an error and not an unplug ?
> 
> We could discriminate that way to reduce the risk of doing a recovery
> without unbind on something that was actually removed and replaced.
> 

I proposed this as one of the possible solutions, but presence detect is
optional, and the presence detect interrupt can also be delivered after
the link layer interrupt. There are no guarantees with respect to the
order in which the link layer and presence detect interrupts are delivered.

I instead look at the fatal error pending bit in the device status
register to decide whether the link down was initiated due to a pending
fatal error or somebody actually removed the card.

If a fatal error is pending, wait until it is cleared. If the link is
healthy, return gracefully.

Otherwise, proceed with the hotplug removal.
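
Roughly, the check amounts to something like this (a sketch of the idea,
not the actual patch; the function name is made up):

#include <linux/pci.h>

/* Sketch only: is the link-down on this port a side effect of a fatal
 * error that is still being handled, rather than a real removal?
 */
static bool example_link_down_is_fatal_error(struct pci_dev *port)
{
	u16 status;

	if (pcie_capability_read_word(port, PCI_EXP_DEVSTA, &status))
		return false;	/* config read failed, treat as removal */

	/* Fatal Error Detected still set: let error recovery run first */
	return status & PCI_EXP_DEVSTA_FED;
}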


> Cheers,
> Ben.
> 
> 
>
Keith Busch Aug. 20, 2018, 10:13 p.m. UTC | #61
On Tue, Aug 21, 2018 at 08:04:46AM +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2018-08-20 at 18:02 -0400, Sinan Kaya wrote:
> > On 8/20/2018 5:53 PM, Benjamin Herrenschmidt wrote:
> > > > > Hotplug driver removes the devices on link down events and re-enumerates
> > > > > on insertion.
> > > > > 
> > > > > I am trying to separate fatal error handling from hotplug.
> > > > 
> > > > I'll try to take a look. We can't always count on pciehp to do the
> > > > removal when a removal occurs, though. The PCIe specification contains
> > > > an implementation note that DPC may be used in place of hotplug surprise.
> > > 
> > > Can't you use the presence detect to differenciate ?
> > > 
> > > Also, I don't have the specs at hand right now, but does the hotplug
> > > brigde have a way to "latch' the change in presence detect so we can
> > > see if it has transitioned even if it's back on ?
> > 
> > There is only presence detect change and link layer change. No actual
> > state information.
> 
> It does latch that it has changed tho right ? So if presence detect
> hasn't changed, we can assume it's an error and not an unplug ?
> 
> We could discriminate that way to reduce the risk of doing a recovery
> without unbind on something that was actually removed and replaced.

There might be some potential here.

There actually is a Slot Status Presence Detect State (PDS), but it isn't
latched. The link and presence change (PDC) are latched.

A potential problem is that PDC event handling may observe PDS set to the
last known state if the remove + add happens fast enough, and then no
action is taken in the current implementation.

The DPC capability learned from such issues, and it latches the
status until acknowledged, so those sorts of races are not possible.

But anyway, maybe we can change pciehp to ignore PDS and always
re-enumerate when PDC is observed, and that may very well simplify a lot
of this.
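
As a rough sketch of that idea (illustrative only, not the pciehp code):

#include <linux/pci.h>

/* Sketch only: act on the latched Presence Detect Changed (PDC) bit
 * instead of sampling the current Presence Detect State (PDS), which can
 * race with a fast remove + add.
 */
static void example_handle_presence_change(struct pci_dev *hp_bridge)
{
	u16 slot_status;

	pcie_capability_read_word(hp_bridge, PCI_EXP_SLTSTA, &slot_status);
	if (!(slot_status & PCI_EXP_SLTSTA_PDC))
		return;

	/* clear the latched event (write-1-to-clear) */
	pcie_capability_write_word(hp_bridge, PCI_EXP_SLTSTA,
				   PCI_EXP_SLTSTA_PDC);

	/* something changed: re-enumerate regardless of what PDS reads
	 * now (a real implementation would also remove stale devices
	 * first)
	 */
	pci_lock_rescan_remove();
	if (hp_bridge->subordinate)
		pci_rescan_bus(hp_bridge->subordinate);
	pci_unlock_rescan_remove();
}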
Benjamin Herrenschmidt Aug. 20, 2018, 10:19 p.m. UTC | #62
On Mon, 2018-08-20 at 18:13 -0400, Sinan Kaya wrote:
> 
> I proposed this as one of the possible solutions but presence detect is
> optional 

Is it optional on hotplug capable bridges ? I thought it wasn't ...

> and also presence detect interrupt can be delivered after

We don't care about the interrupt, we just need to look at the value of
the change latch.

> link layer interrupt as well. No guarantees with respect to the order of
> link layer interrupt and presence detect interrupts delivery.
> 
> I instead look at fatal error pending bit in device status register to
> decide if the link down was initiated due to a pending fatal error or
> somebody actually removed the card.
> 
> If fatal error is pending, wait until it is cleared. If link is healthy,
> return gracefully.
> 
> Otherwise, proceed with the hotplug removal.

That could work too but it would be safer to also use the change in
presence detect if available.

Cheers,
Ben.
Benjamin Herrenschmidt Aug. 20, 2018, 10:19 p.m. UTC | #63
On Mon, 2018-08-20 at 16:13 -0600, Keith Busch wrote:
> 
> There might be some potential here.
> 
> There actualy is a Slot Status Presence Detect State (PDS), but it isn't
> latched. The link and presence change (PDC) are latched.
> 
> A potential problem may be a PDC event handling observes PDS set to the
> last known state if the remove + add happens fast enough, and then no
> action is taken in the current implementation.
> 
> The DPC capability learned from such issues, and they will latch the
> status until acknowledged so those sorts of races are not possible.
> 
> But anyway, maybe we can change pciehp to ignore PDS and always
> re-enumerate when PDC is observed, and that may very well simplify a lot
> of this.

That sounds reasonable to me.

Cheers,
Ben.
Keith Busch Aug. 21, 2018, 1:30 a.m. UTC | #64
On Tue, Aug 21, 2018 at 08:19:52AM +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2018-08-20 at 16:13 -0600, Keith Busch wrote:
> > But anyway, maybe we can change pciehp to ignore PDS and always
> > re-enumerate when PDC is observed, and that may very well simplify a lot
> > of this.
> 
> That sounds reasonable to me.

It seems 4.19 is way ahead of me! The latest pciehp driver looks really
good. It'll be more straightforward to separate errors from enumeration
than I thought, and Sinan's earlier patch is probably just fine in that
direction. Will look again tomorrow.
Oza Pawandeep Aug. 21, 2018, 5:14 a.m. UTC | #65
On 2018-08-21 02:32, Benjamin Herrenschmidt wrote:
> On Mon, 2018-08-20 at 18:56 +0530, poza@codeaurora.org wrote:
>> Hi Ben,
>> 
>> I get the idea and get your points. and when I started implementation 
>> of
>> this, I did ask these questions to myself.
>> but then we all fell aligned to what DPC was doing, because thats how
>> the driver was dealing with ERR_FATAL.
>> 
>> I can put together a patch adhering to the idea, and modify err.c.
>> let me know if you are okay with that.
>> I have both DPC and AER capable root port, so I can easily validate 
>> the
>> changes as we improve upon this.
>> 
>> besides We have to come to some common ground on policy.
>> 1) if device driver dictates the reset we should agree to do SBR.
>> (irrespective of the type of error)
> 
> It's not just "the driver dictates the reset". It's a combination of
> all agents. Either the driver or the framework, *both* can request a
> reset. Also if you have multiple devices involved (for example multiple
> functions), then any of the drivers can request the reset.
> 
>> 2) under what circumstances framework will impose the reset, even if
>> device driver did not !
> 
> Well ERR_FATAL typically would be one. Make it an argument to the
> framework, it's possible that EEH might always do, I have to look into
> it.
> 
>> probably changes might be simpler; although it will require to 
>> exercise
>> some tests cases for both DPC and AER.
>> let me know if anything I missed here, but let me attempt to make 
>> patch
>> and we can go form there.
>> 
>> Let me know you opinion.
> 
> I agree. Hopefully Sam will have a chance to chime in (he's caught up
> with something else at the moment) as well.
> 
> Cheers,
> Ben.
> 
>> Regards,
>> Oza.
>> 
>> >
>> > > Also I am very much interested in knowing original intention of DPC
>> > > driver to unplug/plug devices,
>> > > all I remember in some conversation was:
>> > > hotplug capable bridge might have see devices changed, so it is safer
>> > > to
>> > > remove/unplug the devices and during which .shutdown methods of driver
>> > > is called, in case of ERR_FATAL.
>> >
>> > The device being "swapped" during an error recovery operation is ...
>> > unlikely. Do the bridges have a way to latch that the presence detect
>> > changed ?
>> >
>> > > although DPC is HW recovery while AER is sw recovery both should
>> > > fundamentally act in the same way as far as device drivers callbacks
>> > > are
>> > > concerned.
>> > > (again I really dont know real motivation behind this)
>> >
>> > The main problem with unplug/replug (as I mentioned earlier) is that it
>> > just does NOT work for storage controllers (or similar type of
>> > devices). The links between the storage controller and the mounted
>> > filesystems is lost permanently, you'll most likely have to reboot the
>> > machine.
>> >
>> > With our current EEH implementation we can successfully recover from
>> > fatal errors with the storage controller by resetting it.
>> >
>> > Finally as for PERST vs. Hot Reset, this is an implementation detail.
>> > Not all our platforms can control PERST on all devices either, we
>> > generally prefer PERST as it provides a more reliable reset mechanism
>> > in practice (talking from experience) but will fallback to hot reset.
>> >
>> > Cheers,
>> > Ben.

OK, let me summarize what has been discussed so far.

It would be nice if we all (Bjorn, Keith, Ben, Sinan) could reach
consensus on this.

1) Right now AER and DPC both call pcie_do_fatal_recovery(). I mainly see
DPC as an error handling and recovery agent rather than something used for
hotplug, so in my opinion both AER and DPC should have the same error
handling and recovery mechanism.

    So if there is a way to figure out, in the absence of pciehp, that DPC
is being used to support hotplug, then we fall back to the original DPC
mechanism (which is to remove devices); otherwise, we fall back to the
drivers' callbacks.

    Spec 6.7.5 Async Removal
    "
    The Surprise Down error resulting from async removal may trigger 
Downstream Port Containment (See Section 6.2.10).
    Downstream Port Containment following an async removal may be 
utilized to hold the Link of a
    Downstream Port in the Disabled LTSSM state while host software 
recovers from the side effects of an async removal.
    "

    I think the above is implementation specific, but there has to be some
way to know if we are using DPC for hotplug or not; otherwise it is
assumed to be used for error handling and recovery.

    pcie_do_fatal_recovery() should take care of the above, so that we
support both error handling and async removal from the DPC point of view.


2) It is left to the driver first to determine whether it wants to request
a reset of the device or not, based on the sanity of the link (e.g. if it
is reading 0xffffffff back!).
    pci_channel_io_frozen is what the framework should use to communicate
to the driver that this is ERR_FATAL.

3) @Ben, although I am not very clear on this: if there is an ERR_FATAL,
under what circumstances would the driver deny the reset but the framework
impose one?

4) Sinan's patch is going to ignore link states if it finds ERR_FATAL, so
that pciehp will not trigger and unplug the devices.


5) In pcie_do_nonfatal_recovery() we should actually reset the link, e.g.
SBR, if the driver requests a reset of the link (there is already a TODO
note anyway):
    if (status == PCI_ERS_RESULT_NEED_RESET) {
		/*
		 * TODO: Should call platform-specific
		 * functions to reset slot before calling
		 * drivers' slot_reset callbacks?
		 */
		status = broadcast_error_message(dev,
				state,
				"slot_reset",
				report_slot_reset);
	}


Let me know how this sounds; if we all agree on the behavior I can go
ahead with the changes.

P.S.: I still need input on point 1, though.

Regards,
Oza.

>> >
>> > >
>> > > Regards,
>> > > Oza.
>> > >
>> > > >  - Figure out what hooks might be needed to be able to plumb EEH into
>> > > > it, possibly removing a bunch of crap in arch/powerpc (yay !)
>> > > >
>> > > > I don't think having a webex will be that practical with the timezones
>> > > > involved. I'm trying to get approval to go to Plumbers in which case we
>> > > > could setup a BOF but I have no guarantee at this point that I can make
>> > > > it happen.
>> > > >
>> > > > So let's try using email as much possible for now.
>> > > >
>> > > > Cheers,
>> > > > Ben.
Benjamin Herrenschmidt Aug. 21, 2018, 6:06 a.m. UTC | #66
On Tue, 2018-08-21 at 10:44 +0530, poza@codeaurora.org wrote:
> 
> Ok Let me summarize the so far discussed things.
> 
> It would be nice if we all (Bjorn, Keith, Ben, Sinan) can hold consensus 
> on this.
> 
> 1) Right now AER and DPC both calls pcie_do_fatal_recovery(), I majorly 
> see DPC as error handling and recovery agent rather than being used for 
> hotplug.
>     so in my opinion, both AER and DPC should have same error handling 
> and recovery mechanism

Yes.

>     so if there is a way to figure out that in absence of pcihp, if DPC 
> is being used to support hotplug then we fall back to original DPC 
> mechanism (which is remove devices)

Not exactly. Rather, only if the presence detect change indicates it was
a hotplug event.

>     otherwise, we fall back to drivers callbacks.

Driver callback should be the primary mechanism unless we think there's
a chance the device was removed (or the driver has no callbacks)

>     Spec 6.7.5 Async Removal
>     "
>     The Surprise Down error resulting from async removal may trigger 
> Downstream Port Containment (See Section 6.2.10).
>     Downstream Port Containment following an async removal may be 
> utilized to hold the Link of a
>     Downstream Port in the Disabled LTSSM state while host software 
> recovers from the side effects of an async removal.
>     "
> 
>     I think above is implementation specific. but there has to be some 
> way to kow if we are using DPC for hotplug or not !
>     otherwise it is assumed to be used for error handling and recovery
> 
>     pcie_do_fatal_recovery should take care of above. so that we support 
> both error handling and async removal from DPC point of view.
> 
> 
> 2) It is left to driver first to determine whether it wants to requests 
> the reset of device or not based on sanity of link (e.g. if it is 
> reading 0xffffffff back !)

The need to reset is the logical OR of the driver wanting a reset and
the framework wanting a reset. If *either* wants a reset, then we do a
reset. Typically the framework would request one for ERR_FATAL.

Actually this is a bit more convoluted than that. There can be more
than one driver involved in a given error event, for example because
the device might have multiple functions, or because the error could
have been reported by an upstream bridge.

So the need to reset is actually the logical OR of *any driver*
involved returning PCI_ERS_RESULT_NEED_RESET with the framework wanting
to reset.
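
Expressed as code, that decision is roughly the following sketch;
call_error_detected() and do_link_reset() are hypothetical stand-ins, not
real helpers:

	/* Sketch: reset if the framework wants it OR any driver asks for it */
	bool need_reset = (state == pci_channel_io_frozen);	/* ERR_FATAL */
	struct pci_dev *pdev;

	list_for_each_entry(pdev, &bus->devices, bus_list) {
		pci_ers_result_t vote = call_error_detected(pdev, state);

		if (vote == PCI_ERS_RESULT_NEED_RESET)
			need_reset = true;
	}

	if (need_reset)
		do_link_reset(bus->self);	/* e.g. secondary bus reset */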

>     pci_channel_io_frozen is the one which should be used from framework 
> to communicate driver that this is ERR_FATAL.

Maybe, not sure ... on EEH we have special HW that freezes on all non-
corrected errors but maybe for AER/DPC that's the way to go.

> 3) @Ben, although I am not very clear that if there is an ERR_FATAL, in 
> what circumstances driver will deny the reset but framework will impose 
> reset ?

The driver doesn't "deny" the reset. The driver just says it can
recover without one. 

The driver doesn't necessarily need to care whether the error was fatal or
not; it cares about whether its device seems functional. The driver can
check some diagnostic registers, make sure the firmware hasn't crashed,
etc., and might want to request a reset if it's unhappy with the state of
the device, for example.
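
To make that concrete, a sketch of a driver-side error_detected() callback
for a hypothetical "foo" driver, deciding on device sanity (an all-ones
config read) rather than on the severity reported by the framework:

static pci_ers_result_t foo_error_detected(struct pci_dev *pdev,
					   enum pci_channel_state state)
{
	u32 id;

	if (state == pci_channel_io_perm_failure)
		return PCI_ERS_RESULT_DISCONNECT;

	/* All-ones from config space usually means the link is dead */
	pci_read_config_dword(pdev, PCI_VENDOR_ID, &id);
	if (id == 0xffffffff)
		return PCI_ERS_RESULT_NEED_RESET;

	/* Device still looks sane; we can recover without a reset */
	return PCI_ERS_RESULT_CAN_RECOVER;
}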

The framework will typically impose a reset on ERR_FATAL (or because
platform policy is to always reset, which might be our choice on powerpc
with EEH, I'll have to think about it).

> 4) Sinan's patch is going to ignore link states if it finds ERR_FATAL,so 
> that pcihp will not trigger and unplug the devices.

I would rather suggest checking the presence detect change latch if it's
supported. If not supported, maybe ERR_FATAL is a good fallback.

> 5) pcie_do_nonfatal_recovery; we sould actually reset the link e.g. SBR 
> if driver requests the reset of link.  (there is already TDO note 
> anyway).
>     if (status == PCI_ERS_RESULT_NEED_RESET) {
> 		/*
> 		 * TODO: Should call platform-specific
> 		 * functions to reset slot before calling
> 		 * drivers' slot_reset callbacks?
> 		 */
> 		status = broadcast_error_message(dev,
> 				state,
> 				"slot_reset",
> 				report_slot_reset);
> 	}
> 

Yes.

> Let me know how this sounds, if we all agree on behavior I can go ahead 
> with the changes.

> ps: although I need input on point-1.

Ideally we would like to look at making EEH use the generic framework
as well, though because of subtle differences on how our HW freezes
stuff etc... we might have to add a few quirks to it.

Cheers,
Ben.

> Regards,
> Oza.
> 
> > > > 
> > > > > 
> > > > > Regards,
> > > > > Oza.
> > > > > 
> > > > > >  - Figure out what hooks might be needed to be able to plumb EEH into
> > > > > > it, possibly removing a bunch of crap in arch/powerpc (yay !)
> > > > > > 
> > > > > > I don't think having a webex will be that practical with the timezones
> > > > > > involved. I'm trying to get approval to go to Plumbers in which case we
> > > > > > could setup a BOF but I have no guarantee at this point that I can make
> > > > > > it happen.
> > > > > > 
> > > > > > So let's try using email as much possible for now.
> > > > > > 
> > > > > > Cheers,
> > > > > > Ben.
Keith Busch Aug. 21, 2018, 2:37 p.m. UTC | #67
On Tue, Aug 21, 2018 at 04:06:30PM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2018-08-21 at 10:44 +0530, poza@codeaurora.org wrote:
> > 
> > Ok Let me summarize the so far discussed things.
> > 
> > It would be nice if we all (Bjorn, Keith, Ben, Sinan) can hold consensus 
> > on this.
> > 
> > 1) Right now AER and DPC both calls pcie_do_fatal_recovery(), I majorly 
> > see DPC as error handling and recovery agent rather than being used for 
> > hotplug.
> >     so in my opinion, both AER and DPC should have same error handling 
> > and recovery mechanism
> 
> Yes.
>
> >     so if there is a way to figure out that in absence of pcihp, if DPC 
> > is being used to support hotplug then we fall back to original DPC 
> > mechanism (which is remove devices)
> 
> Not exactly. If the presence detect change indicates it was a hotplug
> event rather.

The actions associated with error recovery will trigger link state changes
for a lot of existing hardware. PCIEHP currently does the same removal
sequence for both link state change (DLLSC) and presence detect change
(PDC) events.

It sounds like you want pciehp to do nothing on the DLLSC events that it
currently handles, and instead do the board removal only on PDC.  If that
is the case, is the desire to not remove devices downstream of a permanently
disabled link, or does that responsibility fall onto some other component?
Sinan Kaya Aug. 21, 2018, 3:07 p.m. UTC | #68
On 8/21/2018 10:37 AM, Keith Busch wrote:
> On Tue, Aug 21, 2018 at 04:06:30PM +1000, Benjamin Herrenschmidt wrote:
>> On Tue, 2018-08-21 at 10:44 +0530, poza@codeaurora.org wrote:
>>>
>>> Ok Let me summarize the so far discussed things.
>>>
>>> It would be nice if we all (Bjorn, Keith, Ben, Sinan) can hold consensus
>>> on this.
>>>
>>> 1) Right now AER and DPC both calls pcie_do_fatal_recovery(), I majorly
>>> see DPC as error handling and recovery agent rather than being used for
>>> hotplug.
>>>      so in my opinion, both AER and DPC should have same error handling
>>> and recovery mechanism
>>
>> Yes.
>>
>>>      so if there is a way to figure out that in absence of pcihp, if DPC
>>> is being used to support hotplug then we fall back to original DPC
>>> mechanism (which is remove devices)
>>
>> Not exactly. If the presence detect change indicates it was a hotplug
>> event rather.
> 
> The actions associated with error recovery will trigger link state changes
> for a lot of existing hardware. PCIEHP currently does the same removal
> sequence for both link state change (DLLSC) and presence detect change
> (PDC) events.
> 
> It sounds like you want pciehp to do nothing on the DLLSC events that it
> currently handles, and instead do the board removal only on PDC.  If that
> is the case, is the desire to not remove devices downstream a permanently
> disabled link, or does that responsibility fall onto some other component?
> 

Looking at PDC is not enough. Today the hotplug driver handles both physical
removal and the link going down due to signal integrity issues.

If the link went down because of a pending FATAL error, AER/DPC recovers
the link automatically. There is no need for hotplug to be involved in
fatal error work.

The hotplug driver needs to handle both physical removal and intermittent
link-down issues, though.
Keith Busch Aug. 21, 2018, 3:29 p.m. UTC | #69
On Tue, Aug 21, 2018 at 11:07:31AM -0400, Sinan Kaya wrote:
> On 8/21/2018 10:37 AM, Keith Busch wrote:
> > The actions associated with error recovery will trigger link state changes
> > for a lot of existing hardware. PCIEHP currently does the same removal
> > sequence for both link state change (DLLSC) and presence detect change
> > (PDC) events.
> > 
> > It sounds like you want pciehp to do nothing on the DLLSC events that it
> > currently handles, and instead do the board removal only on PDC.  If that
> > is the case, is the desire to not remove devices downstream a permanently
> > disabled link, or does that responsibility fall onto some other component?
> > 
> 
> Looking at PDC is not enough. Hotplug driver handles both physical
> removal as well as the link going down due to signal integrity issues today.
> 
> If the link went down because of a pending FATAL error, AER/DPC recovers
> the link automatically. There is no need for hotplug to be involved in
> fatal error work.
> 
> Hotplug driver needs to handle both physical removal as well as intermittent
> link down issues though.

Back to your patch you linked to earlier, your proposal is to have
pciehp wait for DEVSTS.FED before deciding if it needs to handle the
DLLSC event. That might be a start, but it isn't enough since that
status isn't set if the downstream device reported ERR_FATAL. I think
you'd need to check the secondary status register for a Received System
Error.
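
For reference, a minimal sketch of that kind of check (helper name and
placement are hypothetical); bit 14 of a bridge's Secondary Status register
is "Received System Error", and Linux reuses the PCI_STATUS_* bit
definitions for that register:

static bool bridge_received_system_error(struct pci_dev *bridge)
{
	u16 sec_status;

	pci_read_config_word(bridge, PCI_SEC_STATUS, &sec_status);
	return sec_status & PCI_STATUS_SIG_SYSTEM_ERROR;	/* bit 14 */
}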
Oza Pawandeep Aug. 21, 2018, 3:30 p.m. UTC | #70
On 2018-08-21 20:07, Keith Busch wrote:
> On Tue, Aug 21, 2018 at 04:06:30PM +1000, Benjamin Herrenschmidt wrote:
>> On Tue, 2018-08-21 at 10:44 +0530, poza@codeaurora.org wrote:
>> >
>> > Ok Let me summarize the so far discussed things.
>> >
>> > It would be nice if we all (Bjorn, Keith, Ben, Sinan) can hold consensus
>> > on this.
>> >
>> > 1) Right now AER and DPC both calls pcie_do_fatal_recovery(), I majorly
>> > see DPC as error handling and recovery agent rather than being used for
>> > hotplug.
>> >     so in my opinion, both AER and DPC should have same error handling
>> > and recovery mechanism
>> 
>> Yes.
>> 
>> >     so if there is a way to figure out that in absence of pcihp, if DPC
>> > is being used to support hotplug then we fall back to original DPC
>> > mechanism (which is remove devices)
>> 
>> Not exactly. If the presence detect change indicates it was a hotplug
>> event rather.
> 
> The actions associated with error recovery will trigger link state 
> changes
> for a lot of existing hardware. PCIEHP currently does the same removal
> sequence for both link state change (DLLSC) and presence detect change
> (PDC) events.
> 
> It sounds like you want pciehp to do nothing on the DLLSC events that 
> it
> currently handles, and instead do the board removal only on PDC.  If 
> that
> is the case, is the desire to not remove devices downstream a 
> permanently
> disabled link, or does that responsibility fall onto some other 
> component?

Keith

Are you in agreement with the following?

"
Right now AER and DPC both calls pcie_do_fatal_recovery(), I majorly see 
DPC as error handling and recovery agent rather than being used for 
hotplug.
    so in my opinion, both AER and DPC should have same error handling 
and recovery mechanism

    so if there is a way to figure out that in absence of pcihp, if DPC 
is being used to support hotplug then we fall back to original DPC 
mechanism (which is remove devices)
    otherwise, we fall back to drivers callbacks.

    Spec 6.7.5 Async Removal
    "
    The Surprise Down error resulting from async removal may trigger 
Downstream Port Containment (See Section 6.2.10).
    Downstream Port Containment following an async removal may be 
utilized to hold the Link of a
    Downstream Port in the Disabled LTSSM state while host software 
recovers from the side effects of an async removal.
    "

    I think above is implementation specific. but there has to be some 
way to know if we are using DPC for hotplug or not !
    otherwise it is assumed to be used for error handling and recovery

    pcie_do_fatal_recovery should take care of above. so that we support 
both error handling and async removal from DPC point of view.
"
Sinan Kaya Aug. 21, 2018, 3:50 p.m. UTC | #71
On 8/21/2018 11:29 AM, Keith Busch wrote:
>> Hotplug driver needs to handle both physical removal as well as intermittent
>> link down issues though.
> Back to your patch you linked to earlier, your proposal is to have
> pciehp wait for DEVSTS.FED before deciding if it needs to handle the
> DLLSC event. That might be a start, but it isn't enough since that
> status isn't set if the downstream device reported ERR_FATAL. I think
> you'd need to check the secondary status register for a Received System
> Error.

Hmm, good feedback.

I was trying not to mix AER/DPC with HP but obviously I failed.
I can add a flag to struct pci_dev like aer_pending for AER.

1. AER ISR sets aer_pending
2. AER ISR issues a secondary bus reset
3. Hotplug driver bails out on aer_pending
4. AER ISR performs the recovery

It is slightly more challenging on the DPC front as HW brings down
the link automatically and hotplug driver can observe the link down
event before the DPC ISR. I'll have to go check if DPC interrupt is
pending.

Let me know if this works out.
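
A minimal sketch of how such a flag might be wired up, assuming a
hypothetical aer_pending bit added to struct pci_dev (port being the
bridge's pci_dev); nothing like it exists today:

	/* AER ISR (steps 1-2): */
	WRITE_ONCE(port->aer_pending, true);
	pci_reset_bridge_secondary_bus(port);

	/* pciehp link-down handling (step 3): */
	if (READ_ONCE(port->aer_pending))
		return;		/* AER owns this event; don't remove devices */

	/* AER ISR, once recovery is done (step 4): */
	WRITE_ONCE(port->aer_pending, false);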
Sinan Kaya Aug. 21, 2018, 3:55 p.m. UTC | #72
On 8/21/2018 11:50 AM, Sinan Kaya wrote:
> On 8/21/2018 11:29 AM, Keith Busch wrote:
>>> Hotplug driver needs to handle both physical removal as well as 
>>> intermittent
>>> link down issues though.
>> Back to your patch you linked to earlier, your proposal is to have
>> pciehp wait for DEVSTS.FED before deciding if it needs to handle the
>> DLLSC event. That might be a start, but it isn't enough since that
>> status isn't set if the downstream device reported ERR_FATAL. I think
>> you'd need to check the secondary status register for a Received System
>> Error.
> 
> Hmm, good feedback.
> 
> I was trying not to mix AER/DPC with HP but obviously I failed.
> I can add a flag to struct pci_dev like aer_pending for AER.
> 
> 1. AER ISR sets aer_pending
> 2. AER ISR issues a secondary bus reset
> 3. Hotplug driver bails out on aer_pending
> 4. AER ISR performs the recovery
> 
> It is slightly more challenging on the DPC front as HW brings down
> the link automatically and hotplug driver can observe the link down
> event before the DPC ISR. I'll have to go check if DPC interrupt is
> pending.
> 
> Let me know if this works out.

I forgot to mention that I didn't really want to rely on the Secondary
Status register, as most recent PCIe controller HW doesn't really implement
the old PCI status register bits correctly.
Keith Busch Aug. 21, 2018, 4:44 p.m. UTC | #73
On Tue, Aug 21, 2018 at 11:50:54AM -0400, Sinan Kaya wrote:
> On 8/21/2018 11:29 AM, Keith Busch wrote:
> > > Hotplug driver needs to handle both physical removal as well as intermittent
> > > link down issues though.
> > Back to your patch you linked to earlier, your proposal is to have
> > pciehp wait for DEVSTS.FED before deciding if it needs to handle the
> > DLLSC event. That might be a start, but it isn't enough since that
> > status isn't set if the downstream device reported ERR_FATAL. I think
> > you'd need to check the secondary status register for a Received System
> > Error.
> 
> Hmm, good feedback.
> 
> I was trying not to mix AER/DPC with HP but obviously I failed.
> I can add a flag to struct pci_dev like aer_pending for AER.
> 
> 1. AER ISR sets aer_pending
> 2. AER ISR issues a secondary bus reset
> 3. Hotplug driver bails out on aer_pending
> 4. AER ISR performs the recovery
> 
> It is slightly more challenging on the DPC front as HW brings down
> the link automatically and hotplug driver can observe the link down
> event before the DPC ISR. I'll have to go check if DPC interrupt is
> pending.
> 
> Let me know if this works out.

That sounds like a step in a good direction.

I think it's also safe to say the spec requires a slot capable of swapping
devices to report the PDC slot event, so that should be a valid criterion
for knowing whether a hotplug event occurred. There are other issues that
we may need to fix.

Consider a hierarchy of switches where the root port starts a DPC
event and the kernel tries to recover. Meanwhile, devices downstream of
switches lower in the hierarchy have changed. Since recovery doesn't do
re-enumeration and the top-level downstream port didn't detect a hotplug
event, the recovery will "initialize" the wrong device, with potentially
undefined results.

I think we'd need pciehp involved when error recovery rewalks the
downstream buses so pciehp may check PDC on capable ports. Implementing
'error_resume' in pciehp to de-enumerate/re-enumerate accordingly might
work ... If that happened, pciehp could call pcie_do_fatal_recovery()
and all three services could be unified!
Benjamin Herrenschmidt Aug. 21, 2018, 9:14 p.m. UTC | #74
On Tue, 2018-08-21 at 08:37 -0600, Keith Busch wrote:
> > 
> > >      so if there is a way to figure out that in absence of pcihp, if DPC 
> > > is being used to support hotplug then we fall back to original DPC 
> > > mechanism (which is remove devices)
> > 
> > Not exactly. If the presence detect change indicates it was a hotplug
> > event rather.
> 
> The actions associated with error recovery will trigger link state changes
> for a lot of existing hardware. PCIEHP currently does the same removal
> sequence for both link state change (DLLSC) and presence detect change
> (PDC) events.
> 
> It sounds like you want pciehp to do nothing on the DLLSC events that it
> currently handles, and instead do the board removal only on PDC.  If that
> is the case, is the desire to not remove devices downstream a permanently
> disabled link, or does that responsibility fall onto some other component?

I think there needs to be some coordination between pciehp and DPC on
link state changes, yes.

We could still remove the device if recovery fails. For example on EEH
we have a threshold and if a device fails more than N times within the
last M minutes (I don't remember the exact numbers and I'm not in front
of the code right now) we give up.

Also under some circumstances, the link will change as a result of the
error handling doing a hot reset.

For example, if the error happens in a parent bridge and that gets
reset, the entire hierarchy underneath does too.

We need to save/restore the state of all bridges and devices (BARs
etc...) below that.
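
A rough, untested sketch of that save/restore walk over the subtree below
the bridge being reset, built on the existing
pci_walk_bus()/pci_save_state()/pci_restore_state() helpers:

static int save_dev_state(struct pci_dev *pdev, void *data)
{
	pci_save_state(pdev);
	return 0;
}

static int restore_dev_state(struct pci_dev *pdev, void *data)
{
	pci_restore_state(pdev);
	return 0;
}

	/* before the hot reset of the parent bridge */
	pci_walk_bus(bridge->subordinate, save_dev_state, NULL);

	/* ... reset, wait for the link to retrain ... */

	/* afterwards, restore every bridge and device below it */
	pci_walk_bus(bridge->subordinate, restore_dev_state, NULL);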

Cheers,
Ben.
Keith Busch Aug. 21, 2018, 10:04 p.m. UTC | #75
On Wed, Aug 22, 2018 at 07:14:32AM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2018-08-21 at 08:37 -0600, Keith Busch wrote:
> > > 
> > > >      so if there is a way to figure out that in absence of pcihp, if DPC 
> > > > is being used to support hotplug then we fall back to original DPC 
> > > > mechanism (which is remove devices)
> > > 
> > > Not exactly. If the presence detect change indicates it was a hotplug
> > > event rather.
> > 
> > The actions associated with error recovery will trigger link state changes
> > for a lot of existing hardware. PCIEHP currently does the same removal
> > sequence for both link state change (DLLSC) and presence detect change
> > (PDC) events.
> > 
> > It sounds like you want pciehp to do nothing on the DLLSC events that it
> > currently handles, and instead do the board removal only on PDC.  If that
> > is the case, is the desire to not remove devices downstream a permanently
> > disabled link, or does that responsibility fall onto some other component?
> 
> I think there need to be some coordination between pciehb and DPC on
> link state change yes.
> 
> We could still remove the device if recovery fails. For example on EEH
> we have a threshold and if a device fails more than N times within the
> last M minutes (I don't remember the exact numbers and I'm not in front
> of the code right now) we give up.
> 
> Also under some circumstances, the link will change as a result of the
> error handling doing a hot reset.
> 
> For example, if the error happens in a parent bridge and that gets
> reset, the entire hierarchy underneath does too.
> 
> We need to save/restore the state of all bridges and devices (BARs
> etc...) below that.

That's not good enough. You'd at least need to check SLTSTS.PDC on all
bridge DSPs beneath the parent *before* you try to restore the state
of devices beneath them. Those races are primarily why DPC currently
removes and reenumerates instead, because that's not something that can
be readily coordinated today.
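
As a sketch of the kind of per-port check being described (helper name
hypothetical), reading the Slot Status register of each hotplug-capable
downstream port before restoring the devices below it:

static bool dsp_latched_presence_change(struct pci_dev *dsp)
{
	u16 slot_status;

	if (!dsp->is_hotplug_bridge)
		return false;

	pcie_capability_read_word(dsp, PCI_EXP_SLTSTA, &slot_status);
	return slot_status & PCI_EXP_SLTSTA_PDC;
}

A restore walk could skip (and instead re-enumerate) any subtree for which
this returns true, though as noted below pciehp's ISR also consumes that
status, so this alone doesn't close the race.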
Keith Busch Aug. 21, 2018, 10:33 p.m. UTC | #76
On Tue, Aug 21, 2018 at 04:04:56PM -0600, Keith Busch wrote:
> > For example, if the error happens in a parent bridge and that gets
> > reset, the entire hierarchy underneath does too.
> > 
> > We need to save/restore the state of all bridges and devices (BARs
> > etc...) below that.
> 
> That's not good enough. You'd at least need to check SLTSTS.PDC on all
> bridge DSP's beneath the parent *before* you try to restore the state
> of devices beneath them. Those races are primarily why DPC currently
> removes and reenumerates instead, because that's not something that can
> be readily coordinated today.

A picture of a real scenario to go with the above comments:

                    ----------------
  --------          |              |
  |      |       -------        -------       --------------
  |  RP  | <---> | USP | SWITCH | DSP | <---> | END DEVICE |
  |      |       -------        -------       --------------
  --------          |              |
                    ----------------

The downstream port (DSP) is hotplug capable, and the root port (RP)
has DPC enabled.

Let's say you hot swap END DEVICE at a time with an in-flight posted
transaction such that the RP triggers a CTO, starting a DPC event.
If you treat the RP's DPC event as just "error handling" and restore
the state after containment is released, we are in undefined behavior
because of the mistaken identity of the device below the SWITCH DSP.

Re-enumerating the topology is easy since it has no races like that. I
understand re-enumeration is problematic for some scenarios, so if we
really need to avoid re-enumeration except when absolutely necessary,
it should be possible if we can detangle the necessary coordination
between the port service drivers.
Benjamin Herrenschmidt Aug. 21, 2018, 11:06 p.m. UTC | #77
On Tue, 2018-08-21 at 16:04 -0600, Keith Busch wrote:
> > I think there need to be some coordination between pciehb and DPC on
> > link state change yes.
> > 
> > We could still remove the device if recovery fails. For example on EEH
> > we have a threshold and if a device fails more than N times within the
> > last M minutes (I don't remember the exact numbers and I'm not in front
> > of the code right now) we give up.
> > 
> > Also under some circumstances, the link will change as a result of the
> > error handling doing a hot reset.
> > 
> > For example, if the error happens in a parent bridge and that gets
> > reset, the entire hierarchy underneath does too.
> > 
> > We need to save/restore the state of all bridges and devices (BARs
> > etc...) below that.
> 
> That's not good enough. You'd at least need to check SLTSTS.PDC on all
> bridge DSP's beneath the parent *before* you try to restore the state
> of devices beneath them. Those races are primarily why DPC currently
> removes and reenumerates instead, because that's not something that can
> be readily coordinated today.

It can probably be done by a simple test & skip as you go down
restoring state, then handling the removals after the dance is
complete.

Cheers,
Ben.
Keith Busch Aug. 21, 2018, 11:13 p.m. UTC | #78
On Wed, Aug 22, 2018 at 09:06:57AM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2018-08-21 at 16:04 -0600, Keith Busch wrote:
> > > I think there need to be some coordination between pciehb and DPC on
> > > link state change yes.
> > > 
> > > We could still remove the device if recovery fails. For example on EEH
> > > we have a threshold and if a device fails more than N times within the
> > > last M minutes (I don't remember the exact numbers and I'm not in front
> > > of the code right now) we give up.
> > > 
> > > Also under some circumstances, the link will change as a result of the
> > > error handling doing a hot reset.
> > > 
> > > For example, if the error happens in a parent bridge and that gets
> > > reset, the entire hierarchy underneath does too.
> > > 
> > > We need to save/restore the state of all bridges and devices (BARs
> > > etc...) below that.
> > 
> > That's not good enough. You'd at least need to check SLTSTS.PDC on all
> > bridge DSP's beneath the parent *before* you try to restore the state
> > of devices beneath them. Those races are primarily why DPC currently
> > removes and reenumerates instead, because that's not something that can
> > be readily coordinated today.
> 
> It can be probably done by a simple test & skip as you go down
> restoring state, then handling the removals after the dance is
> complete.

You can't just test the states in config space. You're racing with
pciehp's ISR, which clears those states. You'd need to fundamentally
change pciehp to (1) not process such events until some currently
undefined criterion is established, and (2) save and export the previously
latched states while pciehp bottom half processing is stalled.
Benjamin Herrenschmidt Aug. 22, 2018, 12:36 a.m. UTC | #79
On Tue, 2018-08-21 at 17:13 -0600, Keith Busch wrote:
> On Wed, Aug 22, 2018 at 09:06:57AM +1000, Benjamin Herrenschmidt wrote:
> > On Tue, 2018-08-21 at 16:04 -0600, Keith Busch wrote:
> > > > I think there need to be some coordination between pciehb and DPC on
> > > > link state change yes.
> > > > 
> > > > We could still remove the device if recovery fails. For example on EEH
> > > > we have a threshold and if a device fails more than N times within the
> > > > last M minutes (I don't remember the exact numbers and I'm not in front
> > > > of the code right now) we give up.
> > > > 
> > > > Also under some circumstances, the link will change as a result of the
> > > > error handling doing a hot reset.
> > > > 
> > > > For example, if the error happens in a parent bridge and that gets
> > > > reset, the entire hierarchy underneath does too.
> > > > 
> > > > We need to save/restore the state of all bridges and devices (BARs
> > > > etc...) below that.
> > > 
> > > That's not good enough. You'd at least need to check SLTSTS.PDC on all
> > > bridge DSP's beneath the parent *before* you try to restore the state
> > > of devices beneath them. Those races are primarily why DPC currently
> > > removes and reenumerates instead, because that's not something that can
> > > be readily coordinated today.
> > 
> > It can be probably done by a simple test & skip as you go down
> > restoring state, then handling the removals after the dance is
> > complete.
> 
> You can't just test the states in config space. You're racing with
> pciehp's ISR, which clears those states.

Sure, I meant you'd call into pciehp. For EEH we do have some linkage
between hotplug and EEH, though a lot of it happens at the FW level for
us.

>  You'd need to fundamentally
> change pciehp to (1) not process such events until some currently
> undefined criteria is established, and (2) save and export the previously
> latched states while pciehp bottom half processing is stalled.

Ben.
Lukas Wunner Aug. 22, 2018, 9:13 a.m. UTC | #80
On Mon, Aug 20, 2018 at 06:13:03PM -0400, Sinan Kaya wrote:
> presence detect is optional

Hm, how so?  Do you have a spec reference for that?

Thanks,

Lukas
Keith Busch Aug. 22, 2018, 2:38 p.m. UTC | #81
On Wed, Aug 22, 2018 at 11:13:33AM +0200, Lukas Wunner wrote:
> On Mon, Aug 20, 2018 at 06:13:03PM -0400, Sinan Kaya wrote:
> > presence detect is optional
> 
> Hm, how so?  Do you have a spec reference for that?

Yeah, presence detect state and change are required if the slot supports
hot plug. If the slot doesn't support these, error handling becomes a
lot simpler.
Sinan Kaya Aug. 22, 2018, 2:51 p.m. UTC | #82
On 8/22/2018 10:38 AM, Keith Busch wrote:
> On Wed, Aug 22, 2018 at 11:13:33AM +0200, Lukas Wunner wrote:
>> On Mon, Aug 20, 2018 at 06:13:03PM -0400, Sinan Kaya wrote:
>>> presence detect is optional
>>
>> Hm, how so?  Do you have a spec reference for that?
> 
> Yeah, presence detect state and change are required if the slot supports
> hot plug. If the slot doesn't support these, error handling becomes a
> lot simpler.
> 

I double checked the slot capabilities register. You are right, it is
required.
Keith Busch Aug. 30, 2018, 12:01 a.m. UTC | #83
On Wed, Aug 22, 2018 at 09:06:57AM +1000, Benjamin Herrenschmidt wrote:
> It can be probably done by a simple test & skip as you go down
> restoring state, then handling the removals after the dance is
> complete.

I tested on a variety of hardware, and there are mixed results. The spec
captures the crux of the problem with checking PDC (7.5.3.11):

  Note that the in-band presence detect mechanism requires that power be
  applied to an adapter for its presence to be detected. Consequently,
  form factors that require a power controller for hot-plug must implement
  a physical pin presence detect mechanism.

Many slots don't implement power controllers, so a secondary bus reset
always triggers a PDC. We can't really ignore PDC during fatal error
handling since hot plugs are the types of actions that often trigger
fatal errors.

Does it sound okay to trust PDC anyway? It's no worse than what would
happen currently, and it doesn't affect non-hotplug slots.
Sinan Kaya Aug. 30, 2018, 12:10 a.m. UTC | #84
On 8/29/2018 8:01 PM, Keith Busch wrote:
> On Wed, Aug 22, 2018 at 09:06:57AM +1000, Benjamin Herrenschmidt wrote:
>> It can be probably done by a simple test & skip as you go down
>> restoring state, then handling the removals after the dance is
>> complete.
> 
> I tested on a variety of hardware, and there are mixed results. The spec
> captures the crux of the problem with checking PDC (7.5.3.11):
> 
>    Note that the in-band presence detect mechanism requires that power be
>    applied to an adapter for its presence to be detected. Consequently,
>    form factors that require a power controller for hot-plug must implement
>    a physical pin presence detect mechanism.
> 
> Many slots don't implement power controllers, so a secondary bus reset
> always triggers a PDC. We can't really ignore PDC during fatal error
> handling since hot plugs are the types of actions that often trigger
> fatal errors..
> 
> Does it sound okay to trust PDC anyway? It's no worse than what would
> happen currently, and it doesn't affect non-hotplug slots.
> 

Why do hotplug operations cause a fatal error? The DPC driver is only
monitoring fatal errors today.
Keith Busch Aug. 30, 2018, 12:46 a.m. UTC | #85
On Wed, Aug 29, 2018 at 05:10:53PM -0700, Sinan Kaya wrote:
> Why does hotplug operations cause a fatal error? DPC driver is only monitoring
> fatal errors today.

Oh, not considering DPC capabilities; just AER. A hotplug capable
switch only suppresses Surprise Link Down, but it's possible other link
or transaction errors occur along with the event.
Benjamin Herrenschmidt Aug. 30, 2018, 4:26 a.m. UTC | #86
On Wed, 2018-08-29 at 18:01 -0600, Keith Busch wrote:
> On Wed, Aug 22, 2018 at 09:06:57AM +1000, Benjamin Herrenschmidt wrote:
> > It can be probably done by a simple test & skip as you go down
> > restoring state, then handling the removals after the dance is
> > complete.
> 
> I tested on a variety of hardware, and there are mixed results. The spec
> captures the crux of the problem with checking PDC (7.5.3.11):
> 
>   Note that the in-band presence detect mechanism requires that power be
>   applied to an adapter for its presence to be detected. Consequently,
>   form factors that require a power controller for hot-plug must implement
>   a physical pin presence detect mechanism.
> 
> Many slots don't implement power controllers, so a secondary bus reset
> always triggers a PDC. We can't really ignore PDC during fatal error
> handling since hot plugs are the types of actions that often trigger
> fatal errors..
> 
> Does it sound okay to trust PDC anyway? It's no worse than what would
> happen currently, and it doesn't affect non-hotplug slots.

I think so.

As you say, it's not worse, and worst case we try the recovery and
fail, which can then lead to an unplug if we wish to do so. No biggie.

Cheers,
Ben,
diff mbox series

Patch

diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index f02e334..3414445 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -287,15 +287,20 @@  void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
 	struct pci_bus *parent;
 	struct pci_dev *pdev, *temp;
 	pci_ers_result_t result;
+	bool is_bridge_device = false;
+	u16 domain = pci_domain_nr(dev->bus);
+	u8 bus = dev->bus->number;
+	u8 devfn = dev->devfn;
 
-	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+		is_bridge_device = true;
 		udev = dev;
-	else
+	} else {
 		udev = dev->bus->self;
+	}
 
 	parent = udev->subordinate;
 	pci_lock_rescan_remove();
-	pci_dev_get(dev);
 	list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
 					 bus_list) {
 		pci_dev_get(pdev);
@@ -309,27 +314,35 @@  void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
 
 	result = reset_link(udev, service);
 
-	if ((service == PCIE_PORT_SERVICE_AER) &&
-	    (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
+	if (service == PCIE_PORT_SERVICE_AER && is_bridge_device) {
 		/*
 		 * If the error is reported by a bridge, we think this error
 		 * is related to the downstream link of the bridge, so we
 		 * do error recovery on all subordinates of the bridge instead
 		 * of the bridge and clear the error status of the bridge.
 		 */
-		pci_cleanup_aer_uncorrect_error_status(dev);
+		pci_cleanup_aer_uncorrect_error_status(udev);
 	}
 
 	if (result == PCI_ERS_RESULT_RECOVERED) {
 		if (pcie_wait_for_link(udev, true))
 			pci_rescan_bus(udev->bus);
-		pci_info(dev, "Device recovery from fatal error successful\n");
+		/* find the pci_dev after rescanning the bus */
+		dev = pci_get_domain_bus_and_slot(domain, bus, devfn);
+		if (dev)
+			pci_info(dev, "Device recovery from fatal error successful\n");
+		else
+			pr_err("AER: Can not find pci_dev for %04x:%02x:%02x.%x\n",
+			       domain, bus,
+			       PCI_SLOT(devfn), PCI_FUNC(devfn));
+		pci_dev_put(dev);
 	} else {
-		pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
-		pci_info(dev, "Device recovery from fatal error failed\n");
+		if (is_bridge_device)
+			pci_uevent_ers(udev, PCI_ERS_RESULT_DISCONNECT);
+		pr_err("AER: Device %04x:%02x:%02x.%x recovery from fatal error failed\n",
+		       domain, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
 	}
 
-	pci_dev_put(dev);
 	pci_unlock_rescan_remove();
 }