[v13,6/6] PCI/DPC: Do not do recovery for hotplug enabled system

Message ID 1523284914-2037-7-git-send-email-poza@codeaurora.org
State New
Delegated to: Bjorn Helgaas
Headers show
Series
  • Address error and recovery for AER and DPC
Related show

Commit Message

Oza Pawandeep April 9, 2018, 2:41 p.m.
DPC and AER should attempt recovery in the same way, except the
cases where system is with hotplug enabled.

Signed-off-by: Oza Pawandeep <poza@codeaurora.org>

Comments

Bjorn Helgaas April 10, 2018, 9:03 p.m. | #1
On Mon, Apr 09, 2018 at 10:41:54AM -0400, Oza Pawandeep wrote:
> DPC and AER should attempt recovery in the same way, except the
> cases where system is with hotplug enabled.

What's the connection with hotplug?  I see from the patch that for
hotplug bridges you remove the tree below the bridge, and otherwise
you just reset the secondary link (I think).

The changelog should explain why we need the difference.

I'm a little skeptical to begin with, because I'm not sure why we
should handle a DPC event differently just because a bridge has the
*capability* of hotplug.  Even if a hotplug bridge reports a DPC
event, that doesn't necessarily mean a hotplug has occurred.

> Signed-off-by: Oza Pawandeep <poza@codeaurora.org>
> 
> diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c
> index 8e1553b..6d9a841 100644
> --- a/drivers/pci/pcie/pcie-dpc.c
> +++ b/drivers/pci/pcie/pcie-dpc.c
> @@ -108,8 +108,6 @@ static void dpc_wait_link_inactive(struct dpc_dev *dpc)
>   */
>  static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
>  {
> -	struct pci_bus *parent = pdev->subordinate;
> -	struct pci_dev *dev, *temp;
>  	struct dpc_dev *dpc;
>  	struct pcie_device *pciedev;
>  	struct device *devdpc;
> @@ -120,19 +118,6 @@ static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
>  	dpc = get_service_data(pciedev);
>  	cap = dpc->cap_pos;
>  
> -	pci_lock_rescan_remove();
> -	list_for_each_entry_safe_reverse(dev, temp, &parent->devices,
> -					 bus_list) {
> -		pci_dev_get(dev);
> -		pci_dev_set_disconnected(dev, NULL);
> -		if (pci_has_subordinate(dev))
> -			pci_walk_bus(dev->subordinate,
> -				     pci_dev_set_disconnected, NULL);
> -		pci_stop_and_remove_bus_device(dev);
> -		pci_dev_put(dev);
> -	}
> -	pci_unlock_rescan_remove();
> -
>  	dpc_wait_link_inactive(dpc);
>  	if (dpc->rp_extensions && dpc_wait_rp_inactive(dpc))
>  		return PCI_ERS_RESULT_DISCONNECT;
> @@ -152,13 +137,37 @@ static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
>  	return PCI_ERS_RESULT_RECOVERED;
>  }
>  
> +static void dpc_reset_link_remove_dev(struct pci_dev *pdev)
> +{
> +	struct pci_bus *parent = pdev->subordinate;
> +	struct pci_dev *dev, *temp;
> +
> +	pci_lock_rescan_remove();
> +	list_for_each_entry_safe_reverse(dev, temp, &parent->devices,
> +					 bus_list) {
> +		pci_dev_get(dev);
> +		pci_dev_set_disconnected(dev, NULL);
> +		if (pci_has_subordinate(dev))
> +			pci_walk_bus(dev->subordinate,
> +				     pci_dev_set_disconnected, NULL);
> +		pci_stop_and_remove_bus_device(dev);
> +		pci_dev_put(dev);
> +	}
> +	pci_unlock_rescan_remove();
> +
> +	dpc_reset_link(pdev);
> +}
> +
>  static void dpc_work(struct work_struct *work)
>  {
>  	struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
>  	struct pci_dev *pdev = dpc->dev->port;
>  
>  	/* From DPC point of view error is always FATAL. */
> -	pcie_do_recovery(pdev, DPC_FATAL);
> +	if (!pdev->is_hotplug_bridge)
> +		pcie_do_recovery(pdev, DPC_FATAL);
> +	else
> +		dpc_reset_link_remove_dev(pdev);
>  }
>  static void dpc_process_rp_pio_error(struct dpc_dev *dpc)
>  {
> -- 
> 2.7.4
>
Sinan Kaya April 12, 2018, 1:41 a.m. | #2
On 4/10/2018 5:03 PM, Bjorn Helgaas wrote:
>> DPC and AER should attempt recovery in the same way, except the
>> cases where system is with hotplug enabled.
> What's the connection with hotplug?  I see from the patch that for
> hotplug bridges you remove the tree below the bridge, and otherwise
> you just reset the secondary link (I think).
> 
> The changelog should explain why we need the difference.
> 
> I'm a little skeptical to begin with, because I'm not sure why we
> should handle a DPC event differently just because a bridge has the
> *capability* of hotplug.  Even if a hotplug bridge reports a DPC
> event, that doesn't necessarily mean a hotplug has occurred.
> 

Let's do a recap on what we have discussed about this until now.

There are two conflicting error recovery mechanisms for PCIe. 

If a system supports both hotplug and DPC, endpoint can be removed and inserted safely.
DPC driver shuts down the driver on link down. When link comes back up,
hotplug driver takes over and initiates an enumeration process. 
Keith mentioned the stop and re-enumerate design was chosen because 
someone could remove a drive and insert an unrelated drive back to the
system.  We can't really save and restore state as we do in the AER path. 

Now, let's assume a system without hotplug capability.
Second mechanism is to go through DPC/AER path and do an automatic link
down recovery via DPC retrain/secondary bus reset including register save and
restore.  Second mechanism is more suitable for handling "surprise link down"
event. The goal is to retrain the link and continue driver operation. 

The goal of this patch to separate these two cases from each other as
the DPC driver needs to work on both contexts. Current DPC code doesn't handle
the second use case.
Bjorn Helgaas April 12, 2018, 2:06 p.m. | #3
[+cc Alex because of his interest in device reset]

For context, here's the part of the patch we're discussing:

> >>  static void dpc_work(struct work_struct *work)
> >>  {
> >>         struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
> >>         struct pci_dev *pdev = dpc->dev->port;
> >> 
> >>         /* From DPC point of view error is always FATAL. */
> >> -       pcie_do_recovery(pdev, DPC_FATAL);
> >> +       if (!pdev->is_hotplug_bridge)
> >> +               pcie_do_recovery(pdev, DPC_FATAL);
> >> +       else
> >> +               dpc_reset_link_remove_dev(pdev);
> >>  }

This is at the point where DPC has been triggered in the hardware and
the DPC driver is starting recovery, and I'm wondering why we need to
handle the "!pdev->is_hotplug_bridge" case differently.

On Wed, Apr 11, 2018 at 09:41:56PM -0400, Sinan Kaya wrote:
> On 4/10/2018 5:03 PM, Bjorn Helgaas wrote:
> >> DPC and AER should attempt recovery in the same way, except the
> >> cases where system is with hotplug enabled.
> > What's the connection with hotplug?  I see from the patch that for
> > hotplug bridges you remove the tree below the bridge, and otherwise
> > you just reset the secondary link (I think).
> > 
> > The changelog should explain why we need the difference.
> > 
> > I'm a little skeptical to begin with, because I'm not sure why we
> > should handle a DPC event differently just because a bridge has the
> > *capability* of hotplug.  Even if a hotplug bridge reports a DPC
> > event, that doesn't necessarily mean a hotplug has occurred.
> 
> Let's do a recap on what we have discussed about this until now.
> 
> There are two conflicting error recovery mechanisms for PCIe. 
> 
> If a system supports both hotplug and DPC, endpoint can be removed
> and inserted safely.  DPC driver shuts down the driver on link down.
> When link comes back up, hotplug driver takes over and initiates an
> enumeration process.  Keith mentioned the stop and re-enumerate
> design was chosen because someone could remove a drive and insert an
> unrelated drive back to the system.  We can't really save and
> restore state as we do in the AER path. 
> 
> Now, let's assume a system without hotplug capability.  Second
> mechanism is to go through DPC/AER path and do an automatic link
> down recovery via DPC retrain/secondary bus reset including register
> save and restore.  Second mechanism is more suitable for handling
> "surprise link down" event. The goal is to retrain the link and
> continue driver operation. 
> 
> The goal of this patch to separate these two cases from each other
> as the DPC driver needs to work on both contexts. Current DPC code
> doesn't handle the second use case.

I think the scenario you are describing is two systems that are
identical except that in the first, the endpoint is below a hotplug
bridge, while in the second, it's below a non-hotplug bridge.  There's
no physical hotplug (no drive removed or inserted), and DPC is
triggered in both systems.

I suggest that DPC should be handled identically in both systems:

  - The PCI core should have the same view of the endpoint: it should
    be removed and re-added in both cases (or in neither case).

  - The endpoint itself should not be able to tell the difference: it
    should see a link down event, followed by a link retrain, followed
    by the same sequence of config accesses, etc.

  - The endpoint driver should not be able to tell the difference,
    i.e., we should be calling the same pci_error_handlers callbacks
    in both cases.

It's true that in the non-hotplug system, pciehp probably won't start
re-enumeration, so we might need an alternate path to trigger that.

But that's not what we're doing in this patch.  In this patch we're
adding a much bigger difference: for hotplug bridges, we stop and
remove the hierarchy below the bridge; for non-hotplug bridges, we do
the AER-style flow of calling pci_error_handlers callbacks.

Bjorn
Sinan Kaya April 12, 2018, 2:34 p.m. | #4
On 4/12/2018 10:06 AM, Bjorn Helgaas wrote:
> [+cc Alex because of his interest in device reset]
> 
> For context, here's the part of the patch we're discussing:
> 
>>>>  static void dpc_work(struct work_struct *work)
>>>>  {
>>>>         struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
>>>>         struct pci_dev *pdev = dpc->dev->port;
>>>>
>>>>         /* From DPC point of view error is always FATAL. */
>>>> -       pcie_do_recovery(pdev, DPC_FATAL);
>>>> +       if (!pdev->is_hotplug_bridge)
>>>> +               pcie_do_recovery(pdev, DPC_FATAL);
>>>> +       else
>>>> +               dpc_reset_link_remove_dev(pdev);
>>>>  }
> 
> This is at the point where DPC has been triggered in the hardware and
> the DPC driver is starting recovery, and I'm wondering why we need to
> handle the "!pdev->is_hotplug_bridge" case differently.
> 
> On Wed, Apr 11, 2018 at 09:41:56PM -0400, Sinan Kaya wrote:
>> On 4/10/2018 5:03 PM, Bjorn Helgaas wrote:
>>>> DPC and AER should attempt recovery in the same way, except the
>>>> cases where system is with hotplug enabled.
>>> What's the connection with hotplug?  I see from the patch that for
>>> hotplug bridges you remove the tree below the bridge, and otherwise
>>> you just reset the secondary link (I think).
>>>
>>> The changelog should explain why we need the difference.
>>>
>>> I'm a little skeptical to begin with, because I'm not sure why we
>>> should handle a DPC event differently just because a bridge has the
>>> *capability* of hotplug.  Even if a hotplug bridge reports a DPC
>>> event, that doesn't necessarily mean a hotplug has occurred.
>>
>> Let's do a recap on what we have discussed about this until now.
>>
>> There are two conflicting error recovery mechanisms for PCIe. 
>>
>> If a system supports both hotplug and DPC, endpoint can be removed
>> and inserted safely.  DPC driver shuts down the driver on link down.
>> When link comes back up, hotplug driver takes over and initiates an
>> enumeration process.  Keith mentioned the stop and re-enumerate
>> design was chosen because someone could remove a drive and insert an
>> unrelated drive back to the system.  We can't really save and
>> restore state as we do in the AER path. 
>>
>> Now, let's assume a system without hotplug capability.  Second
>> mechanism is to go through DPC/AER path and do an automatic link
>> down recovery via DPC retrain/secondary bus reset including register
>> save and restore.  Second mechanism is more suitable for handling
>> "surprise link down" event. The goal is to retrain the link and
>> continue driver operation. 
>>
>> The goal of this patch to separate these two cases from each other
>> as the DPC driver needs to work on both contexts. Current DPC code
>> doesn't handle the second use case.
> 
> I think the scenario you are describing is two systems that are
> identical except that in the first, the endpoint is below a hotplug
> bridge, while in the second, it's below a non-hotplug bridge.  There's
> no physical hotplug (no drive removed or inserted), and DPC is
> triggered in both systems.
> 
> I suggest that DPC should be handled identically in both systems:
> 
>   - The PCI core should have the same view of the endpoint: it should
>     be removed and re-added in both cases (or in neither case).
> 
>   - The endpoint itself should not be able to tell the difference: it
>     should see a link down event, followed by a link retrain, followed
>     by the same sequence of config accesses, etc.
> 
>   - The endpoint driver should not be able to tell the difference,
>     i.e., we should be calling the same pci_error_handlers callbacks
>     in both cases.
> 
> It's true that in the non-hotplug system, pciehp probably won't start
> re-enumeration, so we might need an alternate path to trigger that.
> 
> But that's not what we're doing in this patch.  In this patch we're
> adding a much bigger difference: for hotplug bridges, we stop and
> remove the hierarchy below the bridge; for non-hotplug bridges, we do
> the AER-style flow of calling pci_error_handlers callbacks.

Our approach on V12 was to go to AER style recovery for all DPC events
regardless of hotplug support or not. 

Keith was not comfortable with this approach. That's why, we special cased
hotplug.

If we drop 6/6 on this patch on v13, we achieve this. We still have to
take care of Keith's inputs on individual patches.

we have been struggling with the direction for a while.

Keith, what do you think?

> 
> Bjorn
>
Keith Busch April 12, 2018, 2:39 p.m. | #5
On Thu, Apr 12, 2018 at 10:34:37AM -0400, Sinan Kaya wrote:
> On 4/12/2018 10:06 AM, Bjorn Helgaas wrote:
> > 
> > I think the scenario you are describing is two systems that are
> > identical except that in the first, the endpoint is below a hotplug
> > bridge, while in the second, it's below a non-hotplug bridge.  There's
> > no physical hotplug (no drive removed or inserted), and DPC is
> > triggered in both systems.
> > 
> > I suggest that DPC should be handled identically in both systems:
> > 
> >   - The PCI core should have the same view of the endpoint: it should
> >     be removed and re-added in both cases (or in neither case).
> > 
> >   - The endpoint itself should not be able to tell the difference: it
> >     should see a link down event, followed by a link retrain, followed
> >     by the same sequence of config accesses, etc.
> > 
> >   - The endpoint driver should not be able to tell the difference,
> >     i.e., we should be calling the same pci_error_handlers callbacks
> >     in both cases.
> > 
> > It's true that in the non-hotplug system, pciehp probably won't start
> > re-enumeration, so we might need an alternate path to trigger that.
> > 
> > But that's not what we're doing in this patch.  In this patch we're
> > adding a much bigger difference: for hotplug bridges, we stop and
> > remove the hierarchy below the bridge; for non-hotplug bridges, we do
> > the AER-style flow of calling pci_error_handlers callbacks.
> 
> Our approach on V12 was to go to AER style recovery for all DPC events
> regardless of hotplug support or not. 
> 
> Keith was not comfortable with this approach. That's why, we special cased
> hotplug.
> 
> If we drop 6/6 on this patch on v13, we achieve this. We still have to
> take care of Keith's inputs on individual patches.
> 
> we have been struggling with the direction for a while.
> 
> Keith, what do you think?

My only concern was for existing production environments that use DPC
for handling surprise removal, and I don't wish to break the existing
uses.
Keith Busch April 12, 2018, 3:02 p.m. | #6
On Thu, Apr 12, 2018 at 08:39:54AM -0600, Keith Busch wrote:
> On Thu, Apr 12, 2018 at 10:34:37AM -0400, Sinan Kaya wrote:
> > On 4/12/2018 10:06 AM, Bjorn Helgaas wrote:
> > > 
> > > I think the scenario you are describing is two systems that are
> > > identical except that in the first, the endpoint is below a hotplug
> > > bridge, while in the second, it's below a non-hotplug bridge.  There's
> > > no physical hotplug (no drive removed or inserted), and DPC is
> > > triggered in both systems.
> > > 
> > > I suggest that DPC should be handled identically in both systems:
> > > 
> > >   - The PCI core should have the same view of the endpoint: it should
> > >     be removed and re-added in both cases (or in neither case).
> > > 
> > >   - The endpoint itself should not be able to tell the difference: it
> > >     should see a link down event, followed by a link retrain, followed
> > >     by the same sequence of config accesses, etc.
> > > 
> > >   - The endpoint driver should not be able to tell the difference,
> > >     i.e., we should be calling the same pci_error_handlers callbacks
> > >     in both cases.
> > > 
> > > It's true that in the non-hotplug system, pciehp probably won't start
> > > re-enumeration, so we might need an alternate path to trigger that.
> > > 
> > > But that's not what we're doing in this patch.  In this patch we're
> > > adding a much bigger difference: for hotplug bridges, we stop and
> > > remove the hierarchy below the bridge; for non-hotplug bridges, we do
> > > the AER-style flow of calling pci_error_handlers callbacks.
> > 
> > Our approach on V12 was to go to AER style recovery for all DPC events
> > regardless of hotplug support or not. 
> > 
> > Keith was not comfortable with this approach. That's why, we special cased
> > hotplug.
> > 
> > If we drop 6/6 on this patch on v13, we achieve this. We still have to
> > take care of Keith's inputs on individual patches.
> > 
> > we have been struggling with the direction for a while.
> > 
> > Keith, what do you think?
> 
> My only concern was for existing production environments that use DPC
> for handling surprise removal, and I don't wish to break the existing
> uses.

Also, I thought the plan was to keep hotplug and non-hotplug the same,
except for the very end: if not a hotplug bridge, initiate the rescan
automatically after releasing from containment, otherwise let pciehp
handle it when the link reactivates.
Sinan Kaya April 12, 2018, 4:27 p.m. | #7
On 4/12/2018 11:02 AM, Keith Busch wrote:
> On Thu, Apr 12, 2018 at 08:39:54AM -0600, Keith Busch wrote:
>> On Thu, Apr 12, 2018 at 10:34:37AM -0400, Sinan Kaya wrote:
>>> On 4/12/2018 10:06 AM, Bjorn Helgaas wrote:
>>>>
>>>> I think the scenario you are describing is two systems that are
>>>> identical except that in the first, the endpoint is below a hotplug
>>>> bridge, while in the second, it's below a non-hotplug bridge.  There's
>>>> no physical hotplug (no drive removed or inserted), and DPC is
>>>> triggered in both systems.
>>>>
>>>> I suggest that DPC should be handled identically in both systems:
>>>>
>>>>   - The PCI core should have the same view of the endpoint: it should
>>>>     be removed and re-added in both cases (or in neither case).
>>>>
>>>>   - The endpoint itself should not be able to tell the difference: it
>>>>     should see a link down event, followed by a link retrain, followed
>>>>     by the same sequence of config accesses, etc.
>>>>
>>>>   - The endpoint driver should not be able to tell the difference,
>>>>     i.e., we should be calling the same pci_error_handlers callbacks
>>>>     in both cases.
>>>>
>>>> It's true that in the non-hotplug system, pciehp probably won't start
>>>> re-enumeration, so we might need an alternate path to trigger that.
>>>>
>>>> But that's not what we're doing in this patch.  In this patch we're
>>>> adding a much bigger difference: for hotplug bridges, we stop and
>>>> remove the hierarchy below the bridge; for non-hotplug bridges, we do
>>>> the AER-style flow of calling pci_error_handlers callbacks.
>>>
>>> Our approach on V12 was to go to AER style recovery for all DPC events
>>> regardless of hotplug support or not. 
>>>
>>> Keith was not comfortable with this approach. That's why, we special cased
>>> hotplug.
>>>
>>> If we drop 6/6 on this patch on v13, we achieve this. We still have to
>>> take care of Keith's inputs on individual patches.
>>>
>>> we have been struggling with the direction for a while.
>>>
>>> Keith, what do you think?
>>
>> My only concern was for existing production environments that use DPC
>> for handling surprise removal, and I don't wish to break the existing
>> uses.
> 
> Also, I thought the plan was to keep hotplug and non-hotplug the same,
> except for the very end: if not a hotplug bridge, initiate the rescan
> automatically after releasing from containment, otherwise let pciehp
> handle it when the link reactivates.
> 

Hmm...

AER driver doesn't do stop and rescan approach for fatal errors. AER driver
makes an error callback followed by secondary bus reset and finally driver
the resume callback on the endpoint only if link recovery is successful.
Otherwise, AER driver bails out with recovery unsuccessful message.

Why do we need an additional rescan in the DPC driver if the link is up
and driver resumes operation?

If hotplug is supported and somebody removed the device, link won't come up.
The AER error recovery sequence will fail after timeout.

When the drive is inserted, hotplug driver observes a link up interrupt,
Hotplug driver does a rescan. Drive is functional one more time. 

This should satisfy both use cases, right?
Keith Busch April 12, 2018, 5:09 p.m. | #8
On Thu, Apr 12, 2018 at 12:27:20PM -0400, Sinan Kaya wrote:
> On 4/12/2018 11:02 AM, Keith Busch wrote:
> > 
> > Also, I thought the plan was to keep hotplug and non-hotplug the same,
> > except for the very end: if not a hotplug bridge, initiate the rescan
> > automatically after releasing from containment, otherwise let pciehp
> > handle it when the link reactivates.
> > 
> 
> Hmm...
> 
> AER driver doesn't do stop and rescan approach for fatal errors. AER driver
> makes an error callback followed by secondary bus reset and finally driver
> the resume callback on the endpoint only if link recovery is successful.
> Otherwise, AER driver bails out with recovery unsuccessful message.

I'm not sure if that's necessarily true. People have reported AER
handling triggers PCIe hotplug events, and creates some interesting race
conditions:

https://marc.info/?l=linux-pci&m=152336615707640&w=2

https://www.spinics.net/lists/linux-pci/msg70614.html

> Why do we need an additional rescan in the DPC driver if the link is up
> and driver resumes operation?

I thought the plan was to have DPC always go through the removal path
to ensure all devices are properly configured when containment is
released. In order to reconfigure those, you'll need to initiate the
rescan from somewhere.
Sinan Kaya April 12, 2018, 5:41 p.m. | #9
On 4/12/2018 1:09 PM, Keith Busch wrote:
> On Thu, Apr 12, 2018 at 12:27:20PM -0400, Sinan Kaya wrote:
>> On 4/12/2018 11:02 AM, Keith Busch wrote:
>>>
>>> Also, I thought the plan was to keep hotplug and non-hotplug the same,
>>> except for the very end: if not a hotplug bridge, initiate the rescan
>>> automatically after releasing from containment, otherwise let pciehp
>>> handle it when the link reactivates.
>>>
>>
>> Hmm...
>>
>> AER driver doesn't do stop and rescan approach for fatal errors. AER driver
>> makes an error callback followed by secondary bus reset and finally driver
>> the resume callback on the endpoint only if link recovery is successful.
>> Otherwise, AER driver bails out with recovery unsuccessful message.
> 
> I'm not sure if that's necessarily true. People have reported AER
> handling triggers PCIe hotplug events, and creates some interesting race
> conditions:

By reading the code, I don't see a stop and rescan in the AER error recovery
path.

As both logs indicate, stop and rescan is initiated in response to link down
and link up interrupts triggered by the secondary bus reset. 
The SW entity handling these is not AER driver. It is the hotplug driver
running asynchronous to the AER driver.

AER driver should have tried a slot reset before attempting to do a secondary
bus reset.

/**
 * pci_reset_slot - reset a PCI slot
 * @slot: PCI slot to reset
 *
 * A PCI bus may host multiple slots, each slot may support a reset mechanism
 * independent of other slots.  For instance, some slots may support slot power
 * control.  In the case of a 1:1 bus to slot architecture, this function may
 * wrap the bus reset to avoid spurious slot related events such as hotplug.
 * Generally a slot reset should be attempted before a bus reset.  All of the
 * function of the slot and any subordinate buses behind the slot are reset
 * through this function.  PCI config space of all devices in the slot and
 * behind the slot is saved before and restored after reset.
 *
 * Return 0 on success, non-zero on error.
 */
int pci_reset_slot(struct pci_slot *slot)

Slot reset is there to mask hotplug interrupts before the reset and unmask them
after reset.

> 
> https://marc.info/?l=linux-pci&m=152336615707640&w=2
> 
> https://www.spinics.net/lists/linux-pci/msg70614.html
> 
>> Why do we need an additional rescan in the DPC driver if the link is up
>> and driver resumes operation?
> 
> I thought the plan was to have DPC always go through the removal path
> to ensure all devices are properly configured when containment is
> released. In order to reconfigure those, you'll need to initiate the
> rescan from somewhere.
> 

This is where the contradiction is. 

Bjorn is asking for a unified error handling for both AER and DPC.

Current AER error recovery framework is error callback + secondary
bus reset + resume callback.

How does this stop + rescan model fit?

Do we want to change the error recovery framework? I suppose this will 
become a bigger conversation as there are more customers of this.
Sinan Kaya April 14, 2018, 3:53 p.m. | #10
Hi Keith, Bjorn;

On 4/12/2018 1:41 PM, Sinan Kaya wrote:
> On 4/12/2018 1:09 PM, Keith Busch wrote:
>> On Thu, Apr 12, 2018 at 12:27:20PM -0400, Sinan Kaya wrote:
>>> On 4/12/2018 11:02 AM, Keith Busch wrote:
>>>>
>>>> Also, I thought the plan was to keep hotplug and non-hotplug the same,
>>>> except for the very end: if not a hotplug bridge, initiate the rescan
>>>> automatically after releasing from containment, otherwise let pciehp
>>>> handle it when the link reactivates.
>>>>
>>>
>>> Hmm...
>>>
>>> AER driver doesn't do stop and rescan approach for fatal errors. AER driver
>>> makes an error callback followed by secondary bus reset and finally driver
>>> the resume callback on the endpoint only if link recovery is successful.
>>> Otherwise, AER driver bails out with recovery unsuccessful message.
>>
>> I'm not sure if that's necessarily true. People have reported AER
>> handling triggers PCIe hotplug events, and creates some interesting race
>> conditions:
> 
> By reading the code, I don't see a stop and rescan in the AER error recovery
> path.
> 
> As both logs indicate, stop and rescan is initiated in response to link down
> and link up interrupts triggered by the secondary bus reset. 
> The SW entity handling these is not AER driver. It is the hotplug driver
> running asynchronous to the AER driver.
> 
> AER driver should have tried a slot reset before attempting to do a secondary
> bus reset.
> 
> /**
>  * pci_reset_slot - reset a PCI slot
>  * @slot: PCI slot to reset
>  *
>  * A PCI bus may host multiple slots, each slot may support a reset mechanism
>  * independent of other slots.  For instance, some slots may support slot power
>  * control.  In the case of a 1:1 bus to slot architecture, this function may
>  * wrap the bus reset to avoid spurious slot related events such as hotplug.
>  * Generally a slot reset should be attempted before a bus reset.  All of the
>  * function of the slot and any subordinate buses behind the slot are reset
>  * through this function.  PCI config space of all devices in the slot and
>  * behind the slot is saved before and restored after reset.
>  *
>  * Return 0 on success, non-zero on error.
>  */
> int pci_reset_slot(struct pci_slot *slot)
> 
> Slot reset is there to mask hotplug interrupts before the reset and unmask them
> after reset.
> 
>>
>> https://marc.info/?l=linux-pci&m=152336615707640&w=2
>>
>> https://www.spinics.net/lists/linux-pci/msg70614.html
>>
>>> Why do we need an additional rescan in the DPC driver if the link is up
>>> and driver resumes operation?
>>
>> I thought the plan was to have DPC always go through the removal path
>> to ensure all devices are properly configured when containment is
>> released. In order to reconfigure those, you'll need to initiate the
>> rescan from somewhere.
>>
> 
> This is where the contradiction is. 
> 
> Bjorn is asking for a unified error handling for both AER and DPC.
> 
> Current AER error recovery framework is error callback + secondary
> bus reset + resume callback.
> 
> How does this stop + rescan model fit?
> 
> Do we want to change the error recovery framework? I suppose this will 
> become a bigger conversation as there are more customers of this.
> 

I also want to highlight that the PCI Error recovery sequence is well
documented here.

https://www.kernel.org/doc/Documentation/PCI/pci-error-recovery.txt

We don't really have to guess what Linux does. 

IMO, the hotplug issues Keith is seeing are orthogonal and needs to be
addressed independent of this series by following the pci slot reset
procedure.

Hotplug driver handles link up/down events due to insertion/removal.
Hotplug driver is expected to do the re-enumeration.

I don't understand why we need to do another re-enumeration if system
observes a PCIe error handled by the AER/DPC driver. 

These two are independent events.

PCIe error recovery framework does the reset callback + SBR + resume
behavior today.

Bjorn,

You indicated that you want to unify the AER and DPC behavior. Let's
settle on what we want to do one more time. We have been going forth
and back on the direction.

We are on V13. I hope we won't hit V20 :)

Sinan
Bjorn Helgaas April 16, 2018, 3:17 a.m. | #11
On Sat, Apr 14, 2018 at 11:53:17AM -0400, Sinan Kaya wrote:

> You indicated that you want to unify the AER and DPC behavior. Let's
> settle on what we want to do one more time. We have been going forth
> and back on the direction.

My thinking is that as much as possible, similar events should be
handled similarly, whether the mechanism is AER, DPC, EEH, etc.
Ideally, drivers shouldn't have to be aware of which mechanism is in
use.

Error recovery includes conventional PCI as well, but right now I
think we're only concerned with PCIe.  The following error types are
from PCIe r4.0, sec 6.2.2:

  ERR_COR
    Corrected by hardware with no software intervention.  Software
    involved for logging only.

    Handled by AER via pci_error_handlers; DPC is never involved.

    Link is unaffected.

  ERR_NONFATAL
    A transaction is unreliable but the link is fully functional.

    If DPC is not supported, handled by AER via pci_error_handlers and
    the link is unaffected.

    If DPC supported, handled by DPC (because we set
    PCI_EXP_DPC_CTL_EN_NONFATAL) via remove/re-enumerate.

  ERR_FATAL
    The link is unreliable.

    If DPC is not supported, handled by AER via pci_error_handlers and
    the link is reset.

    If DPC supported, handled by DPC via remove/re-enumerate.

It doesn't seem right to me that we handle both ERR_NONFATAL and
ERR_FATAL events differently if we happen to have DPC support in a
switch.

Maybe we should consider triggering DPC only on ERR_FATAL?  That would
keep DPC out of the ERR_NONFATAL cases.

For ERR_FATAL, maybe we should bite the bullet and use
remove/re-enumerate for AER as well as for DPC.  That would be painful
for higher-level software, but if we're willing to accept that pain
for new systems that support DPC, maybe life would be better overall
if it worked the same way on systems without DPC?

Bjorn
Oza Pawandeep April 16, 2018, 5:33 a.m. | #12
On 2018-04-16 08:47, Bjorn Helgaas wrote:
> On Sat, Apr 14, 2018 at 11:53:17AM -0400, Sinan Kaya wrote:
> 
>> You indicated that you want to unify the AER and DPC behavior. Let's
>> settle on what we want to do one more time. We have been going forth
>> and back on the direction.
> 
> My thinking is that as much as possible, similar events should be
> handled similarly, whether the mechanism is AER, DPC, EEH, etc.
> Ideally, drivers shouldn't have to be aware of which mechanism is in
> use.
> 
> Error recovery includes conventional PCI as well, but right now I
> think we're only concerned with PCIe.  The following error types are
> from PCIe r4.0, sec 6.2.2:
> 
>   ERR_COR
>     Corrected by hardware with no software intervention.  Software
>     involved for logging only.
> 
>     Handled by AER via pci_error_handlers; DPC is never involved.
> 
>     Link is unaffected.
> 
>   ERR_NONFATAL
>     A transaction is unreliable but the link is fully functional.
> 
>     If DPC is not supported, handled by AER via pci_error_handlers and
>     the link is unaffected.
> 
>     If DPC supported, handled by DPC (because we set
>     PCI_EXP_DPC_CTL_EN_NONFATAL) via remove/re-enumerate.
> 
>   ERR_FATAL
>     The link is unreliable.
> 
>     If DPC is not supported, handled by AER via pci_error_handlers and
>     the link is reset.
> 
>     If DPC supported, handled by DPC via remove/re-enumerate.
> 
> It doesn't seem right to me that we handle both ERR_NONFATAL and
> ERR_FATAL events differently if we happen to have DPC support in a
> switch.
> 
> Maybe we should consider triggering DPC only on ERR_FATAL?  That would
> keep DPC out of the ERR_NONFATAL cases.
> 
> For ERR_FATAL, maybe we should bite the bullet and use
> remove/re-enumerate for AER as well as for DPC.  That would be painful
> for higher-level software, but if we're willing to accept that pain
> for new systems that support DPC, maybe life would be better overall
> if it worked the same way on systems without DPC?
> 
> Bjorn

This had crossed my mind when I first looked at the code.
DPC is getting triggered for both ERR_NONFATAL and ERR_FATAL case.
I thought the primary purpose of DPC to recover fatal errors, by 
triggering HW recovery.
but what if some platform wants to handle both FATAL and NON_FATAL with 
DPC ?

As you said AER FATAL cases and DPC FATAL cases should be handled 
similarly.
e.g. remove/re-enumerate the devices.

while NON_FATAL case; only AER would come into picture.
if some platform would like to handle DPC NON_FATAL then it should 
follow AER NON_FATAL path  (where it does not do remove/re-enumerate)

And the case where hotplug is enabled, remove/re-enumerate more sense in 
case of ERR_FATAL.
And the case where hotplug is disabled, only re-enumeration is required. 
(no need to remove the devices)
but then do we need to handle this case specifically, what is the harm 
in removing the devices in all the cases followed by re-enumerate ?

Regards,
Oza.
Oza Pawandeep April 16, 2018, 5:51 a.m. | #13
On 2018-04-16 11:03, poza@codeaurora.org wrote:
> On 2018-04-16 08:47, Bjorn Helgaas wrote:
>> On Sat, Apr 14, 2018 at 11:53:17AM -0400, Sinan Kaya wrote:
>> 
>>> You indicated that you want to unify the AER and DPC behavior. Let's
>>> settle on what we want to do one more time. We have been going forth
>>> and back on the direction.
>> 
>> My thinking is that as much as possible, similar events should be
>> handled similarly, whether the mechanism is AER, DPC, EEH, etc.
>> Ideally, drivers shouldn't have to be aware of which mechanism is in
>> use.
>> 
>> Error recovery includes conventional PCI as well, but right now I
>> think we're only concerned with PCIe.  The following error types are
>> from PCIe r4.0, sec 6.2.2:
>> 
>>   ERR_COR
>>     Corrected by hardware with no software intervention.  Software
>>     involved for logging only.
>> 
>>     Handled by AER via pci_error_handlers; DPC is never involved.
>> 
>>     Link is unaffected.
>> 
>>   ERR_NONFATAL
>>     A transaction is unreliable but the link is fully functional.
>> 
>>     If DPC is not supported, handled by AER via pci_error_handlers and
>>     the link is unaffected.
>> 
>>     If DPC supported, handled by DPC (because we set
>>     PCI_EXP_DPC_CTL_EN_NONFATAL) via remove/re-enumerate.
>> 
>>   ERR_FATAL
>>     The link is unreliable.
>> 
>>     If DPC is not supported, handled by AER via pci_error_handlers and
>>     the link is reset.
>> 
>>     If DPC supported, handled by DPC via remove/re-enumerate.
>> 
>> It doesn't seem right to me that we handle both ERR_NONFATAL and
>> ERR_FATAL events differently if we happen to have DPC support in a
>> switch.
>> 
>> Maybe we should consider triggering DPC only on ERR_FATAL?  That would
>> keep DPC out of the ERR_NONFATAL cases.
>> 
>> For ERR_FATAL, maybe we should bite the bullet and use
>> remove/re-enumerate for AER as well as for DPC.  That would be painful
>> for higher-level software, but if we're willing to accept that pain
>> for new systems that support DPC, maybe life would be better overall
>> if it worked the same way on systems without DPC?
>> 
>> Bjorn
> 
> This had crossed my mind when I first looked at the code.
> DPC is getting triggered for both ERR_NONFATAL and ERR_FATAL case.
> I thought the primary purpose of DPC to recover fatal errors, by
> triggering HW recovery.
> but what if some platform wants to handle both FATAL and NON_FATAL with 
> DPC ?
> 
> As you said AER FATAL cases and DPC FATAL cases should be handled 
> similarly.
> e.g. remove/re-enumerate the devices.
> 
> while NON_FATAL case; only AER would come into picture.
> if some platform would like to handle DPC NON_FATAL then it should
> follow AER NON_FATAL path  (where it does not do remove/re-enumerate)
> 
> And the case where hotplug is enabled, remove/re-enumerate more sense
> in case of ERR_FATAL.
> And the case where hotplug is disabled, only re-enumeration is
> required. (no need to remove the devices)
> but then do we need to handle this case specifically, what is the harm
> in removing the devices in all the cases followed by re-enumerate ?

To Clarify the last line, what I meant here was, in case of ERR_FATAL we 
can always remove/re-enumerate the devices irrespective of hotplug is 
enabled or not.

and in case of ERR_NONFATAL, DPC will follow AER path (where it just 
tries to recover)
although I am not very sure that how to handle ERR_NONFATAL case if 
hotplug is enabled. Because as Keith suggested device might have been 
changed run-time.

> 
> Regards,
> Oza.
Bjorn Helgaas April 16, 2018, 2:01 p.m. | #14
On Mon, Apr 16, 2018 at 11:21:15AM +0530, poza@codeaurora.org wrote:
> On 2018-04-16 11:03, poza@codeaurora.org wrote:
> > On 2018-04-16 08:47, Bjorn Helgaas wrote:
> > > On Sat, Apr 14, 2018 at 11:53:17AM -0400, Sinan Kaya wrote:
> > > 
> > > > You indicated that you want to unify the AER and DPC behavior. Let's
> > > > settle on what we want to do one more time. We have been going forth
> > > > and back on the direction.
> > > 
> > > My thinking is that as much as possible, similar events should be
> > > handled similarly, whether the mechanism is AER, DPC, EEH, etc.
> > > Ideally, drivers shouldn't have to be aware of which mechanism is in
> > > use.
> > > 
> > > Error recovery includes conventional PCI as well, but right now I
> > > think we're only concerned with PCIe.  The following error types are
> > > from PCIe r4.0, sec 6.2.2:
> > > 
> > >   ERR_COR
> > >     Corrected by hardware with no software intervention.  Software
> > >     involved for logging only.
> > > 
> > >     Handled by AER via pci_error_handlers; DPC is never involved.
> > > 
> > >     Link is unaffected.
> > > 
> > >   ERR_NONFATAL
> > >     A transaction is unreliable but the link is fully functional.
> > > 
> > >     If DPC is not supported, handled by AER via pci_error_handlers and
> > >     the link is unaffected.
> > > 
> > >     If DPC supported, handled by DPC (because we set
> > >     PCI_EXP_DPC_CTL_EN_NONFATAL) via remove/re-enumerate.
> > > 
> > >   ERR_FATAL
> > >     The link is unreliable.
> > > 
> > >     If DPC is not supported, handled by AER via pci_error_handlers and
> > >     the link is reset.
> > > 
> > >     If DPC supported, handled by DPC via remove/re-enumerate.
> > > 
> > > It doesn't seem right to me that we handle both ERR_NONFATAL and
> > > ERR_FATAL events differently if we happen to have DPC support in a
> > > switch.
> > > 
> > > Maybe we should consider triggering DPC only on ERR_FATAL?  That would
> > > keep DPC out of the ERR_NONFATAL cases.
> > > 
> > > For ERR_FATAL, maybe we should bite the bullet and use
> > > remove/re-enumerate for AER as well as for DPC.  That would be painful
> > > for higher-level software, but if we're willing to accept that pain
> > > for new systems that support DPC, maybe life would be better overall
> > > if it worked the same way on systems without DPC?
> > > 
> > > Bjorn
> > 
> > This had crossed my mind when I first looked at the code.
> > DPC is getting triggered for both ERR_NONFATAL and ERR_FATAL case.
> > I thought the primary purpose of DPC to recover fatal errors, by
> > triggering HW recovery.
> > but what if some platform wants to handle both FATAL and NON_FATAL
> > with DPC ?

AER usage is coordinated between the platform and the OS via _OSC (PCI
Firmware spec r3.2, sec 4.5.1) My understanding is that if the
platform grants the OS permission to control AER, the OS is free to
set the AER control registers however it wishes.  If the platform
depends on certain AER control settings, it must decline to grant AER
control to the OS.

As far as I know, there's nothing new in _OSC related to DPC, but
based on the implementation note in PCIe r4.0, sec 6.2.10, we
currently assume the OS controls DPC if and only if it controls AER.

Therefore, if a platform depends on certain DPC control settings,
e.g., if it wants to handle both ERR_FATAL and ERR_NONFATAL with DPC,
I would argue that the platform needs to retain control over AER.

If the platform grants AER control to the OS, I think the OS is free
to set both AER and DPC control registers as it desires.

> > As you said AER FATAL cases and DPC FATAL cases should be handled
> > similarly.  e.g. remove/re-enumerate the devices.
> > 
> > while NON_FATAL case; only AER would come into picture.  if some
> > platform would like to handle DPC NON_FATAL then it should follow
> > AER NON_FATAL path  (where it does not do remove/re-enumerate)
> > 
> > And the case where hotplug is enabled, remove/re-enumerate more
> > sense in case of ERR_FATAL.  And the case where hotplug is
> > disabled, only re-enumeration is required. (no need to remove the
> > devices) but then do we need to handle this case specifically,
> > what is the harm in removing the devices in all the cases followed
> > by re-enumerate ?
> 
> To Clarify the last line, what I meant here was, in case of
> ERR_FATAL we can always remove/re-enumerate the devices irrespective
> of hotplug is enabled or not.

ERR_FATAL means the link is unreliable.  A hotplug event may have
occurred.  In this case, I think remove/re-enumerate is a safe option.
It might be more than is needed in some cases, but I'm not sure we can
reliably determine when we don't need it, and if we
remove/re-enumerate for some ERR_FATAL errors but not others, I think
it complicates the situation for drivers and other higher-level
software.

> and in case of ERR_NONFATAL, DPC will follow AER path (where it just
> tries to recover) although I am not very sure that how to handle
> ERR_NONFATAL case if hotplug is enabled. Because as Keith suggested
> device might have been changed run-time.

ERR_NONFATAL means a particular transaction might be unreliable but
the link itself is fully functional.  No hotplug is possible if the
link has stayed fully functional.

Bjorn
Sinan Kaya April 16, 2018, 2:46 p.m. | #15
On 4/15/2018 11:17 PM, Bjorn Helgaas wrote:
> It doesn't seem right to me that we handle both ERR_NONFATAL and
> ERR_FATAL events differently if we happen to have DPC support in a
> switch.
> 
> Maybe we should consider triggering DPC only on ERR_FATAL?  That would
> keep DPC out of the ERR_NONFATAL cases.
> 
From reliability perspective, it makes sense. DPC handles NONFATAL errors
by bringing down the link. If error happened behind a switch and root port
is handling DPC, we are impacting a lot of devices operation because of one
faulty device.

Keith, do you have any preference on this direction?

> For ERR_FATAL, maybe we should bite the bullet and use
> remove/re-enumerate for AER as well as for DPC.  That would be painful
> for higher-level software, but if we're willing to accept that pain
> for new systems that support DPC, maybe life would be better overall
> if it worked the same way on systems without DPC?

Sure, we can go to this route as well.
Oza Pawandeep April 16, 2018, 5:15 p.m. | #16
On 2018-04-16 20:16, Sinan Kaya wrote:
> On 4/15/2018 11:17 PM, Bjorn Helgaas wrote:
>> It doesn't seem right to me that we handle both ERR_NONFATAL and
>> ERR_FATAL events differently if we happen to have DPC support in a
>> switch.
>> 
>> Maybe we should consider triggering DPC only on ERR_FATAL?  That would
>> keep DPC out of the ERR_NONFATAL cases.
>> 
> From reliability perspective, it makes sense. DPC handles NONFATAL 
> errors
> by bringing down the link. If error happened behind a switch and root 
> port
> is handling DPC, we are impacting a lot of devices operation because of 
> one
> faulty device.
> 
> Keith, do you have any preference on this direction?
> 
>> For ERR_FATAL, maybe we should bite the bullet and use
>> remove/re-enumerate for AER as well as for DPC.  That would be painful
>> for higher-level software, but if we're willing to accept that pain
>> for new systems that support DPC, maybe life would be better overall
>> if it worked the same way on systems without DPC?
> 
> Sure, we can go to this route as well.


ok so finally this is what is being proposed and so far Bjorn, Sinan and 
myself agreed on following:

I need to move the stop and re-enumerate code into the AER path instead 
of patch #6 for both DPC_FATAL and AER_FATAL error types.
Also, I should turn off DPC NON_FATAL error detection.

Keith, please confirm if you are okay with above proposal.

Regards,
Oza.

Patch

diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c
index 8e1553b..6d9a841 100644
--- a/drivers/pci/pcie/pcie-dpc.c
+++ b/drivers/pci/pcie/pcie-dpc.c
@@ -108,8 +108,6 @@  static void dpc_wait_link_inactive(struct dpc_dev *dpc)
  */
 static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 {
-	struct pci_bus *parent = pdev->subordinate;
-	struct pci_dev *dev, *temp;
 	struct dpc_dev *dpc;
 	struct pcie_device *pciedev;
 	struct device *devdpc;
@@ -120,19 +118,6 @@  static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 	dpc = get_service_data(pciedev);
 	cap = dpc->cap_pos;
 
-	pci_lock_rescan_remove();
-	list_for_each_entry_safe_reverse(dev, temp, &parent->devices,
-					 bus_list) {
-		pci_dev_get(dev);
-		pci_dev_set_disconnected(dev, NULL);
-		if (pci_has_subordinate(dev))
-			pci_walk_bus(dev->subordinate,
-				     pci_dev_set_disconnected, NULL);
-		pci_stop_and_remove_bus_device(dev);
-		pci_dev_put(dev);
-	}
-	pci_unlock_rescan_remove();
-
 	dpc_wait_link_inactive(dpc);
 	if (dpc->rp_extensions && dpc_wait_rp_inactive(dpc))
 		return PCI_ERS_RESULT_DISCONNECT;
@@ -152,13 +137,37 @@  static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 	return PCI_ERS_RESULT_RECOVERED;
 }
 
+static void dpc_reset_link_remove_dev(struct pci_dev *pdev)
+{
+	struct pci_bus *parent = pdev->subordinate;
+	struct pci_dev *dev, *temp;
+
+	pci_lock_rescan_remove();
+	list_for_each_entry_safe_reverse(dev, temp, &parent->devices,
+					 bus_list) {
+		pci_dev_get(dev);
+		pci_dev_set_disconnected(dev, NULL);
+		if (pci_has_subordinate(dev))
+			pci_walk_bus(dev->subordinate,
+				     pci_dev_set_disconnected, NULL);
+		pci_stop_and_remove_bus_device(dev);
+		pci_dev_put(dev);
+	}
+	pci_unlock_rescan_remove();
+
+	dpc_reset_link(pdev);
+}
+
 static void dpc_work(struct work_struct *work)
 {
 	struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
 	struct pci_dev *pdev = dpc->dev->port;
 
 	/* From DPC point of view error is always FATAL. */
-	pcie_do_recovery(pdev, DPC_FATAL);
+	if (!pdev->is_hotplug_bridge)
+		pcie_do_recovery(pdev, DPC_FATAL);
+	else
+		dpc_reset_link_remove_dev(pdev);
 }
 static void dpc_process_rp_pio_error(struct dpc_dev *dpc)
 {