diff mbox series

[RFC,5/9] PCI/AER: Apply function level reset to RCiEP on fatal error

Message ID 20200724172223.145608-6-sean.v.kelley@intel.com
State New
Headers show
Series Add RCEC handling to PCI/AER | expand

Commit Message

Kelley, Sean V July 24, 2020, 5:22 p.m. UTC
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>

Attempt to do function level reset for an RCiEP associated with an
RCEC device on fatal error.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
---
 drivers/pci/pcie/err.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

Comments

Jonathan Cameron July 27, 2020, 11:17 a.m. UTC | #1
On Fri, 24 Jul 2020 10:22:19 -0700
Sean V Kelley <sean.v.kelley@intel.com> wrote:

> From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
> 
> Attempt to do function level reset for an RCiEP associated with an
> RCEC device on fatal error.

I'd like to understand more on your reasoning for flr here.
Is it simply that it is all we can do, or is there some basis
in a spec somewhere?

> 
> Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
> ---
>  drivers/pci/pcie/err.c | 31 ++++++++++++++++++++++---------
>  1 file changed, 22 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
> index 044df004f20b..9b3ec94bdf1d 100644
> --- a/drivers/pci/pcie/err.c
> +++ b/drivers/pci/pcie/err.c
> @@ -170,6 +170,17 @@ static void pci_walk_dev_affected(struct pci_dev *dev, int (*cb)(struct pci_dev
>  	}
>  }
>  
> +static enum pci_channel_state flr_on_rciep(struct pci_dev *dev)
> +{
> +	if (!pcie_has_flr(dev))
> +		return PCI_ERS_RESULT_NONE;
> +
> +	if (pcie_flr(dev))
> +		return PCI_ERS_RESULT_DISCONNECT;
> +
> +	return PCI_ERS_RESULT_RECOVERED;
> +}
> +
>  pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>  			enum pci_channel_state state,
>  			pci_ers_result_t (*reset_link)(struct pci_dev *pdev))
> @@ -191,15 +202,17 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>  	if (state == pci_channel_io_frozen) {
>  		pci_walk_dev_affected(dev, report_frozen_detected, &status);
>  		if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) {
> -			pci_warn(dev, "link reset not possible for RCiEP\n");
> -			status = PCI_ERS_RESULT_NONE;
> -			goto failed;
> -		}
> -
> -		status = reset_link(dev);
> -		if (status != PCI_ERS_RESULT_RECOVERED) {
> -			pci_warn(dev, "link reset failed\n");
> -			goto failed;
> +			status = flr_on_rciep(dev);
> +			if (status != PCI_ERS_RESULT_RECOVERED) {
> +				pci_warn(dev, "function level reset failed\n");
> +				goto failed;
> +			}
> +		} else {
> +			status = reset_link(dev);
> +			if (status != PCI_ERS_RESULT_RECOVERED) {
> +				pci_warn(dev, "link reset failed\n");
> +				goto failed;
> +			}
>  		}
>  	} else {
>  		pci_walk_dev_affected(dev, report_normal_detected, &status);
Zhuo, Qiuxu July 28, 2020, 1:27 p.m. UTC | #2
> From: Jonathan Cameron <Jonathan.Cameron@Huawei.com>
> Sent: Monday, July 27, 2020 7:17 PM
> To: Kelley, Sean V <sean.v.kelley@intel.com>
> Cc: bhelgaas@google.com; rjw@rjwysocki.net; ashok.raj@kernel.org; Luck,
> Tony <tony.luck@intel.com>;
> sathyanarayanan.kuppuswamy@linux.intel.com; linux-pci@vger.kernel.org;
> linux-kernel@vger.kernel.org; Zhuo, Qiuxu <qiuxu.zhuo@intel.com>
> Subject: Re: [RFC PATCH 5/9] PCI/AER: Apply function level reset to RCiEP
> on fatal error
> 
> On Fri, 24 Jul 2020 10:22:19 -0700
> Sean V Kelley <sean.v.kelley@intel.com> wrote:
> 
> > From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
> >
> > Attempt to do function level reset for an RCiEP associated with an
> > RCEC device on fatal error.
> 
> I'd like to understand more on your reasoning for flr here.
> Is it simply that it is all we can do, or is there some basis in a spec
> somewhere?
> 

Yes. Although no link reset is available for the RCiEP here, I think we should still be able to reset the RCiEP via FLR on a fatal error, if the RCiEP supports FLR.

-Qiuxu

> >
> > Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
> > ---
> >  drivers/pci/pcie/err.c | 31 ++++++++++++++++++++++---------
> >  1 file changed, 22 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index
> > 044df004f20b..9b3ec94bdf1d 100644
> > --- a/drivers/pci/pcie/err.c
> > +++ b/drivers/pci/pcie/err.c
> > @@ -170,6 +170,17 @@ static void pci_walk_dev_affected(struct
> pci_dev *dev, int (*cb)(struct pci_dev
> >  	}
> >  }
> >
> > +static enum pci_channel_state flr_on_rciep(struct pci_dev *dev) {
> > +	if (!pcie_has_flr(dev))
> > +		return PCI_ERS_RESULT_NONE;
> > +
> > +	if (pcie_flr(dev))
> > +		return PCI_ERS_RESULT_DISCONNECT;
> > +
> > +	return PCI_ERS_RESULT_RECOVERED;
> > +}
> > +
> >  pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
> >  			enum pci_channel_state state,
> >  			pci_ers_result_t (*reset_link)(struct pci_dev *pdev))
> @@ -191,15
> > +202,17 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
> >  	if (state == pci_channel_io_frozen) {
> >  		pci_walk_dev_affected(dev, report_frozen_detected,
> &status);
> >  		if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) {
> > -			pci_warn(dev, "link reset not possible for RCiEP\n");
> > -			status = PCI_ERS_RESULT_NONE;
> > -			goto failed;
> > -		}
> > -
> > -		status = reset_link(dev);
> > -		if (status != PCI_ERS_RESULT_RECOVERED) {
> > -			pci_warn(dev, "link reset failed\n");
> > -			goto failed;
> > +			status = flr_on_rciep(dev);
> > +			if (status != PCI_ERS_RESULT_RECOVERED) {
> > +				pci_warn(dev, "function level reset failed\n");
> > +				goto failed;
> > +			}
> > +		} else {
> > +			status = reset_link(dev);
> > +			if (status != PCI_ERS_RESULT_RECOVERED) {
> > +				pci_warn(dev, "link reset failed\n");
> > +				goto failed;
> > +			}
> >  		}
> >  	} else {
> >  		pci_walk_dev_affected(dev, report_normal_detected,
> &status);
>
Kelley, Sean V July 28, 2020, 4:14 p.m. UTC | #3
On 28 Jul 2020, at 6:27, Zhuo, Qiuxu wrote:

>> From: Jonathan Cameron <Jonathan.Cameron@Huawei.com>
>> Sent: Monday, July 27, 2020 7:17 PM
>> To: Kelley, Sean V <sean.v.kelley@intel.com>
>> Cc: bhelgaas@google.com; rjw@rjwysocki.net; ashok.raj@kernel.org; 
>> Luck,
>> Tony <tony.luck@intel.com>;
>> sathyanarayanan.kuppuswamy@linux.intel.com; 
>> linux-pci@vger.kernel.org;
>> linux-kernel@vger.kernel.org; Zhuo, Qiuxu <qiuxu.zhuo@intel.com>
>> Subject: Re: [RFC PATCH 5/9] PCI/AER: Apply function level reset to 
>> RCiEP
>> on fatal error
>>
>> On Fri, 24 Jul 2020 10:22:19 -0700
>> Sean V Kelley <sean.v.kelley@intel.com> wrote:
>>
>>> From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
>>>
>>> Attempt to do function level reset for an RCiEP associated with an
>>> RCEC device on fatal error.
>>
>> I'd like to understand more on your reasoning for flr here.
>> Is it simply that it is all we can do, or is there some basis in a 
>> spec
>> somewhere?
>>
>
> Yes. Though there isn't the link reset for the RCiEP here, I think we 
> should still be able to reset the RCiEP via FLR on fatal error, if the 
> RCiEP supports FLR.
>
> -Qiuxu
>

Also see PCIe 5.0-1, Sec. 6.6.2 Function Level Reset (FLR)

Implementation of FLR is optional (not required), but is strongly
recommended. For an example use case, consider CXL. Function 0 DVSEC
instances control the CXL functionality of the entire CXL device.
FLR may succeed in recovering from CXL.io domain errors.

Thanks,

Sean

>>>
>>> Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
>>> ---
>>>  drivers/pci/pcie/err.c | 31 ++++++++++++++++++++++---------
>>>  1 file changed, 22 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index
>>> 044df004f20b..9b3ec94bdf1d 100644
>>> --- a/drivers/pci/pcie/err.c
>>> +++ b/drivers/pci/pcie/err.c
>>> @@ -170,6 +170,17 @@ static void pci_walk_dev_affected(struct
>> pci_dev *dev, int (*cb)(struct pci_dev
>>>  }
>>>  }
>>>
>>> +static enum pci_channel_state flr_on_rciep(struct pci_dev *dev) {
>>> +if (!pcie_has_flr(dev))
>>> +return PCI_ERS_RESULT_NONE;
>>> +
>>> +if (pcie_flr(dev))
>>> +return PCI_ERS_RESULT_DISCONNECT;
>>> +
>>> +return PCI_ERS_RESULT_RECOVERED;
>>> +}
>>> +
>>>  pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>>>  enum pci_channel_state state,
>>>  pci_ers_result_t (*reset_link)(struct pci_dev *pdev))
>> @@ -191,15
>>> +202,17 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>>>  if (state == pci_channel_io_frozen) {
>>>  pci_walk_dev_affected(dev, report_frozen_detected,
>> &status);
>>>  if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) {
>>> -pci_warn(dev, "link reset not possible for RCiEP\n");
>>> -status = PCI_ERS_RESULT_NONE;
>>> -goto failed;
>>> -}
>>> -
>>> -status = reset_link(dev);
>>> -if (status != PCI_ERS_RESULT_RECOVERED) {
>>> -pci_warn(dev, "link reset failed\n");
>>> -goto failed;
>>> +status = flr_on_rciep(dev);
>>> +if (status != PCI_ERS_RESULT_RECOVERED) {
>>> +pci_warn(dev, "function level reset failed\n");
>>> +goto failed;
>>> +}
>>> +} else {
>>> +status = reset_link(dev);
>>> +if (status != PCI_ERS_RESULT_RECOVERED) {
>>> +pci_warn(dev, "link reset failed\n");
>>> +goto failed;
>>> +}
>>>  }
>>>  } else {
>>>  pci_walk_dev_affected(dev, report_normal_detected,
>> &status);
>>
Jonathan Cameron July 28, 2020, 5:02 p.m. UTC | #4
On Tue, 28 Jul 2020 09:14:11 -0700
Sean V Kelley <sean.v.kelley@intel.com> wrote:

> On 28 Jul 2020, at 6:27, Zhuo, Qiuxu wrote:
> 
> >> From: Jonathan Cameron <Jonathan.Cameron@Huawei.com>
> >> Sent: Monday, July 27, 2020 7:17 PM
> >> To: Kelley, Sean V <sean.v.kelley@intel.com>
> >> Cc: bhelgaas@google.com; rjw@rjwysocki.net; ashok.raj@kernel.org; 
> >> Luck,
> >> Tony <tony.luck@intel.com>;
> >> sathyanarayanan.kuppuswamy@linux.intel.com; 
> >> linux-pci@vger.kernel.org;
> >> linux-kernel@vger.kernel.org; Zhuo, Qiuxu <qiuxu.zhuo@intel.com>
> >> Subject: Re: [RFC PATCH 5/9] PCI/AER: Apply function level reset to 
> >> RCiEP
> >> on fatal error
> >>
> >> On Fri, 24 Jul 2020 10:22:19 -0700
> >> Sean V Kelley <sean.v.kelley@intel.com> wrote:
> >>  
> >>> From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
> >>>
> >>> Attempt to do function level reset for an RCiEP associated with an
> >>> RCEC device on fatal error.  
> >>
> >> I'd like to understand more on your reasoning for flr here.
> >> Is it simply that it is all we can do, or is there some basis in a 
> >> spec
> >> somewhere?
> >>  
> >
> > Yes. Though there isn't the link reset for the RCiEP here, I think we 
> > should still be able to reset the RCiEP via FLR on fatal error, if the 
> > RCiEP supports FLR.
> >
> > -Qiuxu
> >  
> 
> Also see PCIe 5.0-1, Sec. 6.6.2 Function Level Reset (FLR)
> 
> Implementation of FLR is optional (not required), but is strongly 
> recommended. For an example use case consider CXL. Function 0 DVSEC 
> instances control for the CXL functionality of the entire CXL device. 
> FLR may succeed in recovering from CXL.io domain errors.

That feels like a bit of a weak argument in favour.  The PCI spec only lists
example uses for FLR and I can't see this matching any of them, but then they
are only examples, so we could argue they don't exclude this use. FLR is not
allowed to affect the link state, but I guess it 'might' recover from some
other type of error?

I'd have read the statement in the CXL spec you are referring to as matching
with the first example in the PCIe spec which is about recovering from
software errors.  For example, unexpected VM tear down.

@Bjorn / All.  What's your view on using FLR as a reset to do when you don't
have any other hammers to use?

Personally I don't have a particular problem with this, it just doesn't fit
with my mental model of what FLR is for (which may well need adjusting :)

Jonathan


> 
> Thanks,
> 
> Sean
> 
> >>>
> >>> Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
> >>> ---
> >>>  drivers/pci/pcie/err.c | 31 ++++++++++++++++++++++---------
> >>>  1 file changed, 22 insertions(+), 9 deletions(-)
> >>>
> >>> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index
> >>> 044df004f20b..9b3ec94bdf1d 100644
> >>> --- a/drivers/pci/pcie/err.c
> >>> +++ b/drivers/pci/pcie/err.c
> >>> @@ -170,6 +170,17 @@ static void pci_walk_dev_affected(struct  
> >> pci_dev *dev, int (*cb)(struct pci_dev  
> >>>  }
> >>>  }
> >>>
> >>> +static enum pci_channel_state flr_on_rciep(struct pci_dev *dev) {
> >>> +if (!pcie_has_flr(dev))
> >>> +return PCI_ERS_RESULT_NONE;
> >>> +
> >>> +if (pcie_flr(dev))
> >>> +return PCI_ERS_RESULT_DISCONNECT;
> >>> +
> >>> +return PCI_ERS_RESULT_RECOVERED;
> >>> +}
> >>> +
> >>>  pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
> >>>  enum pci_channel_state state,
> >>>  pci_ers_result_t (*reset_link)(struct pci_dev *pdev))  
> >> @@ -191,15  
> >>> +202,17 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
> >>>  if (state == pci_channel_io_frozen) {
> >>>  pci_walk_dev_affected(dev, report_frozen_detected,  
> >> &status);  
> >>>  if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) {
> >>> -pci_warn(dev, "link reset not possible for RCiEP\n");
> >>> -status = PCI_ERS_RESULT_NONE;
> >>> -goto failed;
> >>> -}
> >>> -
> >>> -status = reset_link(dev);
> >>> -if (status != PCI_ERS_RESULT_RECOVERED) {
> >>> -pci_warn(dev, "link reset failed\n");
> >>> -goto failed;
> >>> +status = flr_on_rciep(dev);
> >>> +if (status != PCI_ERS_RESULT_RECOVERED) {
> >>> +pci_warn(dev, "function level reset failed\n");
> >>> +goto failed;
> >>> +}
> >>> +} else {
> >>> +status = reset_link(dev);
> >>> +if (status != PCI_ERS_RESULT_RECOVERED) {
> >>> +pci_warn(dev, "link reset failed\n");
> >>> +goto failed;
> >>> +}
> >>>  }
> >>>  } else {
> >>>  pci_walk_dev_affected(dev, report_normal_detected,  
> >> &status);
> >>
Kelley, Sean V July 28, 2020, 5:42 p.m. UTC | #5
On 28 Jul 2020, at 10:02, Jonathan Cameron wrote:

> On Tue, 28 Jul 2020 09:14:11 -0700
> Sean V Kelley <sean.v.kelley@intel.com> wrote:
>
>> On 28 Jul 2020, at 6:27, Zhuo, Qiuxu wrote:
>>
>>>> From: Jonathan Cameron <Jonathan.Cameron@Huawei.com>
>>>> Sent: Monday, July 27, 2020 7:17 PM
>>>> To: Kelley, Sean V <sean.v.kelley@intel.com>
>>>> Cc: bhelgaas@google.com; rjw@rjwysocki.net; ashok.raj@kernel.org;
>>>> Luck,
>>>> Tony <tony.luck@intel.com>;
>>>> sathyanarayanan.kuppuswamy@linux.intel.com;
>>>> linux-pci@vger.kernel.org;
>>>> linux-kernel@vger.kernel.org; Zhuo, Qiuxu <qiuxu.zhuo@intel.com>
>>>> Subject: Re: [RFC PATCH 5/9] PCI/AER: Apply function level reset to
>>>> RCiEP
>>>> on fatal error
>>>>
>>>> On Fri, 24 Jul 2020 10:22:19 -0700
>>>> Sean V Kelley <sean.v.kelley@intel.com> wrote:
>>>>
>>>>> From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
>>>>>
>>>>> Attempt to do function level reset for an RCiEP associated with an
>>>>> RCEC device on fatal error.
>>>>
>>>> I'd like to understand more on your reasoning for flr here.
>>>> Is it simply that it is all we can do, or is there some basis in a
>>>> spec
>>>> somewhere?
>>>>
>>>
>>> Yes. Though there isn't the link reset for the RCiEP here, I think 
>>> we
>>> should still be able to reset the RCiEP via FLR on fatal error, if 
>>> the
>>> RCiEP supports FLR.
>>>
>>> -Qiuxu
>>>
>>
>> Also see PCIe 5.0-1, Sec. 6.6.2 Function Level Reset (FLR)
>>
>> Implementation of FLR is optional (not required), but is strongly
>> recommended. For an example use case consider CXL. Function 0 DVSEC
>> instances control for the CXL functionality of the entire CXL device.
>> FLR may succeed in recovering from CXL.io domain errors.
>
> That feels a little bit of a weak argument in favour.  PCI spec lists 
> examples
> of use only for FLR and I can't see this matching any of them, but 
> then they
> are only examples, so we could argue it doesn't exclude this use. It's 
> not
> allowed to affect the link state, but I guess it 'might' recover from 
> some
> other type of error?
>
> I'd have read the statement in the CXL spec you are referring to as 
> matching
> with the first example in the PCIe spec which is about recovering from
> software errors.  For example, unexpected VM tear down.

From my perspective, it can add value, as the point is to address device
functions and their associated software states. As the section in the
spec goes on to state:

“The FLR mechanism enables software to quiesce and reset Endpoint 
hardware with Function-level granularity. Three example usage models 
illustrate the benefits of this feature:…”

Later changes in CXL 2.0 Section 9.8 (as of 0.9 draft) further look to 
extend FLR with an eFLR or now referred to as CXL Reset.

“All Functions in a CXL 2.0 (Single Logical Device) SLD that 
participate in CXL.cache or CXL.mem are required to support either FLR 
or CXL Reset. MLDs (Multiple Logical Devices), on the other hand, are 
required to support CXL Reset.”

In my mind the question is whether this change is too limited in scope
with this patch series (RCiEP) and whether FLR should be considered in a
broader scope, i.e., for any EP, as a ‘hammer’ so to speak.

Thanks,

Sean

>
> @Bjorn / All.  What's your view on using FLR as a reset to do when you 
> don't
> have any other hammers to use?
>
> Personally I don't have a particular problem with this, it just 
> doesn't fit
> with my mental model of what FLR is for (which may well need adjusting 
> :)
>
> Jonathan
>
>
>>
>> Thanks,
>>
>> Sean
>>
>>>>>
>>>>> Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
>>>>> ---
>>>>>  drivers/pci/pcie/err.c | 31 ++++++++++++++++++++++---------
>>>>>  1 file changed, 22 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index
>>>>> 044df004f20b..9b3ec94bdf1d 100644
>>>>> --- a/drivers/pci/pcie/err.c
>>>>> +++ b/drivers/pci/pcie/err.c
>>>>> @@ -170,6 +170,17 @@ static void pci_walk_dev_affected(struct
>>>> pci_dev *dev, int (*cb)(struct pci_dev
>>>>>  }
>>>>>  }
>>>>>
>>>>> +static enum pci_channel_state flr_on_rciep(struct pci_dev *dev) {
>>>>> +if (!pcie_has_flr(dev))
>>>>> +return PCI_ERS_RESULT_NONE;
>>>>> +
>>>>> +if (pcie_flr(dev))
>>>>> +return PCI_ERS_RESULT_DISCONNECT;
>>>>> +
>>>>> +return PCI_ERS_RESULT_RECOVERED;
>>>>> +}
>>>>> +
>>>>>  pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>>>>>  enum pci_channel_state state,
>>>>>  pci_ers_result_t (*reset_link)(struct pci_dev *pdev))
>>>> @@ -191,15
>>>>> +202,17 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>>>>>  if (state == pci_channel_io_frozen) {
>>>>>  pci_walk_dev_affected(dev, report_frozen_detected,
>>>> &status);
>>>>>  if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) {
>>>>> -pci_warn(dev, "link reset not possible for RCiEP\n");
>>>>> -status = PCI_ERS_RESULT_NONE;
>>>>> -goto failed;
>>>>> -}
>>>>> -
>>>>> -status = reset_link(dev);
>>>>> -if (status != PCI_ERS_RESULT_RECOVERED) {
>>>>> -pci_warn(dev, "link reset failed\n");
>>>>> -goto failed;
>>>>> +status = flr_on_rciep(dev);
>>>>> +if (status != PCI_ERS_RESULT_RECOVERED) {
>>>>> +pci_warn(dev, "function level reset failed\n");
>>>>> +goto failed;
>>>>> +}
>>>>> +} else {
>>>>> +status = reset_link(dev);
>>>>> +if (status != PCI_ERS_RESULT_RECOVERED) {
>>>>> +pci_warn(dev, "link reset failed\n");
>>>>> +goto failed;
>>>>> +}
>>>>>  }
>>>>>  } else {
>>>>>  pci_walk_dev_affected(dev, report_normal_detected,
>>>> &status);
>>>>
diff mbox series

Patch

diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 044df004f20b..9b3ec94bdf1d 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -170,6 +170,17 @@  static void pci_walk_dev_affected(struct pci_dev *dev, int (*cb)(struct pci_dev
 	}
 }
 
+static pci_ers_result_t flr_on_rciep(struct pci_dev *dev)
+{
+	if (!pcie_has_flr(dev))		/* nothing to attempt without FLR */
+		return PCI_ERS_RESULT_NONE;
+
+	if (pcie_flr(dev))		/* non-zero return: reset failed */
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
 pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
 			enum pci_channel_state state,
 			pci_ers_result_t (*reset_link)(struct pci_dev *pdev))
@@ -191,15 +202,17 @@  pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
 	if (state == pci_channel_io_frozen) {
 		pci_walk_dev_affected(dev, report_frozen_detected, &status);
 		if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) {
-			pci_warn(dev, "link reset not possible for RCiEP\n");
-			status = PCI_ERS_RESULT_NONE;
-			goto failed;
-		}
-
-		status = reset_link(dev);
-		if (status != PCI_ERS_RESULT_RECOVERED) {
-			pci_warn(dev, "link reset failed\n");
-			goto failed;
+			status = flr_on_rciep(dev);
+			if (status != PCI_ERS_RESULT_RECOVERED) {
+				pci_warn(dev, "function level reset failed\n");
+				goto failed;
+			}
+		} else {
+			status = reset_link(dev);
+			if (status != PCI_ERS_RESULT_RECOVERED) {
+				pci_warn(dev, "link reset failed\n");
+				goto failed;
+			}
 		}
 	} else {
 		pci_walk_dev_affected(dev, report_normal_detected, &status);