diff mbox series

[v17,09/12] PCI/AER: Allow clearing Error Status Register in FF mode

Message ID 29fb514a0d86e9bcc75cec4ea8474cd4db33adbf.1583286655.git.sathyanarayanan.kuppuswamy@linux.intel.com
State New
Headers show
Series Add Error Disconnect Recover (EDR) support | expand

Commit Message

Kuppuswamy, Sathyanarayanan March 4, 2020, 2:36 a.m. UTC
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>

As per PCI firmware specification r3.2 System Firmware Intermediary
(SFI) _OSC and DPC Updates ECR
(https://members.pcisig.com/wg/PCI-SIG/document/13563), sec titled "DPC
Event Handling Implementation Note", page 10, Error Disconnect Recover
(EDR) support allows OS to handle error recovery and clearing Error
Registers even in FF mode. So create new API pci_aer_raw_clear_status()
which allows clearing AER registers without FF mode checks.

Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
---
 drivers/pci/pci.h      |  2 ++
 drivers/pci/pcie/aer.c | 22 ++++++++++++++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

Comments

Kuppuswamy, Sathyanarayanan March 6, 2020, 5:45 a.m. UTC | #1
On 3/3/2020 6:36 PM, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> 
> As per PCI firmware specification r3.2 System Firmware Intermediary
> (SFI) _OSC and DPC Updates ECR
> (https://members.pcisig.com/wg/PCI-SIG/document/13563), sec titled "DPC
> Event Handling Implementation Note", page 10, Error Disconnect Recover
> (EDR) support allows OS to handle error recovery and clearing Error
> Registers even in FF mode. So create new API pci_aer_raw_clear_status()
> which allows clearing AER registers without FF mode checks.
> 
> Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> ---
>   drivers/pci/pci.h      |  2 ++
>   drivers/pci/pcie/aer.c | 22 ++++++++++++++++++----
>   2 files changed, 20 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index e57e78b619f8..c239e6dd2542 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -655,6 +655,7 @@ extern const struct attribute_group aer_stats_attr_group;
>   void pci_aer_clear_fatal_status(struct pci_dev *dev);
>   void pci_aer_clear_device_status(struct pci_dev *dev);
>   int pci_cleanup_aer_error_status_regs(struct pci_dev *dev);
> +int pci_aer_raw_clear_status(struct pci_dev *dev);
>   #else
>   static inline void pci_no_aer(void) { }
>   static inline void pci_aer_init(struct pci_dev *d) { }
> @@ -665,6 +666,7 @@ static inline int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>   {
>   	return -EINVAL;
>   }
> +int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
Its missing static specifier. It needs to be fixed. I can fix it in next 
version.
Bjorn, if there is no need for next version, can you please make this 
change ?
>   #endif
>   
>   #ifdef CONFIG_ACPI
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index c0540c3761dc..41afefa562b7 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -420,7 +420,16 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev)
>   		pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
>   }
>   
> -int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
> +/**
> + * pci_aer_raw_clear_status - Clear AER error registers.
> + * @dev: the PCI device
> + *
> + * NOTE: Allows clearing error registers in both FF and
> + * non FF modes.
> + *
> + * Returns 0 on success, or negative on failure.
> + */
> +int pci_aer_raw_clear_status(struct pci_dev *dev)
>   {
>   	int pos;
>   	u32 status;
> @@ -433,9 +442,6 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>   	if (!pos)
>   		return -EIO;
>   
> -	if (pcie_aer_get_firmware_first(dev))
> -		return -EIO;
> -
>   	port_type = pci_pcie_type(dev);
>   	if (port_type == PCI_EXP_TYPE_ROOT_PORT) {
>   		pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status);
> @@ -451,6 +457,14 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>   	return 0;
>   }
>   
> +int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
> +{
> +	if (pcie_aer_get_firmware_first(dev))
> +		return -EIO;
> +
> +	return pci_aer_raw_clear_status(dev);
> +}
> +
>   void pci_save_aer_state(struct pci_dev *dev)
>   {
>   	struct pci_cap_saved_state *save_state;
>
Bjorn Helgaas March 6, 2020, 4:04 p.m. UTC | #2
On Thu, Mar 05, 2020 at 09:45:46PM -0800, Kuppuswamy, Sathyanarayanan wrote:
> On 3/3/2020 6:36 PM, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> > From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> > 
> > As per PCI firmware specification r3.2 System Firmware Intermediary
> > (SFI) _OSC and DPC Updates ECR
> > (https://members.pcisig.com/wg/PCI-SIG/document/13563), sec titled "DPC
> > Event Handling Implementation Note", page 10, Error Disconnect Recover
> > (EDR) support allows OS to handle error recovery and clearing Error
> > Registers even in FF mode. So create new API pci_aer_raw_clear_status()
> > which allows clearing AER registers without FF mode checks.
> > 
> > Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> > ---
> >   drivers/pci/pci.h      |  2 ++
> >   drivers/pci/pcie/aer.c | 22 ++++++++++++++++++----
> >   2 files changed, 20 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> > index e57e78b619f8..c239e6dd2542 100644
> > --- a/drivers/pci/pci.h
> > +++ b/drivers/pci/pci.h
> > @@ -655,6 +655,7 @@ extern const struct attribute_group aer_stats_attr_group;
> >   void pci_aer_clear_fatal_status(struct pci_dev *dev);
> >   void pci_aer_clear_device_status(struct pci_dev *dev);
> >   int pci_cleanup_aer_error_status_regs(struct pci_dev *dev);
> > +int pci_aer_raw_clear_status(struct pci_dev *dev);
> >   #else
> >   static inline void pci_no_aer(void) { }
> >   static inline void pci_aer_init(struct pci_dev *d) { }
> > @@ -665,6 +666,7 @@ static inline int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
> >   {
> >   	return -EINVAL;
> >   }
> > +int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }

> It's missing static specifier. It needs to be fixed. I can fix it in
> next version.  Bjorn, if there is no need for next version, can you
> please make this change?

pci_aer_raw_clear_status() is defined in aer.c and called from aer.c
and edr.c, so I do not think it can be static.  Am I missing
something?

I have a review/edr branch that I hope becomes what will be applied.

Bjorn
Kuppuswamy, Sathyanarayanan March 6, 2020, 4:11 p.m. UTC | #3
On 3/6/2020 8:04 AM, Bjorn Helgaas wrote:
> On Thu, Mar 05, 2020 at 09:45:46PM -0800, Kuppuswamy, Sathyanarayanan wrote:
>> On 3/3/2020 6:36 PM, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
>>> From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>>>
>>> As per PCI firmware specification r3.2 System Firmware Intermediary
>>> (SFI) _OSC and DPC Updates ECR
>>> (https://members.pcisig.com/wg/PCI-SIG/document/13563), sec titled "DPC
>>> Event Handling Implementation Note", page 10, Error Disconnect Recover
>>> (EDR) support allows OS to handle error recovery and clearing Error
>>> Registers even in FF mode. So create new API pci_aer_raw_clear_status()
>>> which allows clearing AER registers without FF mode checks.
>>>
>>> Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>>> ---
>>>    drivers/pci/pci.h      |  2 ++
>>>    drivers/pci/pcie/aer.c | 22 ++++++++++++++++++----
>>>    2 files changed, 20 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
>>> index e57e78b619f8..c239e6dd2542 100644
>>> --- a/drivers/pci/pci.h
>>> +++ b/drivers/pci/pci.h
>>> @@ -655,6 +655,7 @@ extern const struct attribute_group aer_stats_attr_group;
>>>    void pci_aer_clear_fatal_status(struct pci_dev *dev);
>>>    void pci_aer_clear_device_status(struct pci_dev *dev);
>>>    int pci_cleanup_aer_error_status_regs(struct pci_dev *dev);
>>> +int pci_aer_raw_clear_status(struct pci_dev *dev);
>>>    #else
>>>    static inline void pci_no_aer(void) { }
>>>    static inline void pci_aer_init(struct pci_dev *d) { }
>>> @@ -665,6 +666,7 @@ static inline int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>>>    {
>>>    	return -EINVAL;
>>>    }
>>> +int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
> 
>> It's missing static specifier. It needs to be fixed. I can fix it in
>> next version.  Bjorn, if there is no need for next version, can you
>> please make this change?
> 
> pci_aer_raw_clear_status() is defined in aer.c and called from aer.c
> and edr.c, so I do not think it can be static.  Am I missing
> something?
> 
> I have a review/edr branch that I hope becomes what will be applied.
For kernel configs that does not define CONFIG_PCIEAER, it will create 
redefinition error since pci.h can be included in many files.
> 
> Bjorn
>
Bjorn Helgaas March 6, 2020, 4:41 p.m. UTC | #4
On Fri, Mar 06, 2020 at 08:11:41AM -0800, Kuppuswamy, Sathyanarayanan wrote:
> On 3/6/2020 8:04 AM, Bjorn Helgaas wrote:
> > On Thu, Mar 05, 2020 at 09:45:46PM -0800, Kuppuswamy, Sathyanarayanan wrote:
> > > On 3/3/2020 6:36 PM, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> > > > From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> > > > 
> > > > As per PCI firmware specification r3.2 System Firmware Intermediary
> > > > (SFI) _OSC and DPC Updates ECR
> > > > (https://members.pcisig.com/wg/PCI-SIG/document/13563), sec titled "DPC
> > > > Event Handling Implementation Note", page 10, Error Disconnect Recover
> > > > (EDR) support allows OS to handle error recovery and clearing Error
> > > > Registers even in FF mode. So create new API pci_aer_raw_clear_status()
> > > > which allows clearing AER registers without FF mode checks.
> > > > 
> > > > Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> > > > ---
> > > >    drivers/pci/pci.h      |  2 ++
> > > >    drivers/pci/pcie/aer.c | 22 ++++++++++++++++++----
> > > >    2 files changed, 20 insertions(+), 4 deletions(-)
> > > > 
> > > > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> > > > index e57e78b619f8..c239e6dd2542 100644
> > > > --- a/drivers/pci/pci.h
> > > > +++ b/drivers/pci/pci.h
> > > > @@ -655,6 +655,7 @@ extern const struct attribute_group aer_stats_attr_group;
> > > >    void pci_aer_clear_fatal_status(struct pci_dev *dev);
> > > >    void pci_aer_clear_device_status(struct pci_dev *dev);
> > > >    int pci_cleanup_aer_error_status_regs(struct pci_dev *dev);
> > > > +int pci_aer_raw_clear_status(struct pci_dev *dev);
> > > >    #else
> > > >    static inline void pci_no_aer(void) { }
> > > >    static inline void pci_aer_init(struct pci_dev *d) { }
> > > > @@ -665,6 +666,7 @@ static inline int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
> > > >    {
> > > >    	return -EINVAL;
> > > >    }
> > > > +int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
> > 
> > > It's missing static specifier. It needs to be fixed. I can fix it in
> > > next version.  Bjorn, if there is no need for next version, can you
> > > please make this change?
> > 
> > pci_aer_raw_clear_status() is defined in aer.c and called from aer.c
> > and edr.c, so I do not think it can be static.  Am I missing
> > something?

> For kernel configs that does not define CONFIG_PCIEAER, it will create
> redefinition error since pci.h can be included in many files.

Oh, right, I thought you were talking about the definition in aer.c.
The stub in pci.h is missing "inline" as well as "static".

Fixed.
Bjorn Helgaas March 10, 2020, 2:40 a.m. UTC | #5
[+cc Austin, tentative Linux patches on this git branch:
https://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git/tree/drivers/pci/pcie?h=review/edr]

On Tue, Mar 03, 2020 at 06:36:32PM -0800, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> 
> As per PCI firmware specification r3.2 System Firmware Intermediary
> (SFI) _OSC and DPC Updates ECR
> (https://members.pcisig.com/wg/PCI-SIG/document/13563), sec titled "DPC
> Event Handling Implementation Note", page 10, Error Disconnect Recover
> (EDR) support allows OS to handle error recovery and clearing Error
> Registers even in FF mode. So create new API pci_aer_raw_clear_status()
> which allows clearing AER registers without FF mode checks.

I see that this ECR was released as an ECN a few days ago:
https://members.pcisig.com/wg/PCI-SIG/document/14076
Regrettably the title in the PDF still says "ECR" (the rendered title
*page* says "ENGINEERING CHANGE NOTIFICATION", but some metadata
buried in the file says "ECR - SFI _OSC Support and DPC Updates".

Anyway, I think I see the note you refer to (now on page 12):

  IMPLEMENTATION NOTE
  DPC Event Handling

  The flow chart below documents the behavior when firmware maintains
  control of AER and DPC and grants control of PCIe Hot-Plug to the
  operating system.

  ...

  Capture and clear device AER status. OS may choose to offline
  devices3, either via SW (not load driver) or HW (power down device,
  disable Link5,6,7). Otherwise process _HPX, complete device
  enumeration, load drivers

This clearly suggests that the OS should clear device AER status.
However, according to the intro text, firmware has retained control of
AER, so what gives the OS the right to clear AER status?

The Downstream Port Containment Related Enhancements ECN (sec 4.5.1,
table 4-6) contains an exception that allows the OS to read/write
DPC registers during recovery.  But

  - that is for *DPC* registers, not for AER registers, and

  - that exception only applies between OS receipt of the EDR
    notification and OS release of DPC by clearing the DPC Trigger
    Status bit.

The flowchart in the SFI ECN shows the OS releasing DPC before
clearing AER status:

  - Receive EDR notification

  - Cleanup - Notify and unload child drivers below Port

  - Bring Port out of DPC, clear port error status, assign bus numbers
    to child devices.

    I assume this box includes clearing DPC error status and clearing
    Trigger Status?  They seem to be out of order in the box.

  - Evaluate _OST

  - Capture and clear device AER status.

    This seems suspect to me.  Where does it say the OS is allowed to
    write AER status when firmware retains control of AER?

This patch series does things in this order:

  - Receive EDR notification (edr_handle_event(), edr.c)

  - Read, log, and clear DPC error regs (dpc_process_error(), dpc.c).

    This also clears AER uncorrectable error status when the relevant
    HEST entries do not have the FIRMWARE_FIRST bit set.  I think this
    is incorrect: the test should be based the _OSC negotiation for
    AER ownership, not on the HEST entries.  But this problem
    pre-dates this patch series.

  - Clear AER status (pci_aer_raw_clear_status(), aer.c).

    This is at least inside the EDR recovery window, but again, I
    don't see where it says the OS is allowed to write the AER status.

  - Attempt recovery (pcie_do_recovery(), err.c)

  - Clear DPC Trigger Status (dpc_reset_link(), dpc.c)

  - Evaluate _OST (acpi_send_edr_status(), edr.c)

What am I missing?

> Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> ---
>  drivers/pci/pci.h      |  2 ++
>  drivers/pci/pcie/aer.c | 22 ++++++++++++++++++----
>  2 files changed, 20 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index e57e78b619f8..c239e6dd2542 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -655,6 +655,7 @@ extern const struct attribute_group aer_stats_attr_group;
>  void pci_aer_clear_fatal_status(struct pci_dev *dev);
>  void pci_aer_clear_device_status(struct pci_dev *dev);
>  int pci_cleanup_aer_error_status_regs(struct pci_dev *dev);
> +int pci_aer_raw_clear_status(struct pci_dev *dev);
>  #else
>  static inline void pci_no_aer(void) { }
>  static inline void pci_aer_init(struct pci_dev *d) { }
> @@ -665,6 +666,7 @@ static inline int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>  {
>  	return -EINVAL;
>  }
> +int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
>  #endif
>  
>  #ifdef CONFIG_ACPI
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index c0540c3761dc..41afefa562b7 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -420,7 +420,16 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev)
>  		pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
>  }
>  
> -int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
> +/**
> + * pci_aer_raw_clear_status - Clear AER error registers.
> + * @dev: the PCI device
> + *
> + * NOTE: Allows clearing error registers in both FF and
> + * non FF modes.
> + *
> + * Returns 0 on success, or negative on failure.
> + */
> +int pci_aer_raw_clear_status(struct pci_dev *dev)
>  {
>  	int pos;
>  	u32 status;
> @@ -433,9 +442,6 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>  	if (!pos)
>  		return -EIO;
>  
> -	if (pcie_aer_get_firmware_first(dev))
> -		return -EIO;
> -
>  	port_type = pci_pcie_type(dev);
>  	if (port_type == PCI_EXP_TYPE_ROOT_PORT) {
>  		pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status);
> @@ -451,6 +457,14 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>  	return 0;
>  }
>  
> +int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
> +{
> +	if (pcie_aer_get_firmware_first(dev))
> +		return -EIO;
> +
> +	return pci_aer_raw_clear_status(dev);
> +}
> +
>  void pci_save_aer_state(struct pci_dev *dev)
>  {
>  	struct pci_cap_saved_state *save_state;
> -- 
> 2.25.1
>
Kuppuswamy, Sathyanarayanan March 10, 2020, 4:28 a.m. UTC | #6
Hi Bjorn,

On 3/9/2020 7:40 PM, Bjorn Helgaas wrote:
> [+cc Austin, tentative Linux patches on this git branch:
> https://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git/tree/drivers/pci/pcie?h=review/edr]
> 
> On Tue, Mar 03, 2020 at 06:36:32PM -0800, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
>> From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>>
>> As per PCI firmware specification r3.2 System Firmware Intermediary
>> (SFI) _OSC and DPC Updates ECR
>> (https://members.pcisig.com/wg/PCI-SIG/document/13563), sec titled "DPC
>> Event Handling Implementation Note", page 10, Error Disconnect Recover
>> (EDR) support allows OS to handle error recovery and clearing Error
>> Registers even in FF mode. So create new API pci_aer_raw_clear_status()
>> which allows clearing AER registers without FF mode checks.
> 
> I see that this ECR was released as an ECN a few days ago:
> https://members.pcisig.com/wg/PCI-SIG/document/14076
> Regrettably the title in the PDF still says "ECR" (the rendered title
> *page* says "ENGINEERING CHANGE NOTIFICATION", but some metadata
> buried in the file says "ECR - SFI _OSC Support and DPC Updates".
> 
> Anyway, I think I see the note you refer to (now on page 12):
> 
>    IMPLEMENTATION NOTE
>    DPC Event Handling
> 
>    The flow chart below documents the behavior when firmware maintains
>    control of AER and DPC and grants control of PCIe Hot-Plug to the
>    operating system.
> 
>    ...
> 
>    Capture and clear device AER status. OS may choose to offline
>    devices3, either via SW (not load driver) or HW (power down device,
>    disable Link5,6,7). Otherwise process _HPX, complete device
>    enumeration, load drivers
> 
> This clearly suggests that the OS should clear device AER status.
> However, according to the intro text, firmware has retained control of
> AER, so what gives the OS the right to clear AER status?
> 
> The Downstream Port Containment Related Enhancements ECN (sec 4.5.1,
> table 4-6) contains an exception that allows the OS to read/write
> DPC registers during recovery.  But
> 
>    - that is for *DPC* registers, not for AER registers, and
> 
>    - that exception only applies between OS receipt of the EDR
>      notification and OS release of DPC by clearing the DPC Trigger
>      Status bit.
> 
> The flowchart in the SFI ECN shows the OS releasing DPC before
> clearing AER status:
> 
>    - Receive EDR notification
> 
>    - Cleanup - Notify and unload child drivers below Port
> 
>    - Bring Port out of DPC, clear port error status, assign bus numbers
>      to child devices.
> 
>      I assume this box includes clearing DPC error status and clearing
>      Trigger Status?  They seem to be out of order in the box.
> 
>    - Evaluate _OST
> 
>    - Capture and clear device AER status.
> 
>      This seems suspect to me.  Where does it say the OS is allowed to
>      write AER status when firmware retains control of AER?
> 
> This patch series does things in this order:
> 
>    - Receive EDR notification (edr_handle_event(), edr.c)
> 
>    - Read, log, and clear DPC error regs (dpc_process_error(), dpc.c).
> 
>      This also clears AER uncorrectable error status when the relevant
>      HEST entries do not have the FIRMWARE_FIRST bit set.  I think this
>      is incorrect: the test should be based the _OSC negotiation for
>      AER ownership, not on the HEST entries.  But this problem
>      pre-dates this patch series.
> 
>    - Clear AER status (pci_aer_raw_clear_status(), aer.c).
> 
>      This is at least inside the EDR recovery window, but again, I
>      don't see where it says the OS is allowed to write the AER status.

Implementation note is the only reference we have regarding clearing the
AER registers.

But since the spec says both DPC and AER needs to be always controlled
together by the either OS or firmware, and when firmware relinquishes
control over DPC registers in EDR notification window, we can assume
that we also have control over AER registers.

But I agree that is not explicitly spelled out any where outside the
implementation note.


Austin,

May be ECN (section 4.5.1, table 4-6) needs to be updated to add this
clarification.

> 
>    - Attempt recovery (pcie_do_recovery(), err.c)
> 
>    - Clear DPC Trigger Status (dpc_reset_link(), dpc.c)
> 
>    - Evaluate _OST (acpi_send_edr_status(), edr.c)
> 
> What am I missing?
> 
>> Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>> ---
>>   drivers/pci/pci.h      |  2 ++
>>   drivers/pci/pcie/aer.c | 22 ++++++++++++++++++----
>>   2 files changed, 20 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
>> index e57e78b619f8..c239e6dd2542 100644
>> --- a/drivers/pci/pci.h
>> +++ b/drivers/pci/pci.h
>> @@ -655,6 +655,7 @@ extern const struct attribute_group aer_stats_attr_group;
>>   void pci_aer_clear_fatal_status(struct pci_dev *dev);
>>   void pci_aer_clear_device_status(struct pci_dev *dev);
>>   int pci_cleanup_aer_error_status_regs(struct pci_dev *dev);
>> +int pci_aer_raw_clear_status(struct pci_dev *dev);
>>   #else
>>   static inline void pci_no_aer(void) { }
>>   static inline void pci_aer_init(struct pci_dev *d) { }
>> @@ -665,6 +666,7 @@ static inline int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>>   {
>>   	return -EINVAL;
>>   }
>> +int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
>>   #endif
>>   
>>   #ifdef CONFIG_ACPI
>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>> index c0540c3761dc..41afefa562b7 100644
>> --- a/drivers/pci/pcie/aer.c
>> +++ b/drivers/pci/pcie/aer.c
>> @@ -420,7 +420,16 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev)
>>   		pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
>>   }
>>   
>> -int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>> +/**
>> + * pci_aer_raw_clear_status - Clear AER error registers.
>> + * @dev: the PCI device
>> + *
>> + * NOTE: Allows clearing error registers in both FF and
>> + * non FF modes.
>> + *
>> + * Returns 0 on success, or negative on failure.
>> + */
>> +int pci_aer_raw_clear_status(struct pci_dev *dev)
>>   {
>>   	int pos;
>>   	u32 status;
>> @@ -433,9 +442,6 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>>   	if (!pos)
>>   		return -EIO;
>>   
>> -	if (pcie_aer_get_firmware_first(dev))
>> -		return -EIO;
>> -
>>   	port_type = pci_pcie_type(dev);
>>   	if (port_type == PCI_EXP_TYPE_ROOT_PORT) {
>>   		pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status);
>> @@ -451,6 +457,14 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>>   	return 0;
>>   }
>>   
>> +int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>> +{
>> +	if (pcie_aer_get_firmware_first(dev))
>> +		return -EIO;
>> +
>> +	return pci_aer_raw_clear_status(dev);
>> +}
>> +
>>   void pci_save_aer_state(struct pci_dev *dev)
>>   {
>>   	struct pci_cap_saved_state *save_state;
>> -- 
>> 2.25.1
>>
Austin.Bolen@dell.com March 10, 2020, 6:14 p.m. UTC | #7
On 3/9/2020 11:28 PM, Kuppuswamy, Sathyanarayanan wrote:
> 
> [EXTERNAL EMAIL]
> 
> Hi Bjorn,
> 
> On 3/9/2020 7:40 PM, Bjorn Helgaas wrote:
>> [+cc Austin, tentative Linux patches on this git branch:
>> https://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git/tree/drivers/pci/pcie?h=review/edr]
>>
>> On Tue, Mar 03, 2020 at 06:36:32PM -0800, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
>>> From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>>>
>>> As per PCI firmware specification r3.2 System Firmware Intermediary
>>> (SFI) _OSC and DPC Updates ECR
>>> (https://members.pcisig.com/wg/PCI-SIG/document/13563), sec titled "DPC
>>> Event Handling Implementation Note", page 10, Error Disconnect Recover
>>> (EDR) support allows OS to handle error recovery and clearing Error
>>> Registers even in FF mode. So create new API pci_aer_raw_clear_status()
>>> which allows clearing AER registers without FF mode checks.
>>
>> I see that this ECR was released as an ECN a few days ago:
>> https://members.pcisig.com/wg/PCI-SIG/document/14076
>> Regrettably the title in the PDF still says "ECR" (the rendered title
>> *page* says "ENGINEERING CHANGE NOTIFICATION", but some metadata
>> buried in the file says "ECR - SFI _OSC Support and DPC Updates".

I'll see if PCI-SIG can update the metadata and repost.

>>
>> Anyway, I think I see the note you refer to (now on page 12):
>>
>>     IMPLEMENTATION NOTE
>>     DPC Event Handling
>>
>>     The flow chart below documents the behavior when firmware maintains
>>     control of AER and DPC and grants control of PCIe Hot-Plug to the
>>     operating system.
>>
>>     ...
>>
>>     Capture and clear device AER status. OS may choose to offline
>>     devices3, either via SW (not load driver) or HW (power down device,
>>     disable Link5,6,7). Otherwise process _HPX, complete device
>>     enumeration, load drivers
>>
>> This clearly suggests that the OS should clear device AER status.
>> However, according to the intro text, firmware has retained control of
>> AER, so what gives the OS the right to clear AER status?
>>
>> The Downstream Port Containment Related Enhancements ECN (sec 4.5.1,
>> table 4-6) contains an exception that allows the OS to read/write
>> DPC registers during recovery.  But
>>
>>     - that is for *DPC* registers, not for AER registers, and
>>
>>     - that exception only applies between OS receipt of the EDR
>>       notification and OS release of DPC by clearing the DPC Trigger
>>       Status bit.
>>
>> The flowchart in the SFI ECN shows the OS releasing DPC before
>> clearing AER status:
>>
>>     - Receive EDR notification
>>
>>     - Cleanup - Notify and unload child drivers below Port
>>
>>     - Bring Port out of DPC, clear port error status, assign bus numbers
>>       to child devices.
>>
>>       I assume this box includes clearing DPC error status and clearing
>>       Trigger Status?  They seem to be out of order in the box.

OS clears the DPC Trigger Status bit which will bring port below it out 
of containment. Then OS will clear the "port" error status bits (i.e., 
the AER and DPC status bits in the root port or downstream port that 
triggered containment). I don't think it would hurt to do this two steps 
in reverse order but don't think it is necessary. Note that error status 
bits for devices below the port in containment are cleared later after 
f/w has a chance to log them.

>>
>>     - Evaluate _OST
>>
>>     - Capture and clear device AER status.
>>
>>       This seems suspect to me.  Where does it say the OS is allowed to
>>       write AER status when firmware retains control of AER?
>>
>> This patch series does things in this order:
>>
>>     - Receive EDR notification (edr_handle_event(), edr.c)
>>
>>     - Read, log, and clear DPC error regs (dpc_process_error(), dpc.c).
>>
>>       This also clears AER uncorrectable error status when the relevant
>>       HEST entries do not have the FIRMWARE_FIRST bit set.  I think this
>>       is incorrect: the test should be based the _OSC negotiation for
>>       AER ownership, not on the HEST entries.  But this problem
>>       pre-dates this patch series.
>>
>>     - Clear AER status (pci_aer_raw_clear_status(), aer.c).
>>
>>       This is at least inside the EDR recovery window, but again, I
>>       don't see where it says the OS is allowed to write the AER status.
> 
> Implementation note is the only reference we have regarding clearing the
> AER registers.
> 
> But since the spec says both DPC and AER needs to be always controlled
> together by the either OS or firmware, and when firmware relinquishes
> control over DPC registers in EDR notification window, we can assume
> that we also have control over AER registers.
> 
> But I agree that is not explicitly spelled out any where outside the
> implementation note.
> 
> 
> Austin,
> 
> May be ECN (section 4.5.1, table 4-6) needs to be updated to add this
> clarification.

Sure we can update to section 4.5.1, table 4-6 to indicate when OS can 
clear the AER status bits. It will just follow what's done in the 
implementation note so I think it's acceptable to follow implementation 
guidance for now.

> 
>>
>>     - Attempt recovery (pcie_do_recovery(), err.c)
>>
>>     - Clear DPC Trigger Status (dpc_reset_link(), dpc.c)
>>
>>     - Evaluate _OST (acpi_send_edr_status(), edr.c)
>>
>> What am I missing?
>>
>>> Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>>> ---
>>>    drivers/pci/pci.h      |  2 ++
>>>    drivers/pci/pcie/aer.c | 22 ++++++++++++++++++----
>>>    2 files changed, 20 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
>>> index e57e78b619f8..c239e6dd2542 100644
>>> --- a/drivers/pci/pci.h
>>> +++ b/drivers/pci/pci.h
>>> @@ -655,6 +655,7 @@ extern const struct attribute_group aer_stats_attr_group;
>>>    void pci_aer_clear_fatal_status(struct pci_dev *dev);
>>>    void pci_aer_clear_device_status(struct pci_dev *dev);
>>>    int pci_cleanup_aer_error_status_regs(struct pci_dev *dev);
>>> +int pci_aer_raw_clear_status(struct pci_dev *dev);
>>>    #else
>>>    static inline void pci_no_aer(void) { }
>>>    static inline void pci_aer_init(struct pci_dev *d) { }
>>> @@ -665,6 +666,7 @@ static inline int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>>>    {
>>>    	return -EINVAL;
>>>    }
>>> +int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
>>>    #endif
>>>    
>>>    #ifdef CONFIG_ACPI
>>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>>> index c0540c3761dc..41afefa562b7 100644
>>> --- a/drivers/pci/pcie/aer.c
>>> +++ b/drivers/pci/pcie/aer.c
>>> @@ -420,7 +420,16 @@ void pci_aer_clear_fatal_status(struct pci_dev *dev)
>>>    		pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
>>>    }
>>>    
>>> -int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>>> +/**
>>> + * pci_aer_raw_clear_status - Clear AER error registers.
>>> + * @dev: the PCI device
>>> + *
>>> + * NOTE: Allows clearing error registers in both FF and
>>> + * non FF modes.
>>> + *
>>> + * Returns 0 on success, or negative on failure.
>>> + */
>>> +int pci_aer_raw_clear_status(struct pci_dev *dev)
>>>    {
>>>    	int pos;
>>>    	u32 status;
>>> @@ -433,9 +442,6 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>>>    	if (!pos)
>>>    		return -EIO;
>>>    
>>> -	if (pcie_aer_get_firmware_first(dev))
>>> -		return -EIO;
>>> -
>>>    	port_type = pci_pcie_type(dev);
>>>    	if (port_type == PCI_EXP_TYPE_ROOT_PORT) {
>>>    		pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status);
>>> @@ -451,6 +457,14 @@ int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>>>    	return 0;
>>>    }
>>>    
>>> +int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
>>> +{
>>> +	if (pcie_aer_get_firmware_first(dev))
>>> +		return -EIO;
>>> +
>>> +	return pci_aer_raw_clear_status(dev);
>>> +}
>>> +
>>>    void pci_save_aer_state(struct pci_dev *dev)
>>>    {
>>>    	struct pci_cap_saved_state *save_state;
>>> -- 
>>> 2.25.1
>>>
>
diff mbox series

Patch

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index e57e78b619f8..c239e6dd2542 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -655,6 +655,7 @@  extern const struct attribute_group aer_stats_attr_group;
 void pci_aer_clear_fatal_status(struct pci_dev *dev);
 void pci_aer_clear_device_status(struct pci_dev *dev);
 int pci_cleanup_aer_error_status_regs(struct pci_dev *dev);
+int pci_aer_raw_clear_status(struct pci_dev *dev);
 #else
 static inline void pci_no_aer(void) { }
 static inline void pci_aer_init(struct pci_dev *d) { }
@@ -665,6 +666,7 @@  static inline int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
 {
 	return -EINVAL;
 }
+int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; }
 #endif
 
 #ifdef CONFIG_ACPI
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index c0540c3761dc..41afefa562b7 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -420,7 +420,16 @@  void pci_aer_clear_fatal_status(struct pci_dev *dev)
 		pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
 }
 
-int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
+/**
+ * pci_aer_raw_clear_status - Clear AER error registers.
+ * @dev: the PCI device
+ *
+ * NOTE: Allows clearing error registers in both FF and
+ * non FF modes.
+ *
+ * Returns 0 on success, or negative on failure.
+ */
+int pci_aer_raw_clear_status(struct pci_dev *dev)
 {
 	int pos;
 	u32 status;
@@ -433,9 +442,6 @@  int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
 	if (!pos)
 		return -EIO;
 
-	if (pcie_aer_get_firmware_first(dev))
-		return -EIO;
-
 	port_type = pci_pcie_type(dev);
 	if (port_type == PCI_EXP_TYPE_ROOT_PORT) {
 		pci_read_config_dword(dev, pos + PCI_ERR_ROOT_STATUS, &status);
@@ -451,6 +457,14 @@  int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
 	return 0;
 }
 
+int pci_cleanup_aer_error_status_regs(struct pci_dev *dev)
+{
+	if (pcie_aer_get_firmware_first(dev))
+		return -EIO;
+
+	return pci_aer_raw_clear_status(dev);
+}
+
 void pci_save_aer_state(struct pci_dev *dev)
 {
 	struct pci_cap_saved_state *save_state;