diff mbox series

pci: pcie: AER: Fix logging of Correctable errors

Message ID 20200618155511.16009-1-Kangie@footclan.ninja
State New
Headers show
Series pci: pcie: AER: Fix logging of Correctable errors | expand

Commit Message

Matt Jolly June 18, 2020, 3:55 p.m. UTC
The AER documentation indicates that correctable (severity=Corrected)
errors should be output as a warning so that users can filter these
errors if they choose to; This functionality does not appear to have been implemented.

This patch modifies the functions aer_print_error and __aer_print_error
to send correctable errors as a warning (pci_warn), rather than as an error (pci_err). It
partially addresses several bugs in relation to kernel message buffer
spam for misbehaving devices - the root cause (possibly device firmware?) isn't
addressed, but the dmesg output is less alarming for end users, and can
be filtered separately from uncorrectable errors. This should hopefully
reduce the need for users to disable AER to suppress corrected errors.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=201517
Link: https://bugzilla.kernel.org/show_bug.cgi?id=196183

Signed-off-by: Matt Jolly <Kangie@footclan.ninja>
---
 drivers/pci/pcie/aer.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

Comments

Sinan Kaya June 19, 2020, 5:17 p.m. UTC | #1
On 6/18/2020 11:55 AM, Matt Jolly wrote:

> +		pci_warn(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> +			dev->vendor, dev->device,
> +			info->status, info->mask);
> +	} else {

<snip>

> +		pci_err(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> +			dev->vendor, dev->device,
> +			info->status, info->mask);


Function pointers for pci_warn vs. pci_err ?

This looks like a lot of copy/paste.
Joe Perches June 19, 2020, 6:09 p.m. UTC | #2
On Fri, 2020-06-19 at 13:17 -0400, Sinan Kaya wrote:
> On 6/18/2020 11:55 AM, Matt Jolly wrote:
> 
> > +		pci_warn(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> > +			dev->vendor, dev->device,
> > +			info->status, info->mask);
> > +	} else {
> 
> <snip>
> 
> > +		pci_err(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> > +			dev->vendor, dev->device,
> > +			info->status, info->mask);
> 
> Function pointers for pci_warn vs. pci_err ?

Not really possible as both are function-like macros.
Bjorn Helgaas July 8, 2020, 12:10 a.m. UTC | #3
On Fri, Jun 19, 2020 at 01:55:11AM +1000, Matt Jolly wrote:
> The AER documentation indicates that correctable (severity=Corrected)
> errors should be output as a warning so that users can filter these
> errors if they choose to; This functionality does not appear to have been implemented.
> 
> This patch modifies the functions aer_print_error and __aer_print_error
> to send correctable errors as a warning (pci_warn), rather than as an error (pci_err). It
> partially addresses several bugs in relation to kernel message buffer
> spam for misbehaving devices - the root cause (possibly device firmware?) isn't
> addressed, but the dmesg output is less alarming for end users, and can
> be filtered separately from uncorrectable errors. This should hopefully
> reduce the need for users to disable AER to suppress corrected errors.
> 
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=201517
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=196183
> 
> Signed-off-by: Matt Jolly <Kangie@footclan.ninja>
> ---
>  drivers/pci/pcie/aer.c | 36 ++++++++++++++++++++++++++----------
>  1 file changed, 26 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 3acf56683915..131ecc0df2cb 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -662,12 +662,18 @@ static void __aer_print_error(struct pci_dev *dev,
>  			errmsg = i < ARRAY_SIZE(aer_uncorrectable_error_string) ?
>  				aer_uncorrectable_error_string[i] : NULL;
>  
> -		if (errmsg)
> -			pci_err(dev, "   [%2d] %-22s%s\n", i, errmsg,
> -				info->first_error == i ? " (First)" : "");
> -		else
> +		if (errmsg) {
> +			if (info->severity == AER_CORRECTABLE) {
> +				pci_warn(dev, "   [%2d] %-22s%s\n", i, errmsg,

I think we can use pci_printk() here to reduce the code duplication.

And I think we can also simplify the aer_correctable_error_string/
aer_uncorrectable_error_string stuff above, which would make this even
simpler.

I'll respond to this with my proposal.

> +					info->first_error == i ? " (First)" : "");
> +			} else {
> +				pci_err(dev, "   [%2d] %-22s%s\n", i, errmsg,
> +					info->first_error == i ? " (First)" : "");
> +			}
> +		} else {
>  			pci_err(dev, "   [%2d] Unknown Error Bit%s\n",
>  				i, info->first_error == i ? " (First)" : "");
> +		}
>  	}
>  	pci_dev_aer_stats_incr(dev, info);
>  }
> @@ -686,13 +692,23 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>  	layer = AER_GET_LAYER_ERROR(info->severity, info->status);
>  	agent = AER_GET_AGENT(info->severity, info->status);
>  
> -	pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> -		aer_error_severity_string[info->severity],
> -		aer_error_layer[layer], aer_agent_string[agent]);
> +	if  (info->severity == AER_CORRECTABLE) {
> +		pci_warn(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> +			aer_error_severity_string[info->severity],
> +			aer_error_layer[layer], aer_agent_string[agent]);
>  
> -	pci_err(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> -		dev->vendor, dev->device,
> -		info->status, info->mask);
> +		pci_warn(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> +			dev->vendor, dev->device,
> +			info->status, info->mask);
> +	} else {
> +		pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> +			aer_error_severity_string[info->severity],
> +			aer_error_layer[layer], aer_agent_string[agent]);
> +
> +		pci_err(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> +			dev->vendor, dev->device,
> +			info->status, info->mask);
> +	}
>  
>  	__aer_print_error(dev, info);
>  
> -- 
> 2.26.2
>
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 3acf56683915..131ecc0df2cb 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -662,12 +662,18 @@  static void __aer_print_error(struct pci_dev *dev,
 			errmsg = i < ARRAY_SIZE(aer_uncorrectable_error_string) ?
 				aer_uncorrectable_error_string[i] : NULL;
 
-		if (errmsg)
-			pci_err(dev, "   [%2d] %-22s%s\n", i, errmsg,
-				info->first_error == i ? " (First)" : "");
-		else
+		if (errmsg) {
+			if (info->severity == AER_CORRECTABLE) {
+				pci_warn(dev, "   [%2d] %-22s%s\n", i, errmsg,
+					info->first_error == i ? " (First)" : "");
+			} else {
+				pci_err(dev, "   [%2d] %-22s%s\n", i, errmsg,
+					info->first_error == i ? " (First)" : "");
+			}
+		} else {
 			pci_err(dev, "   [%2d] Unknown Error Bit%s\n",
 				i, info->first_error == i ? " (First)" : "");
+		}
 	}
 	pci_dev_aer_stats_incr(dev, info);
 }
@@ -686,13 +692,23 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 	layer = AER_GET_LAYER_ERROR(info->severity, info->status);
 	agent = AER_GET_AGENT(info->severity, info->status);
 
-	pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
-		aer_error_severity_string[info->severity],
-		aer_error_layer[layer], aer_agent_string[agent]);
+	if  (info->severity == AER_CORRECTABLE) {
+		pci_warn(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
+			aer_error_severity_string[info->severity],
+			aer_error_layer[layer], aer_agent_string[agent]);
 
-	pci_err(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
-		dev->vendor, dev->device,
-		info->status, info->mask);
+		pci_warn(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
+			dev->vendor, dev->device,
+			info->status, info->mask);
+	} else {
+		pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
+			aer_error_severity_string[info->severity],
+			aer_error_layer[layer], aer_agent_string[agent]);
+
+		pci_err(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
+			dev->vendor, dev->device,
+			info->status, info->mask);
+	}
 
 	__aer_print_error(dev, info);