Patchwork [v5,2/3] aerdrv: Enhanced AER logging

login
register
mail settings
Submitter Lance Ortiz
Date Dec. 3, 2012, 9:20 p.m.
Message ID <20121203212053.24373.10914.stgit@grignak.americas.hpqcorp.net>
Download mbox | patch
Permalink /patch/203443/
State Superseded
Headers show

Comments

Lance Ortiz - Dec. 3, 2012, 9:20 p.m.
This patch will provide a more reliable and easy way for user-space
applications to have access to AER logs rather than reading them from the
message buffer. It also provides a way to notify user-space when an AER
event occurs.

The aer driver is updated to generate a trace event of function 'aer_event'
when a PCIe error is reported over the AER interface.  The trace event was
added to both the interrupt based aer path and the firmware first path.

v1-v2 fix compile errors in ifdefs.
v2-v3 Update to new location of trace header. Update print to remove
warning.
v3-v4 Reworked logic when getting ready to call cper_print_aer
Signed-off-by: Lance Ortiz <lance.ortiz@hp.com>
---

 drivers/acpi/apei/cper.c               |   19 ++++++++++++++++---
 drivers/pci/pcie/aer/aerdrv_errprint.c |   10 +++++++++-
 include/linux/aer.h                    |    2 +-
 3 files changed, 26 insertions(+), 5 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov - Dec. 4, 2012, 9:30 a.m.
On Mon, Dec 03, 2012 at 02:20:54PM -0700, Lance Ortiz wrote:
> This patch will provide a more reliable and easy way for user-space
> applications to have access to AER logs rather than reading them from the
> message buffer. It also provides a way to notify user-space when an AER
> event occurs.
> 
> The aer driver is updated to generate a trace event of function 'aer_event'
> when a PCIe error is reported over the AER interface.  The trace event was
> added to both the interrupt based aer path and the firmware first path.
> 
> v1-v2 fix compile errors in ifdefs.
> v2-v3 Update to new location of trace header. Update print to remove
> warning.
> v3-v4 Reworked logic when getting ready to call cper_print_aer
> Signed-off-by: Lance Ortiz <lance.ortiz@hp.com>
> ---
> 
>  drivers/acpi/apei/cper.c               |   19 ++++++++++++++++---
>  drivers/pci/pcie/aer/aerdrv_errprint.c |   10 +++++++++-
>  include/linux/aer.h                    |    2 +-
>  3 files changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c
> index e6defd8..4a3e945 100644
> --- a/drivers/acpi/apei/cper.c
> +++ b/drivers/acpi/apei/cper.c
> @@ -29,6 +29,7 @@
>  #include <linux/time.h>
>  #include <linux/cper.h>
>  #include <linux/acpi.h>
> +#include <linux/pci.h>
>  #include <linux/aer.h>
>  
>  /*
> @@ -249,6 +250,10 @@ static const char *cper_pcie_port_type_strs[] = {
>  static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
>  			    const struct acpi_hest_generic_data *gdata)
>  {
> +#ifdef CONFIG_ACPI_APEI_PCIEAER
> +	struct pci_dev *dev;
> +#endif
> +
>  	if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE)
>  		printk("%s""port_type: %d, %s\n", pfx, pcie->port_type,
>  		       pcie->port_type < ARRAY_SIZE(cper_pcie_port_type_strs) ?
> @@ -281,10 +286,18 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
>  	"%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n",
>  	pfx, pcie->bridge.secondary_status, pcie->bridge.control);
>  #ifdef CONFIG_ACPI_APEI_PCIEAER
> -	if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO) {
> -		struct aer_capability_regs *aer_regs = (void *)pcie->aer_info;
> -		cper_print_aer(pfx, gdata->error_severity, aer_regs);
> +	dev = pci_get_domain_bus_and_slot(pcie->device_id.segment,
> +			pcie->device_id.bus, pcie->device_id.function);
> +	if (!dev) {
> +		pr_info("PCI AER Cannot get PCI device %04x:%02x:%02x.%d\n",
> +			pcie->device_id.segment, pcie->device_id.bus,
> +			pcie->device_id.slot, pcie->device_id.function);
> +		return;
>  	}
> +	if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO)
> +		cper_print_aer(dev, gdata->error_severity,
> +				(struct aer_capability_regs *) pcie->aer_info);
> +	pci_dev_put(dev);
>  #endif
>  }
>  
> diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
> index 3ea5173..34d96e4 100644
> --- a/drivers/pci/pcie/aer/aerdrv_errprint.c
> +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
> @@ -23,6 +23,9 @@
>  
>  #include "aerdrv.h"
>  
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/ras.h>

Steve,

AFAIU, this will create all tracepoint code from the ras.h header
in this compilation unit, i.e. aerdrv_errprint.c. It has only one
tracepoint now but with time, as more RAS TPs are being added, it would
make sense to have that CREATE_TRACE_POINTS code at a more central place
in the kernel, right?

And, on configs with PCIEAER disabled, we won't have the TPs available
so the CREATE_TRACE_POINTS thing should be in a compilation unit which
gets included unconditionally, correct?

Thanks.
Steven Rostedt - Dec. 6, 2012, 1:40 a.m.
On Tue, 2012-12-04 at 10:30 +0100, Borislav Petkov wrote:

> Steve,
> 
> AFAIU, this will create all tracepoint code from the ras.h header
> in this compilation unit, i.e. aerdrv_errprint.c. It has only one
> tracepoint now but with time, as more RAS TPs are being added, it would
> make sense to have that CREATE_TRACE_POINTS code at a more central place
> in the kernel, right?

Yes.

> 
> And, on configs with PCIEAER disabled, we won't have the TPs available
> so the CREATE_TRACE_POINTS thing should be in a compilation unit which
> gets included unconditionally, correct?

You can have a config that enables these trace points, and when you
enable one of the systems that uses them, have that config select the
config that enables tracepoints. Have that config compile the file for
tracepoints. For example, in a Makefile:

obj-$(CONFIG_RAS_TRACE_POINTS) += ras-trace.o


Use #ifdef CONFIG_FOO_BAR around tracepoints that are required for
specific systems if the tracepoints in the header file are for different
subsystems that can be enabled or disabled separately by configs.

-- Steve



--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Borislav Petkov - Dec. 6, 2012, 10:24 a.m.
On Wed, Dec 05, 2012 at 08:40:31PM -0500, Steven Rostedt wrote:
> You can have a config that enables these trace points, and when you
> enable one of the systems that uses them, have that config select the
> config that enables tracepoints. Have that config compile the file for
> tracepoints. For example, in a Makefile:
> 
> obj-$(CONFIG_RAS_TRACE_POINTS) += ras-trace.o
> 
> 
> Use #ifdef CONFIG_FOO_BAR around tracepoints that are required for
> specific systems if the tracepoints in the header file are for different
> subsystems that can be enabled or disabled separately by configs.

Cool, this sounds exactly like what we should do, I'll hack it up soon.

Thanks a lot.

Patch

diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c
index e6defd8..4a3e945 100644
--- a/drivers/acpi/apei/cper.c
+++ b/drivers/acpi/apei/cper.c
@@ -29,6 +29,7 @@ 
 #include <linux/time.h>
 #include <linux/cper.h>
 #include <linux/acpi.h>
+#include <linux/pci.h>
 #include <linux/aer.h>
 
 /*
@@ -249,6 +250,10 @@  static const char *cper_pcie_port_type_strs[] = {
 static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
 			    const struct acpi_hest_generic_data *gdata)
 {
+#ifdef CONFIG_ACPI_APEI_PCIEAER
+	struct pci_dev *dev;
+#endif
+
 	if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE)
 		printk("%s""port_type: %d, %s\n", pfx, pcie->port_type,
 		       pcie->port_type < ARRAY_SIZE(cper_pcie_port_type_strs) ?
@@ -281,10 +286,18 @@  static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
 	"%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n",
 	pfx, pcie->bridge.secondary_status, pcie->bridge.control);
 #ifdef CONFIG_ACPI_APEI_PCIEAER
-	if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO) {
-		struct aer_capability_regs *aer_regs = (void *)pcie->aer_info;
-		cper_print_aer(pfx, gdata->error_severity, aer_regs);
+	dev = pci_get_domain_bus_and_slot(pcie->device_id.segment,
+			pcie->device_id.bus, pcie->device_id.function);
+	if (!dev) {
+		pr_info("PCI AER Cannot get PCI device %04x:%02x:%02x.%d\n",
+			pcie->device_id.segment, pcie->device_id.bus,
+			pcie->device_id.slot, pcie->device_id.function);
+		return;
 	}
+	if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO)
+		cper_print_aer(dev, gdata->error_severity,
+				(struct aer_capability_regs *) pcie->aer_info);
+	pci_dev_put(dev);
 #endif
 }
 
diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
index 3ea5173..34d96e4 100644
--- a/drivers/pci/pcie/aer/aerdrv_errprint.c
+++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
@@ -23,6 +23,9 @@ 
 
 #include "aerdrv.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/ras.h>
+
 #define AER_AGENT_RECEIVER		0
 #define AER_AGENT_REQUESTER		1
 #define AER_AGENT_COMPLETER		2
@@ -194,6 +197,8 @@  void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
 	if (info->id && info->error_dev_num > 1 && info->id == id)
 		printk("%s""  Error of this Agent(%04x) is reported first\n",
 			prefix, id);
+	trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
+			info->severity);
 }
 
 void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
@@ -217,12 +222,13 @@  int cper_severity_to_aer(int cper_severity)
 }
 EXPORT_SYMBOL_GPL(cper_severity_to_aer);
 
-void cper_print_aer(const char *prefix, int cper_severity,
+void cper_print_aer(struct pci_dev *dev, int cper_severity,
 		    struct aer_capability_regs *aer)
 {
 	int aer_severity, layer, agent, status_strs_size, tlp_header_valid = 0;
 	u32 status, mask;
 	const char **status_strs;
+	char *prefix = NULL;
 
 	aer_severity = cper_severity_to_aer(cper_severity);
 	if (aer_severity == AER_CORRECTABLE) {
@@ -259,5 +265,7 @@  void cper_print_aer(const char *prefix, int cper_severity,
 			*(tlp + 8), *(tlp + 15), *(tlp + 14),
 			*(tlp + 13), *(tlp + 12));
 	}
+	trace_aer_event(dev_name(&dev->dev), (status & ~mask),
+			aer_severity);
 }
 #endif
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 544abdb..7b86dc6 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -49,7 +49,7 @@  static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 }
 #endif
 
-extern void cper_print_aer(const char *prefix, int cper_severity,
+extern void cper_print_aer(struct pci_dev *dev, int cper_severity,
 			   struct aer_capability_regs *aer);
 extern int cper_severity_to_aer(int cper_severity);
 extern void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,