diff mbox series

[v3,08/11] cxl/pci: add tracepoint events for CXL RAS

Message ID 166879132997.674819.12112190531427523276.stgit@djiang5-desk3.ch.intel.com
State New
Headers show
Series cxl/pci: Add fundamental error handling | expand

Commit Message

Dave Jiang Nov. 18, 2022, 5:08 p.m. UTC
Add tracepoint events for recording the CXL uncorrectable and correctable
errors. For uncorrectable errors, there is an additional 512B of data from
the header log register (CXL spec rev3 8.2.4.16.7). The trace event takes
an array argument and records the entire Header Log data. If multiple
errors are set in the status register, then the 'first error' field
(CXL spec rev3 8.2.4.16.6) is read from the Error Capabilities and Control
Register in order to determine the error.

This implementation does not include CXL IDE Error details.

Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/pci.c          |    2 +
 include/trace/events/cxl.h |  110 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 include/trace/events/cxl.h

Comments

Steven Rostedt Nov. 18, 2022, 5:17 p.m. UTC | #1
On Fri, 18 Nov 2022 10:08:49 -0700
Dave Jiang <dave.jiang@intel.com> wrote:

> +TRACE_EVENT(cxl_aer_uncorrectable_error,
> +	TP_PROTO(const char *dev_name, u32 status, u32 fe, u32 *hl),
> +	TP_ARGS(dev_name, status, fe, hl),
> +	TP_STRUCT__entry(
> +		__string(dev_name, dev_name)
> +		__field(u32, status)
> +		__field(u32, first_error)
> +		__dynamic_array(u32, header_log, CXL_HEADERLOG_SIZE_U32)

If this is a fixed size, you do not need to use __dynamic_array, but
instead just use __array()

		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32);

> +	),
> +	TP_fast_assign(
> +		__assign_str(dev_name, dev_name);
> +		__entry->status = status;
> +		__entry->first_error = fe;
> +		/*
> +		 * Embed the 512B headerlog data for user app retrieval and
> +		 * parsing, but no need to print this in the trace buffer.
> +		 */
> +		memcpy(__get_dynamic_array(header_log), hl, CXL_HEADERLOG_SIZE);

		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);

This will be smaller and faster.

-- Steve


> +	),
> +	TP_printk("%s: status: '%s' first_error: '%s'",
> +		  __get_str(dev_name),
> +		  show_uc_errs(__entry->status),
> +		  show_uc_errs(__entry->first_error)
> +	)
> +);
> +
Dave Jiang Nov. 18, 2022, 5:31 p.m. UTC | #2
On 11/18/2022 10:17 AM, Steven Rostedt wrote:
> On Fri, 18 Nov 2022 10:08:49 -0700
> Dave Jiang <dave.jiang@intel.com> wrote:
> 
>> +TRACE_EVENT(cxl_aer_uncorrectable_error,
>> +	TP_PROTO(const char *dev_name, u32 status, u32 fe, u32 *hl),
>> +	TP_ARGS(dev_name, status, fe, hl),
>> +	TP_STRUCT__entry(
>> +		__string(dev_name, dev_name)
>> +		__field(u32, status)
>> +		__field(u32, first_error)
>> +		__dynamic_array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
> 
> If this is a fixed size, you do not need to use __dynamic_array, but
> instead just use __array()
> 
> 		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32);

Thanks! I will update.

> 
>> +	),
>> +	TP_fast_assign(
>> +		__assign_str(dev_name, dev_name);
>> +		__entry->status = status;
>> +		__entry->first_error = fe;
>> +		/*
>> +		 * Embed the 512B headerlog data for user app retrieval and
>> +		 * parsing, but no need to print this in the trace buffer.
>> +		 */
>> +		memcpy(__get_dynamic_array(header_log), hl, CXL_HEADERLOG_SIZE);
> 
> 		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
> 
> This will be smaller and faster.
> 
> -- Steve
> 
> 
>> +	),
>> +	TP_printk("%s: status: '%s' first_error: '%s'",
>> +		  __get_str(dev_name),
>> +		  show_uc_errs(__entry->status),
>> +		  show_uc_errs(__entry->first_error)
>> +	)
>> +);
>> +
Jonathan Cameron Nov. 21, 2022, 11:37 a.m. UTC | #3
On Fri, 18 Nov 2022 10:08:49 -0700
Dave Jiang <dave.jiang@intel.com> wrote:

> Add tracepoint events for recording the CXL uncorrectable and correctable
> errors. For uncorrectable errors, there is additional data of 512B from
> the header log register (CXL spec rev3 8.2.4.16.7). The trace event will
> intake a dynamic array that will dump the entire Header Log data. If
> multiple errors are set in the status register, then the
> 'first error' field (CXL spec rev3 v8.2.4.16.6) is read from the Error
> Capabilities and Control Register in order to determine the error.
> 
> This implementation does not include CXL IDE Error details.
> 
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
With the stuff Steven raised tidied up this looks good to me now.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>



> ---
>  drivers/cxl/pci.c          |    2 +
>  include/trace/events/cxl.h |  110 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 112 insertions(+)
>  create mode 100644 include/trace/events/cxl.h
> 
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 9428f3e0d99b..0f36a5861a7b 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -13,6 +13,8 @@
>  #include "cxlmem.h"
>  #include "cxlpci.h"
>  #include "cxl.h"
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/cxl.h>
>  
>  /**
>   * DOC: cxl pci
> diff --git a/include/trace/events/cxl.h b/include/trace/events/cxl.h
> new file mode 100644
> index 000000000000..f8e95d977133
> --- /dev/null
> +++ b/include/trace/events/cxl.h
> @@ -0,0 +1,110 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM cxl
> +
> +#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _CXL_EVENTS_H
> +
> +#include <linux/tracepoint.h>
> +
> +#define CXL_HEADERLOG_SIZE		SZ_512
> +#define CXL_HEADERLOG_SIZE_U32		SZ_512 / sizeof(u32)
> +
> +#define CXL_RAS_UC_CACHE_DATA_PARITY	BIT(0)
> +#define CXL_RAS_UC_CACHE_ADDR_PARITY	BIT(1)
> +#define CXL_RAS_UC_CACHE_BE_PARITY	BIT(2)
> +#define CXL_RAS_UC_CACHE_DATA_ECC	BIT(3)
> +#define CXL_RAS_UC_MEM_DATA_PARITY	BIT(4)
> +#define CXL_RAS_UC_MEM_ADDR_PARITY	BIT(5)
> +#define CXL_RAS_UC_MEM_BE_PARITY	BIT(6)
> +#define CXL_RAS_UC_MEM_DATA_ECC		BIT(7)
> +#define CXL_RAS_UC_REINIT_THRESH	BIT(8)
> +#define CXL_RAS_UC_RSVD_ENCODE		BIT(9)
> +#define CXL_RAS_UC_POISON		BIT(10)
> +#define CXL_RAS_UC_RECV_OVERFLOW	BIT(11)
> +#define CXL_RAS_UC_INTERNAL_ERR		BIT(14)
> +#define CXL_RAS_UC_IDE_TX_ERR		BIT(15)
> +#define CXL_RAS_UC_IDE_RX_ERR		BIT(16)
> +
> +#define show_uc_errs(status)	__print_flags(status, " | ",		  \
> +	{ CXL_RAS_UC_CACHE_DATA_PARITY, "Cache Data Parity Error" },	  \
> +	{ CXL_RAS_UC_CACHE_ADDR_PARITY, "Cache Address Parity Error" },	  \
> +	{ CXL_RAS_UC_CACHE_BE_PARITY, "Cache Byte Enable Parity Error" }, \
> +	{ CXL_RAS_UC_CACHE_DATA_ECC, "Cache Data ECC Error" },		  \
> +	{ CXL_RAS_UC_MEM_DATA_PARITY, "Memory Data Parity Error" },	  \
> +	{ CXL_RAS_UC_MEM_ADDR_PARITY, "Memory Address Parity Error" },	  \
> +	{ CXL_RAS_UC_MEM_BE_PARITY, "Memory Byte Enable Parity Error" },  \
> +	{ CXL_RAS_UC_MEM_DATA_ECC, "Memory Data ECC Error" },		  \
> +	{ CXL_RAS_UC_REINIT_THRESH, "REINIT Threshold Hit" },		  \
> +	{ CXL_RAS_UC_RSVD_ENCODE, "Received Unrecognized Encoding" },	  \
> +	{ CXL_RAS_UC_POISON, "Received Poison From Peer" },		  \
> +	{ CXL_RAS_UC_RECV_OVERFLOW, "Receiver Overflow" },		  \
> +	{ CXL_RAS_UC_INTERNAL_ERR, "Component Specific Error" },	  \
> +	{ CXL_RAS_UC_IDE_TX_ERR, "IDE Tx Error" },			  \
> +	{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" }			  \
> +)
> +
> +TRACE_EVENT(cxl_aer_uncorrectable_error,
> +	TP_PROTO(const char *dev_name, u32 status, u32 fe, u32 *hl),
> +	TP_ARGS(dev_name, status, fe, hl),
> +	TP_STRUCT__entry(
> +		__string(dev_name, dev_name)
> +		__field(u32, status)
> +		__field(u32, first_error)
> +		__dynamic_array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
> +	),
> +	TP_fast_assign(
> +		__assign_str(dev_name, dev_name);
> +		__entry->status = status;
> +		__entry->first_error = fe;
> +		/*
> +		 * Embed the 512B headerlog data for user app retrieval and
> +		 * parsing, but no need to print this in the trace buffer.
> +		 */
> +		memcpy(__get_dynamic_array(header_log), hl, CXL_HEADERLOG_SIZE);
> +	),
> +	TP_printk("%s: status: '%s' first_error: '%s'",
> +		  __get_str(dev_name),
> +		  show_uc_errs(__entry->status),
> +		  show_uc_errs(__entry->first_error)
> +	)
> +);
> +
> +#define CXL_RAS_CE_CACHE_DATA_ECC	BIT(0)
> +#define CXL_RAS_CE_MEM_DATA_ECC		BIT(1)
> +#define CXL_RAS_CE_CRC_THRESH		BIT(2)
> +#define CXL_RAS_CE_CACHE_POISON		BIT(3)
> +#define CXL_RAS_CE_MEM_POISON		BIT(4)
> +#define CXL_RAS_CE_PHYS_LAYER_ERR	BIT(5)
> +
> +#define show_ce_errs(status)	__print_flags(status, " | ",			\
> +	{ CXL_RAS_CE_CACHE_DATA_ECC, "Cache Data ECC Error" },			\
> +	{ CXL_RAS_CE_MEM_DATA_ECC, "Memory Data Ecc Error" },			\
> +	{ CXL_RAS_CE_CRC_THRESH, "CRC Threshold Hit" },				\
> +	{ CXL_RAS_CE_CACHE_POISON, "Received Cache Poison From Peer" },		\
> +	{ CXL_RAS_CE_MEM_POISON, "Received Memory Poison From Peer" },		\
> +	{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" }	\
> +)
> +
> +TRACE_EVENT(cxl_aer_correctable_error,
> +	TP_PROTO(const char *dev_name, u32 status),
> +	TP_ARGS(dev_name, status),
> +	TP_STRUCT__entry(
> +		__string(dev_name, dev_name)
> +		__field(u32, status)
> +	),
> +	TP_fast_assign(
> +		__assign_str(dev_name, dev_name);
> +		__entry->status = status;
> +	),
> +	TP_printk("%s: status: '%s'",
> +		  __get_str(dev_name), show_ce_errs(__entry->status)
> +	)
> +);
> +
> +#endif /* _CXL_EVENTS_H */
> +
> +/* This part must be outside protection */
> +#undef TRACE_INCLUDE_FILE
> +#define TRACE_INCLUDE_FILE cxl
> +#include <trace/define_trace.h>
> 
>
Shiju Jose Nov. 21, 2022, 1:08 p.m. UTC | #4
Hi Dave,

Please see few comments.

>-----Original Message-----
>From: Dave Jiang <dave.jiang@intel.com>
>Sent: 18 November 2022 17:09
>To: linux-cxl@vger.kernel.org; linux-pci@vger.kernel.org
>Cc: dan.j.williams@intel.com; ira.weiny@intel.com; vishal.l.verma@intel.com;
>alison.schofield@intel.com; Jonathan Cameron
><jonathan.cameron@huawei.com>; rostedt@goodmis.org;
>terry.bowman@amd.com; bhelgaas@google.com
>Subject: [PATCH v3 08/11] cxl/pci: add tracepoint events for CXL RAS
>
>Add tracepoint events for recording the CXL uncorrectable and correctable
>errors. For uncorrectable errors, there is additional data of 512B from the
>header log register (CXL spec rev3 8.2.4.16.7). The trace event will intake a
>dynamic array that will dump the entire Header Log data. If multiple errors are
>set in the status register, then the 'first error' field (CXL spec rev3 v8.2.4.16.6)
>is read from the Error Capabilities and Control Register in order to determine
>the error.
>
>This implementation does not include CXL IDE Error details.
>
>Cc: Steven Rostedt <rostedt@goodmis.org>
>Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>---
> drivers/cxl/pci.c          |    2 +
> include/trace/events/cxl.h |  110
>++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 112 insertions(+)
> create mode 100644 include/trace/events/cxl.h
>
>diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index
>9428f3e0d99b..0f36a5861a7b 100644
>--- a/drivers/cxl/pci.c
>+++ b/drivers/cxl/pci.c
>@@ -13,6 +13,8 @@
> #include "cxlmem.h"
> #include "cxlpci.h"
> #include "cxl.h"
>+#define CREATE_TRACE_POINTS
>+#include <trace/events/cxl.h>
>
> /**
>  * DOC: cxl pci
>diff --git a/include/trace/events/cxl.h b/include/trace/events/cxl.h new file
>mode 100644 index 000000000000..f8e95d977133
>--- /dev/null
>+++ b/include/trace/events/cxl.h
>@@ -0,0 +1,110 @@
>+/* SPDX-License-Identifier: GPL-2.0 */
>+#undef TRACE_SYSTEM
>+#define TRACE_SYSTEM cxl
>+
>+#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
>#define
>+_CXL_EVENTS_H
>+
>+#include <linux/tracepoint.h>
>+
>+#define CXL_HEADERLOG_SIZE		SZ_512
>+#define CXL_HEADERLOG_SIZE_U32		SZ_512 / sizeof(u32)
>+
>+#define CXL_RAS_UC_CACHE_DATA_PARITY	BIT(0)
>+#define CXL_RAS_UC_CACHE_ADDR_PARITY	BIT(1)
>+#define CXL_RAS_UC_CACHE_BE_PARITY	BIT(2)
>+#define CXL_RAS_UC_CACHE_DATA_ECC	BIT(3)
>+#define CXL_RAS_UC_MEM_DATA_PARITY	BIT(4)
>+#define CXL_RAS_UC_MEM_ADDR_PARITY	BIT(5)
>+#define CXL_RAS_UC_MEM_BE_PARITY	BIT(6)
>+#define CXL_RAS_UC_MEM_DATA_ECC		BIT(7)
>+#define CXL_RAS_UC_REINIT_THRESH	BIT(8)
>+#define CXL_RAS_UC_RSVD_ENCODE		BIT(9)
>+#define CXL_RAS_UC_POISON		BIT(10)
>+#define CXL_RAS_UC_RECV_OVERFLOW	BIT(11)
>+#define CXL_RAS_UC_INTERNAL_ERR		BIT(14)
>+#define CXL_RAS_UC_IDE_TX_ERR		BIT(15)
>+#define CXL_RAS_UC_IDE_RX_ERR		BIT(16)
>+
>+#define show_uc_errs(status)	__print_flags(status, " | ",
>	  \
>+	{ CXL_RAS_UC_CACHE_DATA_PARITY, "Cache Data Parity Error" },
>	  \
>+	{ CXL_RAS_UC_CACHE_ADDR_PARITY, "Cache Address Parity Error" },
>	  \
>+	{ CXL_RAS_UC_CACHE_BE_PARITY, "Cache Byte Enable Parity Error" },
>\
>+	{ CXL_RAS_UC_CACHE_DATA_ECC, "Cache Data ECC Error" },
>	  \
>+	{ CXL_RAS_UC_MEM_DATA_PARITY, "Memory Data Parity Error" },
>	  \
>+	{ CXL_RAS_UC_MEM_ADDR_PARITY, "Memory Address Parity Error"
>},	  \
>+	{ CXL_RAS_UC_MEM_BE_PARITY, "Memory Byte Enable Parity Error"
>},  \
>+	{ CXL_RAS_UC_MEM_DATA_ECC, "Memory Data ECC Error" },
>	  \
>+	{ CXL_RAS_UC_REINIT_THRESH, "REINIT Threshold Hit" },
>	  \
>+	{ CXL_RAS_UC_RSVD_ENCODE, "Received Unrecognized Encoding" },
>	  \
>+	{ CXL_RAS_UC_POISON, "Received Poison From Peer" },
>	  \
>+	{ CXL_RAS_UC_RECV_OVERFLOW, "Receiver Overflow" },
>	  \
>+	{ CXL_RAS_UC_INTERNAL_ERR, "Component Specific Error" },	  \
>+	{ CXL_RAS_UC_IDE_TX_ERR, "IDE Tx Error" },			  \
>+	{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" }			  \
>+)
>+
>+TRACE_EVENT(cxl_aer_uncorrectable_error,
>+	TP_PROTO(const char *dev_name, u32 status, u32 fe, u32 *hl),
>+	TP_ARGS(dev_name, status, fe, hl),
>+	TP_STRUCT__entry(
>+		__string(dev_name, dev_name)
>+		__field(u32, status)
>+		__field(u32, first_error)
>+		__dynamic_array(u32, header_log,
>CXL_HEADERLOG_SIZE_U32)
>+	),
>+	TP_fast_assign(
>+		__assign_str(dev_name, dev_name);
>+		__entry->status = status;
>+		__entry->first_error = fe;
>+		/*
>+		 * Embed the 512B headerlog data for user app retrieval and
>+		 * parsing, but no need to print this in the trace buffer.
>+		 */
>+		memcpy(__get_dynamic_array(header_log), hl,
>CXL_HEADERLOG_SIZE);
>+	),
>+	TP_printk("%s: status: '%s' first_error: '%s'",
>+		  __get_str(dev_name),
>+		  show_uc_errs(__entry->status),
>+		  show_uc_errs(__entry->first_error)
>+	)
>+);
>+
>+#define CXL_RAS_CE_CACHE_DATA_ECC	BIT(0)
>+#define CXL_RAS_CE_MEM_DATA_ECC		BIT(1)
>+#define CXL_RAS_CE_CRC_THRESH		BIT(2)

I think bit location 3, "Retry_Threshold: Retry Threshold Hit", of the
Correctable Error Status Register in the CXL 3.0 specification is missing.
If so, please also correct the bit locations of the subsequent correctable errors.
  
>+#define CXL_RAS_CE_CACHE_POISON		BIT(3)
>+#define CXL_RAS_CE_MEM_POISON		BIT(4)
>+#define CXL_RAS_CE_PHYS_LAYER_ERR	BIT(5)
>+
>+#define show_ce_errs(status)	__print_flags(status, " | ",
>		\
>+	{ CXL_RAS_CE_CACHE_DATA_ECC, "Cache Data ECC Error" },
>		\
>+	{ CXL_RAS_CE_MEM_DATA_ECC, "Memory Data Ecc Error" },

Please change "Ecc" to "ECC".

>		\
>+	{ CXL_RAS_CE_CRC_THRESH, "CRC Threshold Hit" },
>		\
>+	{ CXL_RAS_CE_CACHE_POISON, "Received Cache Poison From Peer"
>},		\
>+	{ CXL_RAS_CE_MEM_POISON, "Received Memory Poison From Peer"
>},		\
>+	{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical
>Layer" }	\
>+)
>+
>+TRACE_EVENT(cxl_aer_correctable_error,
>+	TP_PROTO(const char *dev_name, u32 status),
>+	TP_ARGS(dev_name, status),
>+	TP_STRUCT__entry(
>+		__string(dev_name, dev_name)
>+		__field(u32, status)
>+	),
>+	TP_fast_assign(
>+		__assign_str(dev_name, dev_name);
>+		__entry->status = status;
>+	),
>+	TP_printk("%s: status: '%s'",
>+		  __get_str(dev_name), show_ce_errs(__entry->status)
>+	)
>+);
>+
>+#endif /* _CXL_EVENTS_H */
>+
>+/* This part must be outside protection */ #undef TRACE_INCLUDE_FILE
>+#define TRACE_INCLUDE_FILE cxl #include <trace/define_trace.h>
>

Thanks,
Shiju
Dave Jiang Nov. 28, 2022, 5:54 p.m. UTC | #5
On 11/21/2022 6:08 AM, Shiju Jose wrote:
> Hi Dave,
> 
> Please see few comments.
> 
>> -----Original Message-----
>> From: Dave Jiang <dave.jiang@intel.com>
>> Sent: 18 November 2022 17:09
>> To: linux-cxl@vger.kernel.org; linux-pci@vger.kernel.org
>> Cc: dan.j.williams@intel.com; ira.weiny@intel.com; vishal.l.verma@intel.com;
>> alison.schofield@intel.com; Jonathan Cameron
>> <jonathan.cameron@huawei.com>; rostedt@goodmis.org;
>> terry.bowman@amd.com; bhelgaas@google.com
>> Subject: [PATCH v3 08/11] cxl/pci: add tracepoint events for CXL RAS
>>
>> Add tracepoint events for recording the CXL uncorrectable and correctable
>> errors. For uncorrectable errors, there is additional data of 512B from the
>> header log register (CXL spec rev3 8.2.4.16.7). The trace event will intake a
>> dynamic array that will dump the entire Header Log data. If multiple errors are
>> set in the status register, then the 'first error' field (CXL spec rev3 v8.2.4.16.6)
>> is read from the Error Capabilities and Control Register in order to determine
>> the error.
>>
>> This implementation does not include CXL IDE Error details.
>>
>> Cc: Steven Rostedt <rostedt@goodmis.org>
>> Signed-off-by: Dave Jiang <dave.jiang@intel.com>
>> ---
>> drivers/cxl/pci.c          |    2 +
>> include/trace/events/cxl.h |  110
>> ++++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 112 insertions(+)
>> create mode 100644 include/trace/events/cxl.h
>>
>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index
>> 9428f3e0d99b..0f36a5861a7b 100644
>> --- a/drivers/cxl/pci.c
>> +++ b/drivers/cxl/pci.c
>> @@ -13,6 +13,8 @@
>> #include "cxlmem.h"
>> #include "cxlpci.h"
>> #include "cxl.h"
>> +#define CREATE_TRACE_POINTS
>> +#include <trace/events/cxl.h>
>>
>> /**
>>   * DOC: cxl pci
>> diff --git a/include/trace/events/cxl.h b/include/trace/events/cxl.h new file
>> mode 100644 index 000000000000..f8e95d977133
>> --- /dev/null
>> +++ b/include/trace/events/cxl.h
>> @@ -0,0 +1,110 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#undef TRACE_SYSTEM
>> +#define TRACE_SYSTEM cxl
>> +
>> +#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
>> #define
>> +_CXL_EVENTS_H
>> +
>> +#include <linux/tracepoint.h>
>> +
>> +#define CXL_HEADERLOG_SIZE		SZ_512
>> +#define CXL_HEADERLOG_SIZE_U32		SZ_512 / sizeof(u32)
>> +
>> +#define CXL_RAS_UC_CACHE_DATA_PARITY	BIT(0)
>> +#define CXL_RAS_UC_CACHE_ADDR_PARITY	BIT(1)
>> +#define CXL_RAS_UC_CACHE_BE_PARITY	BIT(2)
>> +#define CXL_RAS_UC_CACHE_DATA_ECC	BIT(3)
>> +#define CXL_RAS_UC_MEM_DATA_PARITY	BIT(4)
>> +#define CXL_RAS_UC_MEM_ADDR_PARITY	BIT(5)
>> +#define CXL_RAS_UC_MEM_BE_PARITY	BIT(6)
>> +#define CXL_RAS_UC_MEM_DATA_ECC		BIT(7)
>> +#define CXL_RAS_UC_REINIT_THRESH	BIT(8)
>> +#define CXL_RAS_UC_RSVD_ENCODE		BIT(9)
>> +#define CXL_RAS_UC_POISON		BIT(10)
>> +#define CXL_RAS_UC_RECV_OVERFLOW	BIT(11)
>> +#define CXL_RAS_UC_INTERNAL_ERR		BIT(14)
>> +#define CXL_RAS_UC_IDE_TX_ERR		BIT(15)
>> +#define CXL_RAS_UC_IDE_RX_ERR		BIT(16)
>> +
>> +#define show_uc_errs(status)	__print_flags(status, " | ",
>> 	  \
>> +	{ CXL_RAS_UC_CACHE_DATA_PARITY, "Cache Data Parity Error" },
>> 	  \
>> +	{ CXL_RAS_UC_CACHE_ADDR_PARITY, "Cache Address Parity Error" },
>> 	  \
>> +	{ CXL_RAS_UC_CACHE_BE_PARITY, "Cache Byte Enable Parity Error" },
>> \
>> +	{ CXL_RAS_UC_CACHE_DATA_ECC, "Cache Data ECC Error" },
>> 	  \
>> +	{ CXL_RAS_UC_MEM_DATA_PARITY, "Memory Data Parity Error" },
>> 	  \
>> +	{ CXL_RAS_UC_MEM_ADDR_PARITY, "Memory Address Parity Error"
>> },	  \
>> +	{ CXL_RAS_UC_MEM_BE_PARITY, "Memory Byte Enable Parity Error"
>> },  \
>> +	{ CXL_RAS_UC_MEM_DATA_ECC, "Memory Data ECC Error" },
>> 	  \
>> +	{ CXL_RAS_UC_REINIT_THRESH, "REINIT Threshold Hit" },
>> 	  \
>> +	{ CXL_RAS_UC_RSVD_ENCODE, "Received Unrecognized Encoding" },
>> 	  \
>> +	{ CXL_RAS_UC_POISON, "Received Poison From Peer" },
>> 	  \
>> +	{ CXL_RAS_UC_RECV_OVERFLOW, "Receiver Overflow" },
>> 	  \
>> +	{ CXL_RAS_UC_INTERNAL_ERR, "Component Specific Error" },	  \
>> +	{ CXL_RAS_UC_IDE_TX_ERR, "IDE Tx Error" },			  \
>> +	{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" }			  \
>> +)
>> +
>> +TRACE_EVENT(cxl_aer_uncorrectable_error,
>> +	TP_PROTO(const char *dev_name, u32 status, u32 fe, u32 *hl),
>> +	TP_ARGS(dev_name, status, fe, hl),
>> +	TP_STRUCT__entry(
>> +		__string(dev_name, dev_name)
>> +		__field(u32, status)
>> +		__field(u32, first_error)
>> +		__dynamic_array(u32, header_log,
>> CXL_HEADERLOG_SIZE_U32)
>> +	),
>> +	TP_fast_assign(
>> +		__assign_str(dev_name, dev_name);
>> +		__entry->status = status;
>> +		__entry->first_error = fe;
>> +		/*
>> +		 * Embed the 512B headerlog data for user app retrieval and
>> +		 * parsing, but no need to print this in the trace buffer.
>> +		 */
>> +		memcpy(__get_dynamic_array(header_log), hl,
>> CXL_HEADERLOG_SIZE);
>> +	),
>> +	TP_printk("%s: status: '%s' first_error: '%s'",
>> +		  __get_str(dev_name),
>> +		  show_uc_errs(__entry->status),
>> +		  show_uc_errs(__entry->first_error)
>> +	)
>> +);
>> +
>> +#define CXL_RAS_CE_CACHE_DATA_ECC	BIT(0)
>> +#define CXL_RAS_CE_MEM_DATA_ECC		BIT(1)
>> +#define CXL_RAS_CE_CRC_THRESH		BIT(2)
> 
> I think the Bit Location 3  "Retry_Threshold: Retry Threshold Hit. "  as per the
> Correctable Error Status Register in the CXL 3.0 specification is missing?
> If so, please correct the bit location of the subsequent corrected errors as well.

Yes thanks! I don't know how I completely skipped over that.

>    
>> +#define CXL_RAS_CE_CACHE_POISON		BIT(3)
>> +#define CXL_RAS_CE_MEM_POISON		BIT(4)
>> +#define CXL_RAS_CE_PHYS_LAYER_ERR	BIT(5)
>> +
>> +#define show_ce_errs(status)	__print_flags(status, " | ",
>> 		\
>> +	{ CXL_RAS_CE_CACHE_DATA_ECC, "Cache Data ECC Error" },
>> 		\
>> +	{ CXL_RAS_CE_MEM_DATA_ECC, "Memory Data Ecc Error" },
> 
> Please change "Ecc" to "ECC".

Will fix

> 
>> 		\
>> +	{ CXL_RAS_CE_CRC_THRESH, "CRC Threshold Hit" },
>> 		\
>> +	{ CXL_RAS_CE_CACHE_POISON, "Received Cache Poison From Peer"
>> },		\
>> +	{ CXL_RAS_CE_MEM_POISON, "Received Memory Poison From Peer"
>> },		\
>> +	{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical
>> Layer" }	\
>> +)
>> +
>> +TRACE_EVENT(cxl_aer_correctable_error,
>> +	TP_PROTO(const char *dev_name, u32 status),
>> +	TP_ARGS(dev_name, status),
>> +	TP_STRUCT__entry(
>> +		__string(dev_name, dev_name)
>> +		__field(u32, status)
>> +	),
>> +	TP_fast_assign(
>> +		__assign_str(dev_name, dev_name);
>> +		__entry->status = status;
>> +	),
>> +	TP_printk("%s: status: '%s'",
>> +		  __get_str(dev_name), show_ce_errs(__entry->status)
>> +	)
>> +);
>> +
>> +#endif /* _CXL_EVENTS_H */
>> +
>> +/* This part must be outside protection */ #undef TRACE_INCLUDE_FILE
>> +#define TRACE_INCLUDE_FILE cxl #include <trace/define_trace.h>
>>
> 
> Thanks,
> Shiju
diff mbox series

Patch

diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 9428f3e0d99b..0f36a5861a7b 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -13,6 +13,8 @@ 
 #include "cxlmem.h"
 #include "cxlpci.h"
 #include "cxl.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/cxl.h>
 
 /**
  * DOC: cxl pci
diff --git a/include/trace/events/cxl.h b/include/trace/events/cxl.h
new file mode 100644
index 000000000000..f8e95d977133
--- /dev/null
+++ b/include/trace/events/cxl.h
@@ -0,0 +1,128 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cxl
+
+#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _CXL_EVENTS_H
+
+#include <linux/tracepoint.h>
+
+/* Size in bytes of the RAS Header Log (CXL spec rev3 8.2.4.16.7). */
+#define CXL_HEADERLOG_SIZE		SZ_512
+/* Header Log length in u32 words; parenthesized for safe macro expansion. */
+#define CXL_HEADERLOG_SIZE_U32		(CXL_HEADERLOG_SIZE / sizeof(u32))
+
+/* Uncorrectable error status bits decoded by show_uc_errs(). */
+#define CXL_RAS_UC_CACHE_DATA_PARITY	BIT(0)
+#define CXL_RAS_UC_CACHE_ADDR_PARITY	BIT(1)
+#define CXL_RAS_UC_CACHE_BE_PARITY	BIT(2)
+#define CXL_RAS_UC_CACHE_DATA_ECC	BIT(3)
+#define CXL_RAS_UC_MEM_DATA_PARITY	BIT(4)
+#define CXL_RAS_UC_MEM_ADDR_PARITY	BIT(5)
+#define CXL_RAS_UC_MEM_BE_PARITY	BIT(6)
+#define CXL_RAS_UC_MEM_DATA_ECC		BIT(7)
+#define CXL_RAS_UC_REINIT_THRESH	BIT(8)
+#define CXL_RAS_UC_RSVD_ENCODE		BIT(9)
+#define CXL_RAS_UC_POISON		BIT(10)
+#define CXL_RAS_UC_RECV_OVERFLOW	BIT(11)
+#define CXL_RAS_UC_INTERNAL_ERR		BIT(14)
+#define CXL_RAS_UC_IDE_TX_ERR		BIT(15)
+#define CXL_RAS_UC_IDE_RX_ERR		BIT(16)
+
+#define show_uc_errs(status)	__print_flags(status, " | ",		  \
+	{ CXL_RAS_UC_CACHE_DATA_PARITY, "Cache Data Parity Error" },	  \
+	{ CXL_RAS_UC_CACHE_ADDR_PARITY, "Cache Address Parity Error" },	  \
+	{ CXL_RAS_UC_CACHE_BE_PARITY, "Cache Byte Enable Parity Error" }, \
+	{ CXL_RAS_UC_CACHE_DATA_ECC, "Cache Data ECC Error" },		  \
+	{ CXL_RAS_UC_MEM_DATA_PARITY, "Memory Data Parity Error" },	  \
+	{ CXL_RAS_UC_MEM_ADDR_PARITY, "Memory Address Parity Error" },	  \
+	{ CXL_RAS_UC_MEM_BE_PARITY, "Memory Byte Enable Parity Error" },  \
+	{ CXL_RAS_UC_MEM_DATA_ECC, "Memory Data ECC Error" },		  \
+	{ CXL_RAS_UC_REINIT_THRESH, "REINIT Threshold Hit" },		  \
+	{ CXL_RAS_UC_RSVD_ENCODE, "Received Unrecognized Encoding" },	  \
+	{ CXL_RAS_UC_POISON, "Received Poison From Peer" },		  \
+	{ CXL_RAS_UC_RECV_OVERFLOW, "Receiver Overflow" },		  \
+	{ CXL_RAS_UC_INTERNAL_ERR, "Component Specific Error" },	  \
+	{ CXL_RAS_UC_IDE_TX_ERR, "IDE Tx Error" },			  \
+	{ CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" }			  \
+)
+
+/*
+ * cxl_aer_uncorrectable_error - trace a CXL uncorrectable RAS error.
+ * @dev_name: device name string, printed first in the trace output
+ * @status: error status bits, decoded with show_uc_errs()
+ * @fe: first error value, decoded with show_uc_errs() as well
+ * @hl: Header Log; CXL_HEADERLOG_SIZE bytes are copied into the event
+ */
+TRACE_EVENT(cxl_aer_uncorrectable_error,
+	TP_PROTO(const char *dev_name, u32 status, u32 fe, u32 *hl),
+	TP_ARGS(dev_name, status, fe, hl),
+	TP_STRUCT__entry(
+		__string(dev_name, dev_name)
+		__field(u32, status)
+		__field(u32, first_error)
+		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
+	),
+	TP_fast_assign(
+		__assign_str(dev_name, dev_name);
+		__entry->status = status;
+		__entry->first_error = fe;
+		/*
+		 * Embed the 512B headerlog data for user app retrieval and
+		 * parsing, but no need to print this in the trace buffer.
+		 */
+		memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
+	),
+	TP_printk("%s: status: '%s' first_error: '%s'",
+		  __get_str(dev_name),
+		  show_uc_errs(__entry->status),
+		  show_uc_errs(__entry->first_error)
+	)
+);
+
+/* Correctable error status bits decoded by show_ce_errs(). */
+#define CXL_RAS_CE_CACHE_DATA_ECC	BIT(0)
+#define CXL_RAS_CE_MEM_DATA_ECC		BIT(1)
+#define CXL_RAS_CE_CRC_THRESH		BIT(2)
+#define CXL_RAS_CE_RETRY_THRESH		BIT(3)
+#define CXL_RAS_CE_CACHE_POISON		BIT(4)
+#define CXL_RAS_CE_MEM_POISON		BIT(5)
+#define CXL_RAS_CE_PHYS_LAYER_ERR	BIT(6)
+
+#define show_ce_errs(status)	__print_flags(status, " | ",			\
+	{ CXL_RAS_CE_CACHE_DATA_ECC, "Cache Data ECC Error" },			\
+	{ CXL_RAS_CE_MEM_DATA_ECC, "Memory Data ECC Error" },			\
+	{ CXL_RAS_CE_CRC_THRESH, "CRC Threshold Hit" },				\
+	{ CXL_RAS_CE_RETRY_THRESH, "Retry Threshold Hit" },			\
+	{ CXL_RAS_CE_CACHE_POISON, "Received Cache Poison From Peer" },		\
+	{ CXL_RAS_CE_MEM_POISON, "Received Memory Poison From Peer" },		\
+	{ CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" }	\
+)
+
+/*
+ * cxl_aer_correctable_error - trace a CXL correctable RAS error.
+ * @dev_name: device name string, printed first in the trace output
+ * @status: error status bits, decoded with show_ce_errs()
+ */
+TRACE_EVENT(cxl_aer_correctable_error,
+	TP_PROTO(const char *dev_name, u32 status),
+	TP_ARGS(dev_name, status),
+	TP_STRUCT__entry(
+		__string(dev_name, dev_name)
+		__field(u32, status)
+	),
+	TP_fast_assign(
+		__assign_str(dev_name, dev_name);
+		__entry->status = status;
+	),
+	TP_printk("%s: status: '%s'",
+		  __get_str(dev_name), show_ce_errs(__entry->status)
+	)
+);
+
+#endif /* _CXL_EVENTS_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE cxl
+#include <trace/define_trace.h>