diff mbox series

opal/hmi: Report NPU2 checkstop reason

Message ID 20190523122135.3757-1-fbarrat@linux.ibm.com
State Accepted
Headers show
Series opal/hmi: Report NPU2 checkstop reason | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch master (76f7316bc8fc8a18fdbfcbc0e1fe1bb992d2a7d7)
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot fail Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot-dco success Signed-off-by present

Commit Message

Frederic Barrat May 23, 2019, 12:21 p.m. UTC
The NPU2 is currently not passing any information to Linux to explain
the cause of an HMI. NPU2 has three Fault Isolation Registers and over
30 of those FIR bits are configured to raise an HMI by default. We
won't be able to fit all possible state in the 32-bit xstop_reason
field of the HMI event, but we can still try to encode up to 4 HMI
reasons.

Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
---
 core/hmi.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

Comments

Andrew Donnellan May 30, 2019, 1:51 a.m. UTC | #1
On 23/5/19 10:21 pm, Frederic Barrat wrote:
> The NPU2 is currently not passing any information to linux to explain
> the cause of an HMI. NPU2 has three Fault Isolation Registers and over
> 30 of those FIR bits are configured to raise an HMI by default. We
> won't be able to fit all possible state in the 32-bit xstop_reason
> field of the HMI event, but we can still try to encode up to 4 HMI
> reasons.
> 
> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>

Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com>

> ---
>   core/hmi.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 44 insertions(+)
> 
> diff --git a/core/hmi.c b/core/hmi.c
> index d97f3fc0..3b2860f8 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -576,6 +576,46 @@ static bool phb_is_npu2(struct dt_node *dn)
>   		dt_node_is_compatible(dn, "ibm,power9-npu-opencapi-pciex"));
>   }
>   
> +static void add_npu2_xstop_reason(uint32_t *xstop_reason, uint8_t reason)
> +{
> +	int i, reason_count;
> +	uint8_t *ptr;
> +
> +	reason_count = sizeof(*xstop_reason) / sizeof(reason);
> +	ptr = (uint8_t *) xstop_reason;
> +	for (i = 0; i < reason_count; i++) {
> +		if (*ptr == 0) {
> +			*ptr = reason;
> +			break;
> +		}
> +		ptr++;
> +	}
> +}
> +
> +static void encode_npu2_xstop_reason(uint32_t *xstop_reason,
> +				uint64_t fir, int fir_number)
> +{
> +	int bit;
> +	uint8_t reason;
> +
> +	/*
> +	 * There are three 64-bit FIRs but the xstop reason field of
> +	 * the hmi event is only 32-bit. Encode which FIR bit is set as:
> +	 * - 2 bits for the FIR number
> +	 * - 6 bits for the bit number (0 -> 63)
> +	 *
> +	 * So we could even encode up to 4 reasons for the HMI, if
> +	 * that can ever happen
> +	 */
> +	while (fir) {
> +		bit = ilog2(fir);
> +		reason = fir_number << 6;
> +		reason |= (63 - bit); // IBM numbering
> +		add_npu2_xstop_reason(xstop_reason, reason);
> +		fir ^= 1ULL << bit;
> +	}
> +}
> +
>   static void find_npu2_checkstop_reason(int flat_chip_id,
>   				      struct OpalHMIEvent *hmi_evt,
>   				      uint64_t *out_flags)
> @@ -592,6 +632,7 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
>   	uint64_t npu2_fir_action0_addr;
>   	uint64_t npu2_fir_action1_addr;
>   	uint64_t fatal_errors;
> +	uint32_t xstop_reason = 0;
>   	int total_errors = 0;
>   	const char *loc;
>   
> @@ -635,6 +676,8 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
>   			prlog(PR_ERR, "NPU: [Loc: %s] P:%d ACTION0 0x%016llx, ACTION1 0x%016llx\n",
>   					loc, flat_chip_id, npu2_fir_action0, npu2_fir_action1);
>   			total_errors++;
> +
> +			encode_npu2_xstop_reason(&xstop_reason, fatal_errors, i);
>   		}
>   
>   		/* Can't do a fence yet, we are just logging fir information for now */
> @@ -667,6 +710,7 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
>   	hmi_evt->severity = OpalHMI_SEV_WARNING;
>   	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
>   	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
> +	hmi_evt->u.xstop_error.xstop_reason = xstop_reason;
>   	hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
>   
>   	/* Marking the event as recoverable so that we don't crash */
>
Stewart Smith June 5, 2019, 1:38 a.m. UTC | #2
Frederic Barrat <fbarrat@linux.ibm.com> writes:

> The NPU2 is currently not passing any information to linux to explain
> the cause of an HMI. NPU2 has three Fault Isolation Registers and over
> 30 of those FIR bits are configured to raise an HMI by default. We
> won't be able to fit all possible state in the 32-bit xstop_reason
> field of the HMI event, but we can still try to encode up to 4 HMI
> reasons.
>
> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
> ---
>  core/hmi.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 44 insertions(+)

Thanks! Merged to master as of 2a7e5cb685b83b7d70018636ec95f81c7c60270a
diff mbox series

Patch

diff --git a/core/hmi.c b/core/hmi.c
index d97f3fc0..3b2860f8 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -576,6 +576,46 @@  static bool phb_is_npu2(struct dt_node *dn)
 		dt_node_is_compatible(dn, "ibm,power9-npu-opencapi-pciex"));
 }
 
+static void add_npu2_xstop_reason(uint32_t *xstop_reason, uint8_t reason)
+{
+	int i, reason_count;
+	uint8_t *ptr;
+
+	reason_count = sizeof(*xstop_reason) / sizeof(reason);
+	ptr = (uint8_t *) xstop_reason;
+	for (i = 0; i < reason_count; i++) {
+		if (*ptr == 0) {
+			*ptr = reason;
+			break;
+		}
+		ptr++;
+	}
+}
+
+static void encode_npu2_xstop_reason(uint32_t *xstop_reason,
+				uint64_t fir, int fir_number)
+{
+	int bit;
+	uint8_t reason;
+
+	/*
+	 * There are three 64-bit FIRs but the xstop reason field of
+	 * the hmi event is only 32-bit. Encode which FIR bit is set as:
+	 * - 2 bits for the FIR number
+	 * - 6 bits for the bit number (0 -> 63)
+	 *
+	 * So we could even encode up to 4 reasons for the HMI, if
+	 * that can ever happen
+	 */
+	while (fir) {
+		bit = ilog2(fir);
+		reason = fir_number << 6;
+		reason |= (63 - bit); // IBM numbering
+		add_npu2_xstop_reason(xstop_reason, reason);
+		fir ^= 1ULL << bit;
+	}
+}
+
 static void find_npu2_checkstop_reason(int flat_chip_id,
 				      struct OpalHMIEvent *hmi_evt,
 				      uint64_t *out_flags)
@@ -592,6 +632,7 @@  static void find_npu2_checkstop_reason(int flat_chip_id,
 	uint64_t npu2_fir_action0_addr;
 	uint64_t npu2_fir_action1_addr;
 	uint64_t fatal_errors;
+	uint32_t xstop_reason = 0;
 	int total_errors = 0;
 	const char *loc;
 
@@ -635,6 +676,8 @@  static void find_npu2_checkstop_reason(int flat_chip_id,
 			prlog(PR_ERR, "NPU: [Loc: %s] P:%d ACTION0 0x%016llx, ACTION1 0x%016llx\n",
 					loc, flat_chip_id, npu2_fir_action0, npu2_fir_action1);
 			total_errors++;
+
+			encode_npu2_xstop_reason(&xstop_reason, fatal_errors, i);
 		}
 
 		/* Can't do a fence yet, we are just logging fir information for now */
@@ -667,6 +710,7 @@  static void find_npu2_checkstop_reason(int flat_chip_id,
 	hmi_evt->severity = OpalHMI_SEV_WARNING;
 	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
 	hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_NPU;
+	hmi_evt->u.xstop_error.xstop_reason = xstop_reason;
 	hmi_evt->u.xstop_error.u.chip_id = flat_chip_id;
 
 	/* Marking the event as recoverable so that we don't crash */