diff mbox

[1/2] hmi: refactor malfunction alert handling

Message ID 20170228235817.16746-1-andrew.donnellan@au1.ibm.com
State Accepted
Headers show

Commit Message

Andrew Donnellan Feb. 28, 2017, 11:58 p.m. UTC
The logic in decode_malfunction() is rather tricky to follow. Refactor the
code to make it easier to follow.

No functional change.

Cc: Russell Currey <ruscur@russell.cc>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: Ryan Grimm <grimm@linux.vnet.ibm.com>
Cc: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Cc: Daniel Axtens <dja@axtens.net>
Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

---

I'd appreciate review here, I hope I haven't introduced any unintended
changes, but I also don't think I fully understand the expected behaviour.
I've given it some limited testing, it doesn't seem to break my CAPI kexec
series which is good.
---
 core/hmi.c | 88 ++++++++++++++++++++++++++++++--------------------------------
 1 file changed, 42 insertions(+), 46 deletions(-)

Comments

Mahesh J Salgaonkar March 1, 2017, 5:34 a.m. UTC | #1
On 03/01/2017 05:28 AM, Andrew Donnellan wrote:
> The logic in decode_malfunction() is rather tricky to follow. Refactor the
> code to make it easier to follow.
> 
> No functional change.
> 
> Cc: Russell Currey <ruscur@russell.cc>
> Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> Cc: Ryan Grimm <grimm@linux.vnet.ibm.com>
> Cc: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
> Cc: Daniel Axtens <dja@axtens.net>
> Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>

Looks good to me.

Reviewed-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

> 
> ---
> 
> I'd appreciate review here, I hope I haven't introduced any unintended
> changes, but I also don't think I fully understand the expected behaviour.
> I've given it some limited testing, it doesn't seem to break my CAPI kexec
> series which is good.
> ---
>  core/hmi.c | 88 ++++++++++++++++++++++++++++++--------------------------------
>  1 file changed, 42 insertions(+), 46 deletions(-)
> 
> diff --git a/core/hmi.c b/core/hmi.c
> index 6fe060dc..31a23ec7 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -296,27 +296,6 @@ static int handle_capp_recoverable(int chip_id, int capp_index)
>  	return 0;
>  }
> 
> -static int decode_one_malfunction(int flat_chip_id, struct OpalHMIEvent *hmi_evt)
> -{
> -	int capp_index;
> -	struct proc_chip *chip = get_chip(flat_chip_id);
> -	int capp_num = CHIP_IS_NAPLES(chip) ? 2 : 1;
> -
> -	hmi_evt->severity = OpalHMI_SEV_FATAL;
> -	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
> -
> -	for (capp_index = 0; capp_index < capp_num; capp_index++)
> -		if (is_capp_recoverable(flat_chip_id, capp_index))
> -			if (handle_capp_recoverable(flat_chip_id, capp_index)) {
> -				hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
> -				hmi_evt->type = OpalHMI_ERROR_CAPP_RECOVERY;
> -				return 1;
> -			}
> -
> -	/* TODO check other FIRs */
> -	return 0;
> -}
> -
>  static bool decode_core_fir(struct cpu_thread *cpu,
>  				struct OpalHMIEvent *hmi_evt)
>  {
> @@ -368,7 +347,7 @@ static bool decode_core_fir(struct cpu_thread *cpu,
>  }
> 
>  static void find_core_checkstop_reason(struct OpalHMIEvent *hmi_evt,
> -					int *event_generated)
> +				       bool *event_generated)
>  {
>  	struct cpu_thread *cpu;
> 
> @@ -401,8 +380,30 @@ static void find_core_checkstop_reason(struct OpalHMIEvent *hmi_evt,
>  	}
>  }
> 
> +static void find_capp_checkstop_reason(int flat_chip_id,
> +				       struct OpalHMIEvent *hmi_evt,
> +				       bool *event_generated)
> +{
> +	int capp_index;
> +	struct proc_chip *chip = get_chip(flat_chip_id);
> +	int capp_num = CHIP_IS_NAPLES(chip) ? 2 : 1;
> +
> +	for (capp_index = 0; capp_index < capp_num; capp_index++) {
> +		if (is_capp_recoverable(flat_chip_id, capp_index)) {
> +			if (handle_capp_recoverable(flat_chip_id, capp_index)) {
> +				hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
> +				hmi_evt->type = OpalHMI_ERROR_CAPP_RECOVERY;
> +				queue_hmi_event(hmi_evt, 1);
> +				*event_generated = true;
> +				return;
> +			}
> +		}
> +	}
> +}
> +
>  static void find_nx_checkstop_reason(int flat_chip_id,
> -			struct OpalHMIEvent *hmi_evt, int *event_generated)
> +				     struct OpalHMIEvent *hmi_evt,
> +				     bool *event_generated)
>  {
>  	uint64_t nx_status;
>  	uint64_t nx_dma_fir;
> @@ -461,12 +462,12 @@ static void find_nx_checkstop_reason(int flat_chip_id,
> 
>  	/* Send an HMI event. */
>  	queue_hmi_event(hmi_evt, 0);
> -	*event_generated = 1;
> +	*event_generated = true;
>  }
> 
>  static void find_npu_checkstop_reason(int flat_chip_id,
>  				      struct OpalHMIEvent *hmi_evt,
> -				      int *event_generated)
> +				      bool *event_generated)
>  {
>  	struct phb *phb;
>  	struct npu *p = NULL;
> @@ -530,43 +531,38 @@ static void find_npu_checkstop_reason(int flat_chip_id,
> 
>  	/* The HMI is "recoverable" because it shouldn't crash the system */
>  	queue_hmi_event(hmi_evt, 1);
> -	*event_generated = 1;
> +	*event_generated = true;
>  }
> 
>  static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
>  {
>  	int i;
> -	int recover = -1;
>  	uint64_t malf_alert;
> -	int event_generated = 0;
> +	bool event_generated = false;
> 
>  	xscom_read(this_cpu()->chip_id, 0x2020011, &malf_alert);
> 
> -	for (i = 0; i < 64; i++)
> +	if (!malf_alert)
> +		return;
> +
> +	for (i = 0; i < 64; i++) {
>  		if (malf_alert & PPC_BIT(i)) {
> -			recover = decode_one_malfunction(i, hmi_evt);
>  			xscom_write(this_cpu()->chip_id, 0x02020011, ~PPC_BIT(i));
> -			if (recover) {
> -				queue_hmi_event(hmi_evt, recover);
> -				event_generated = 1;
> -			}
> +			find_capp_checkstop_reason(i, hmi_evt, &event_generated);
>  			find_nx_checkstop_reason(i, hmi_evt, &event_generated);
>  			find_npu_checkstop_reason(i, hmi_evt, &event_generated);
>  		}
> +	}
> 
> -	if (recover != -1) {
> -		find_core_checkstop_reason(hmi_evt, &event_generated);
> +	find_core_checkstop_reason(hmi_evt, &event_generated);
> 
> -		/*
> -		 * In case, if we fail to find checkstop reason send an
> -		 * unknown HMI event.
> -		 */
> -		if (!event_generated) {
> -			hmi_evt->u.xstop_error.xstop_type =
> -						CHECKSTOP_TYPE_UNKNOWN;
> -			hmi_evt->u.xstop_error.xstop_reason = 0;
> -			queue_hmi_event(hmi_evt, recover);
> -		}
> +	/*
> +	 * If we fail to find checkstop reason, send an unknown HMI event.
> +	 */
> +	if (!event_generated) {
> +		hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_UNKNOWN;
> +		hmi_evt->u.xstop_error.xstop_reason = 0;
> +		queue_hmi_event(hmi_evt, false);
>  	}
>  }
>
Frederic Barrat March 15, 2017, 5:09 p.m. UTC | #2
Hi Andrew,


Le 01/03/2017 à 00:58, Andrew Donnellan a écrit :
> The logic in decode_malfunction() is rather tricky to follow. Refactor the
> code to make it easier to follow.
>
> No functional change.
>
> Cc: Russell Currey <ruscur@russell.cc>
> Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> Cc: Ryan Grimm <grimm@linux.vnet.ibm.com>
> Cc: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
> Cc: Daniel Axtens <dja@axtens.net>
> Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
>
> ---
>
> I'd appreciate review here, I hope I haven't introduced any unintended
> changes, but I also don't think I fully understand the expected behaviour.
> I've given it some limited testing, it doesn't seem to break my CAPI kexec
> series which is good.
> ---

Likewise, the previous code looked convoluted (and wrong?). At least 
with your patch, it makes sense now. At least to me :)

Reviewed-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>



>  core/hmi.c | 88 ++++++++++++++++++++++++++++++--------------------------------
>  1 file changed, 42 insertions(+), 46 deletions(-)
>
> diff --git a/core/hmi.c b/core/hmi.c
> index 6fe060dc..31a23ec7 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -296,27 +296,6 @@ static int handle_capp_recoverable(int chip_id, int capp_index)
>  	return 0;
>  }
>
> -static int decode_one_malfunction(int flat_chip_id, struct OpalHMIEvent *hmi_evt)
> -{
> -	int capp_index;
> -	struct proc_chip *chip = get_chip(flat_chip_id);
> -	int capp_num = CHIP_IS_NAPLES(chip) ? 2 : 1;
> -
> -	hmi_evt->severity = OpalHMI_SEV_FATAL;
> -	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
> -
> -	for (capp_index = 0; capp_index < capp_num; capp_index++)
> -		if (is_capp_recoverable(flat_chip_id, capp_index))
> -			if (handle_capp_recoverable(flat_chip_id, capp_index)) {
> -				hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
> -				hmi_evt->type = OpalHMI_ERROR_CAPP_RECOVERY;
> -				return 1;
> -			}
> -
> -	/* TODO check other FIRs */
> -	return 0;
> -}
> -
>  static bool decode_core_fir(struct cpu_thread *cpu,
>  				struct OpalHMIEvent *hmi_evt)
>  {
> @@ -368,7 +347,7 @@ static bool decode_core_fir(struct cpu_thread *cpu,
>  }
>
>  static void find_core_checkstop_reason(struct OpalHMIEvent *hmi_evt,
> -					int *event_generated)
> +				       bool *event_generated)
>  {
>  	struct cpu_thread *cpu;
>
> @@ -401,8 +380,30 @@ static void find_core_checkstop_reason(struct OpalHMIEvent *hmi_evt,
>  	}
>  }
>
> +static void find_capp_checkstop_reason(int flat_chip_id,
> +				       struct OpalHMIEvent *hmi_evt,
> +				       bool *event_generated)
> +{
> +	int capp_index;
> +	struct proc_chip *chip = get_chip(flat_chip_id);
> +	int capp_num = CHIP_IS_NAPLES(chip) ? 2 : 1;
> +
> +	for (capp_index = 0; capp_index < capp_num; capp_index++) {
> +		if (is_capp_recoverable(flat_chip_id, capp_index)) {
> +			if (handle_capp_recoverable(flat_chip_id, capp_index)) {
> +				hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
> +				hmi_evt->type = OpalHMI_ERROR_CAPP_RECOVERY;
> +				queue_hmi_event(hmi_evt, 1);
> +				*event_generated = true;
> +				return;
> +			}
> +		}
> +	}
> +}
> +
>  static void find_nx_checkstop_reason(int flat_chip_id,
> -			struct OpalHMIEvent *hmi_evt, int *event_generated)
> +				     struct OpalHMIEvent *hmi_evt,
> +				     bool *event_generated)
>  {
>  	uint64_t nx_status;
>  	uint64_t nx_dma_fir;
> @@ -461,12 +462,12 @@ static void find_nx_checkstop_reason(int flat_chip_id,
>
>  	/* Send an HMI event. */
>  	queue_hmi_event(hmi_evt, 0);
> -	*event_generated = 1;
> +	*event_generated = true;
>  }
>
>  static void find_npu_checkstop_reason(int flat_chip_id,
>  				      struct OpalHMIEvent *hmi_evt,
> -				      int *event_generated)
> +				      bool *event_generated)
>  {
>  	struct phb *phb;
>  	struct npu *p = NULL;
> @@ -530,43 +531,38 @@ static void find_npu_checkstop_reason(int flat_chip_id,
>
>  	/* The HMI is "recoverable" because it shouldn't crash the system */
>  	queue_hmi_event(hmi_evt, 1);
> -	*event_generated = 1;
> +	*event_generated = true;
>  }
>
>  static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
>  {
>  	int i;
> -	int recover = -1;
>  	uint64_t malf_alert;
> -	int event_generated = 0;
> +	bool event_generated = false;
>
>  	xscom_read(this_cpu()->chip_id, 0x2020011, &malf_alert);
>
> -	for (i = 0; i < 64; i++)
> +	if (!malf_alert)
> +		return;
> +
> +	for (i = 0; i < 64; i++) {
>  		if (malf_alert & PPC_BIT(i)) {
> -			recover = decode_one_malfunction(i, hmi_evt);
>  			xscom_write(this_cpu()->chip_id, 0x02020011, ~PPC_BIT(i));
> -			if (recover) {
> -				queue_hmi_event(hmi_evt, recover);
> -				event_generated = 1;
> -			}
> +			find_capp_checkstop_reason(i, hmi_evt, &event_generated);
>  			find_nx_checkstop_reason(i, hmi_evt, &event_generated);
>  			find_npu_checkstop_reason(i, hmi_evt, &event_generated);
>  		}
> +	}
>
> -	if (recover != -1) {
> -		find_core_checkstop_reason(hmi_evt, &event_generated);
> +	find_core_checkstop_reason(hmi_evt, &event_generated);
>
> -		/*
> -		 * In case, if we fail to find checkstop reason send an
> -		 * unknown HMI event.
> -		 */
> -		if (!event_generated) {
> -			hmi_evt->u.xstop_error.xstop_type =
> -						CHECKSTOP_TYPE_UNKNOWN;
> -			hmi_evt->u.xstop_error.xstop_reason = 0;
> -			queue_hmi_event(hmi_evt, recover);
> -		}
> +	/*
> +	 * If we fail to find checkstop reason, send an unknown HMI event.
> +	 */
> +	if (!event_generated) {
> +		hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_UNKNOWN;
> +		hmi_evt->u.xstop_error.xstop_reason = 0;
> +		queue_hmi_event(hmi_evt, false);
>  	}
>  }
>
Stewart Smith March 16, 2017, 7:10 a.m. UTC | #3
Andrew Donnellan <andrew.donnellan@au1.ibm.com> writes:
> The logic in decode_malfunction() is rather tricky to follow. Refactor the
> code to make it easier to follow.
>
> No functional change.
>
> Cc: Russell Currey <ruscur@russell.cc>
> Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> Cc: Ryan Grimm <grimm@linux.vnet.ibm.com>
> Cc: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
> Cc: Daniel Axtens <dja@axtens.net>
> Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
>
> ---
>
> I'd appreciate review here, I hope I haven't introduced any unintended
> changes, but I also don't think I fully understand the expected behaviour.
> I've given it some limited testing, it doesn't seem to break my CAPI kexec
> series which is good.

so, I'm pretty sure it's correct from my cusorary view... and I hope
that now that it's merged to master as of
b2dc494a4950927cdda5f009e1a482aabcb25afa that when Pridhiviraj runs the
full HMI test suite next, we check that we didn't accidentally break
anything that we all missed :)
ppaidipe March 22, 2017, 6:11 a.m. UTC | #4
On 2017-03-16 12:40, Stewart Smith wrote:
> Andrew Donnellan <andrew.donnellan@au1.ibm.com> writes:
>> The logic in decode_malfunction() is rather tricky to follow. Refactor 
>> the
>> code to make it easier to follow.
>> 
>> No functional change.
>> 
>> Cc: Russell Currey <ruscur@russell.cc>
>> Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>> Cc: Ryan Grimm <grimm@linux.vnet.ibm.com>
>> Cc: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
>> Cc: Daniel Axtens <dja@axtens.net>
>> Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
>> 
>> ---
>> 
>> I'd appreciate review here, I hope I haven't introduced any unintended
>> changes, but I also don't think I fully understand the expected 
>> behaviour.
>> I've given it some limited testing, it doesn't seem to break my CAPI 
>> kexec
>> series which is good.
> 
> so, I'm pretty sure it's correct from my cusorary view... and I hope
> that now that it's merged to master as of
> b2dc494a4950927cdda5f009e1a482aabcb25afa that when Pridhiviraj runs the
> full HMI test suite next, we check that we didn't accidentally break
> anything that we all missed :)


Hi
I ran full HMI test suite on both fsp and OpenPower BMC systems
with skiboot has at the level SkiBoot 246fb04.
All tests are worked fine.


Thanks
Pridhiviraj
Andrew Donnellan March 22, 2017, 6:15 a.m. UTC | #5
On 22/03/17 17:11, ppaidipe wrote:
> Hi
> I ran full HMI test suite on both fsp and OpenPower BMC systems
> with skiboot has at the level SkiBoot 246fb04.
> All tests are worked fine.

Woo, thanks for testing!
diff mbox

Patch

diff --git a/core/hmi.c b/core/hmi.c
index 6fe060dc..31a23ec7 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -296,27 +296,6 @@  static int handle_capp_recoverable(int chip_id, int capp_index)
 	return 0;
 }
 
-static int decode_one_malfunction(int flat_chip_id, struct OpalHMIEvent *hmi_evt)
-{
-	int capp_index;
-	struct proc_chip *chip = get_chip(flat_chip_id);
-	int capp_num = CHIP_IS_NAPLES(chip) ? 2 : 1;
-
-	hmi_evt->severity = OpalHMI_SEV_FATAL;
-	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
-
-	for (capp_index = 0; capp_index < capp_num; capp_index++)
-		if (is_capp_recoverable(flat_chip_id, capp_index))
-			if (handle_capp_recoverable(flat_chip_id, capp_index)) {
-				hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
-				hmi_evt->type = OpalHMI_ERROR_CAPP_RECOVERY;
-				return 1;
-			}
-
-	/* TODO check other FIRs */
-	return 0;
-}
-
 static bool decode_core_fir(struct cpu_thread *cpu,
 				struct OpalHMIEvent *hmi_evt)
 {
@@ -368,7 +347,7 @@  static bool decode_core_fir(struct cpu_thread *cpu,
 }
 
 static void find_core_checkstop_reason(struct OpalHMIEvent *hmi_evt,
-					int *event_generated)
+				       bool *event_generated)
 {
 	struct cpu_thread *cpu;
 
@@ -401,8 +380,30 @@  static void find_core_checkstop_reason(struct OpalHMIEvent *hmi_evt,
 	}
 }
 
+static void find_capp_checkstop_reason(int flat_chip_id,
+				       struct OpalHMIEvent *hmi_evt,
+				       bool *event_generated)
+{
+	int capp_index;
+	struct proc_chip *chip = get_chip(flat_chip_id);
+	int capp_num = CHIP_IS_NAPLES(chip) ? 2 : 1;
+
+	for (capp_index = 0; capp_index < capp_num; capp_index++) {
+		if (is_capp_recoverable(flat_chip_id, capp_index)) {
+			if (handle_capp_recoverable(flat_chip_id, capp_index)) {
+				hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
+				hmi_evt->type = OpalHMI_ERROR_CAPP_RECOVERY;
+				queue_hmi_event(hmi_evt, 1);
+				*event_generated = true;
+				return;
+			}
+		}
+	}
+}
+
 static void find_nx_checkstop_reason(int flat_chip_id,
-			struct OpalHMIEvent *hmi_evt, int *event_generated)
+				     struct OpalHMIEvent *hmi_evt,
+				     bool *event_generated)
 {
 	uint64_t nx_status;
 	uint64_t nx_dma_fir;
@@ -461,12 +462,12 @@  static void find_nx_checkstop_reason(int flat_chip_id,
 
 	/* Send an HMI event. */
 	queue_hmi_event(hmi_evt, 0);
-	*event_generated = 1;
+	*event_generated = true;
 }
 
 static void find_npu_checkstop_reason(int flat_chip_id,
 				      struct OpalHMIEvent *hmi_evt,
-				      int *event_generated)
+				      bool *event_generated)
 {
 	struct phb *phb;
 	struct npu *p = NULL;
@@ -530,43 +531,38 @@  static void find_npu_checkstop_reason(int flat_chip_id,
 
 	/* The HMI is "recoverable" because it shouldn't crash the system */
 	queue_hmi_event(hmi_evt, 1);
-	*event_generated = 1;
+	*event_generated = true;
 }
 
 static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
 {
 	int i;
-	int recover = -1;
 	uint64_t malf_alert;
-	int event_generated = 0;
+	bool event_generated = false;
 
 	xscom_read(this_cpu()->chip_id, 0x2020011, &malf_alert);
 
-	for (i = 0; i < 64; i++)
+	if (!malf_alert)
+		return;
+
+	for (i = 0; i < 64; i++) {
 		if (malf_alert & PPC_BIT(i)) {
-			recover = decode_one_malfunction(i, hmi_evt);
 			xscom_write(this_cpu()->chip_id, 0x02020011, ~PPC_BIT(i));
-			if (recover) {
-				queue_hmi_event(hmi_evt, recover);
-				event_generated = 1;
-			}
+			find_capp_checkstop_reason(i, hmi_evt, &event_generated);
 			find_nx_checkstop_reason(i, hmi_evt, &event_generated);
 			find_npu_checkstop_reason(i, hmi_evt, &event_generated);
 		}
+	}
 
-	if (recover != -1) {
-		find_core_checkstop_reason(hmi_evt, &event_generated);
+	find_core_checkstop_reason(hmi_evt, &event_generated);
 
-		/*
-		 * In case, if we fail to find checkstop reason send an
-		 * unknown HMI event.
-		 */
-		if (!event_generated) {
-			hmi_evt->u.xstop_error.xstop_type =
-						CHECKSTOP_TYPE_UNKNOWN;
-			hmi_evt->u.xstop_error.xstop_reason = 0;
-			queue_hmi_event(hmi_evt, recover);
-		}
+	/*
+	 * If we fail to find checkstop reason, send an unknown HMI event.
+	 */
+	if (!event_generated) {
+		hmi_evt->u.xstop_error.xstop_type = CHECKSTOP_TYPE_UNKNOWN;
+		hmi_evt->u.xstop_error.xstop_reason = 0;
+		queue_hmi_event(hmi_evt, false);
 	}
 }