diff mbox

[v2] opal: Do not overwrite same HMI event for multiple HMI errors.

Message ID 20150205054531.1527.98003.stgit@mars
State Changes Requested
Headers show

Commit Message

Mahesh J Salgaonkar Feb. 5, 2015, 5:45 a.m. UTC
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

The current implementation overwrites the same HMI event if there are
multiple HMI errors reported through a single HMI interrupt. This patch
fixes that issue by sending separate HMI event per error.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
Change in V2:
- Removed the forward declaration for queue_hmi_event() and moved the function
  on top instead.

 core/hmi.c |   73 +++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 43 insertions(+), 30 deletions(-)

Comments

Stewart Smith Feb. 12, 2015, 4:48 a.m. UTC | #1
Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> writes:
> The current implementation overwrites the same HMI event if there are
> multiple HMI errors reported through a single HMI interrupt. This patch
> fixes that issue by sending separate HMI event per error.

Fundamentally looks like a good idea, couple of small notes that'd be
good to fix before merging:

> @@ -247,8 +277,10 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
>  	if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
>  		hmer &= ~SPR_HMER_MALFUNCTION_ALERT;
>  
> -		if (hmi_evt)
> +		if (hmi_evt) {
>  			recover = decode_malfunction(hmi_evt);
> +			queue_hmi_event(hmi_evt, recover);
> +		}
>  	}
>  
>  	/* Assert if we see Hypervisor resource error, we can not continue. */
> @@ -257,6 +289,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
>  		if (hmi_evt) {
>  			hmi_evt->severity = OpalHMI_SEV_FATAL;
>  			hmi_evt->type = OpalHMI_ERROR_HYP_RESOURCE;
> +			queue_hmi_event(hmi_evt, recover);
>  		}
>  		recover = 0;

(and actually pasting in the code so it's a bit easier to see)

        /* Assert if we see malfunction alert, we can not continue. */
        if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
                hmer &= ~SPR_HMER_MALFUNCTION_ALERT;

                if (hmi_evt) {
                        recover = decode_malfunction(hmi_evt);
                        queue_hmi_event(hmi_evt, recover);
                }
        }

        /* Assert if we see Hypervisor resource error, we can not continue. */
        if (hmer & SPR_HMER_HYP_RESOURCE_ERR) {
                hmer &= ~SPR_HMER_HYP_RESOURCE_ERR;
                if (hmi_evt) {
                        hmi_evt->severity = OpalHMI_SEV_FATAL;
                        hmi_evt->type = OpalHMI_ERROR_HYP_RESOURCE;
                        queue_hmi_event(hmi_evt, recover);
                }
                recover = 0;
        }

It looks as though if there's both malfunction alert and hyp resource
err in one, recover for the second will be set based on the first?

Perhaps better to explicitly pass OpalHMI_DISPOSITION_(NOT_)RECOVERED to
queue_hmi_event rather than rely on 1/0 ?
Mahesh J Salgaonkar Feb. 12, 2015, 5:07 a.m. UTC | #2
On 02/12/2015 10:18 AM, Stewart Smith wrote:
> Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> writes:
>> The current implementation overwrites the same HMI event if there are
>> multiple HMI errors reported through a single HMI interrupt. This patch
>> fixes that issue by sending separate HMI event per error.
> 
> Fundamentally looks like a good idea, couple of small notes that'd be
> good to fix before merging:
> 
>> @@ -247,8 +277,10 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
>>  	if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
>>  		hmer &= ~SPR_HMER_MALFUNCTION_ALERT;
>>  
>> -		if (hmi_evt)
>> +		if (hmi_evt) {
>>  			recover = decode_malfunction(hmi_evt);
>> +			queue_hmi_event(hmi_evt, recover);
>> +		}
>>  	}
>>  
>>  	/* Assert if we see Hypervisor resource error, we can not continue. */
>> @@ -257,6 +289,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
>>  		if (hmi_evt) {
>>  			hmi_evt->severity = OpalHMI_SEV_FATAL;
>>  			hmi_evt->type = OpalHMI_ERROR_HYP_RESOURCE;
>> +			queue_hmi_event(hmi_evt, recover);
>>  		}
>>  		recover = 0;
> 
> (and actually pasting in the code so it's a bit easier to see)
> 
>         /* Assert if we see malfunction alert, we can not continue. */
>         if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
>                 hmer &= ~SPR_HMER_MALFUNCTION_ALERT;
> 
>                 if (hmi_evt) {
>                         recover = decode_malfunction(hmi_evt);
>                         queue_hmi_event(hmi_evt, recover);
>                 }
>         }
> 
>         /* Assert if we see Hypervisor resource error, we can not continue. */
>         if (hmer & SPR_HMER_HYP_RESOURCE_ERR) {
>                 hmer &= ~SPR_HMER_HYP_RESOURCE_ERR;
>                 if (hmi_evt) {
>                         hmi_evt->severity = OpalHMI_SEV_FATAL;
>                         hmi_evt->type = OpalHMI_ERROR_HYP_RESOURCE;
>                         queue_hmi_event(hmi_evt, recover);
>                 }
>                 recover = 0;
>         }
> 
> It looks as though if there's both malfunction alert and hyp resource
> err in one, recover for the second will be set based on the first?

Nice catch. I think I should move 'recover = 0' before "if (hmi_evt)"
statement.

> 
> Perhaps better to explicitly pass OpalHMI_DISPOSITION_(NOT_)RECOVERED to
> queue_hmi_event rather than rely on 1/0 ?
> 

The recover variable can carry three values -1, 0 and 1 and
queue_hmi_event() makes decision based on these three values.

Will send out v3 with the fix.

Thanks,
-Mahesh.
diff mbox

Patch

diff --git a/core/hmi.c b/core/hmi.c
index 45fc89b..f21ca50 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -146,6 +146,32 @@ 
 
 static struct lock hmi_lock = LOCK_UNLOCKED;
 
+static int queue_hmi_event(struct OpalHMIEvent *hmi_evt, int recover)
+{
+	uint64_t *hmi_data;
+
+	/* Don't queue up event if recover == -1 */
+	if (recover == -1)
+		return 0;
+
+	/* set disposition */
+	if (recover == 1)
+		hmi_evt->disposition = OpalHMI_DISPOSITION_RECOVERED;
+	else if (recover == 0)
+		hmi_evt->disposition = OpalHMI_DISPOSITION_NOT_RECOVERED;
+
+	/*
+	 * struct OpalHMIEvent is of (3 * 64 bits) size and well packed
+	 * structure. Hence use uint64_t pointer to pass entire structure
+	 * using 4 params in generic message format.
+	 */
+	hmi_data = (uint64_t *)hmi_evt;
+
+	/* queue up for delivery to host. */
+	return opal_queue_msg(OPAL_MSG_HMI_EVT, NULL, NULL,
+				hmi_data[0], hmi_data[1], hmi_data[2]);
+}
+
 static int is_capp_recoverable(int chip_id)
 {
 	uint64_t reg;
@@ -214,6 +240,7 @@  static int decode_malfunction(struct OpalHMIEvent *hmi_evt)
 int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 {
 	int recover = 1;
+	uint64_t tfmr;
 
 	printf("HMI: Received HMI interrupt: HMER = 0x%016llx\n", hmer);
 	if (hmi_evt)
@@ -223,6 +250,7 @@  int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
 			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
+			queue_hmi_event(hmi_evt, recover);
 		}
 		printf("HMI: Processor recovery Done.\n");
 	}
@@ -231,6 +259,7 @@  int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
 			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_MASKED;
+			queue_hmi_event(hmi_evt, recover);
 		}
 		printf("HMI: Processor recovery Done (masked).\n");
 	}
@@ -239,6 +268,7 @@  int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
 			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE_AGAIN;
+			queue_hmi_event(hmi_evt, recover);
 		}
 		printf("HMI: Processor recovery occurred again before"
 			"bit2 was cleared\n");
@@ -247,8 +277,10 @@  int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
 		hmer &= ~SPR_HMER_MALFUNCTION_ALERT;
 
-		if (hmi_evt)
+		if (hmi_evt) {
 			recover = decode_malfunction(hmi_evt);
+			queue_hmi_event(hmi_evt, recover);
+		}
 	}
 
 	/* Assert if we see Hypervisor resource error, we can not continue. */
@@ -257,6 +289,7 @@  int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_FATAL;
 			hmi_evt->type = OpalHMI_ERROR_HYP_RESOURCE;
+			queue_hmi_event(hmi_evt, recover);
 		}
 		recover = 0;
 	}
@@ -266,22 +299,26 @@  int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	 * TFMR and take corrective action wherever required.
 	 */
 	if (hmer & SPR_HMER_TFAC_ERROR) {
+		tfmr = mfspr(SPR_TFMR);		/* save original TFMR */
 		hmer &= ~SPR_HMER_TFAC_ERROR;
+		recover = chiptod_recover_tb_errors();
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
 			hmi_evt->type = OpalHMI_ERROR_TFAC;
-			hmi_evt->tfmr = mfspr(SPR_TFMR);
+			hmi_evt->tfmr = tfmr;
+			queue_hmi_event(hmi_evt, recover);
 		}
-		recover = chiptod_recover_tb_errors();
 	}
 	if (hmer & SPR_HMER_TFMR_PARITY_ERROR) {
+		tfmr = mfspr(SPR_TFMR);		/* save original TFMR */
 		hmer &= ~SPR_HMER_TFMR_PARITY_ERROR;
+		recover = 0;
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_FATAL;
 			hmi_evt->type = OpalHMI_ERROR_TFMR_PARITY;
-			hmi_evt->tfmr = mfspr(SPR_TFMR);
+			hmi_evt->tfmr = tfmr;
+			queue_hmi_event(hmi_evt, recover);
 		}
-		recover = 0;
 	}
 
 	/*
@@ -293,44 +330,20 @@  int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	return recover;
 }
 
-static int queue_hmi_event(struct OpalHMIEvent *hmi_evt)
-{
-	uint64_t *hmi_data;
-
-	/*
-	 * struct OpalHMIEvent is of (3 * 64 bits) size and well packed
-	 * structure. Hence use uint64_t pointer to pass entire structure
-	 * using 4 params in generic message format.
-	 */
-	hmi_data = (uint64_t *)hmi_evt;
-
-	/* queue up for delivery to host. */
-	return opal_queue_msg(OPAL_MSG_HMI_EVT, NULL, NULL,
-				hmi_data[0], hmi_data[1], hmi_data[2]);
-}
-
 static int64_t opal_handle_hmi(void)
 {
 	uint64_t hmer;
 	int rc = OPAL_SUCCESS;
 	struct OpalHMIEvent hmi_evt;
-	int recover;
 
 	memset(&hmi_evt, 0, sizeof(struct OpalHMIEvent));
 	hmi_evt.version = OpalHMIEvt_V1;
 
 	lock(&hmi_lock);
 	hmer = mfspr(SPR_HMER);		/* Get HMER register value */
-	recover = handle_hmi_exception(hmer, &hmi_evt);
+	handle_hmi_exception(hmer, &hmi_evt);
 	unlock(&hmi_lock);
 
-	if (recover == 1)
-		hmi_evt.disposition = OpalHMI_DISPOSITION_RECOVERED;
-	else if (recover == 0)
-		hmi_evt.disposition = OpalHMI_DISPOSITION_NOT_RECOVERED;
-
-	if (recover != -1)
-		queue_hmi_event(&hmi_evt);
 	return rc;
 }
 opal_call(OPAL_HANDLE_HMI, opal_handle_hmi, 0);