diff mbox

[v2,5/6] FSP/ELOG: Fix possible event notifier hangs

Message ID 1467473623-17735-6-git-send-email-hegdevasant@linux.vnet.ibm.com
State Accepted
Headers show

Commit Message

Vasant Hegde July 2, 2016, 3:33 p.m. UTC
In some corner cases host may send acknowledgement without
reading actual data (fsp_opal_elog_info -> fsp_opal_elog_ack).
Because of this elog_read_from_fsp_head_state may be stuck in
wrong state (ELOG_STATE_HOST_INFO) and not able to send remaining
ELOG's to host. Hence reset ELOG state and start sending remaining
ELOG's.

Also in normal case we will ACK the logs which are already processed
(elog_read_processed). Hence rearrange the code such that we go
through elog_read_processed first.

Finally return OPAL_PARAMETER if we are not able to find ELOG ID.

Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
---
 hw/fsp/fsp-elog-read.c  | 18 +++++++++++++++---
 hw/fsp/fsp-elog-write.c |  3 +++
 2 files changed, 18 insertions(+), 3 deletions(-)

Comments

Mukesh Ojha July 4, 2016, 9:48 a.m. UTC | #1
Few questions.

On Saturday 02 July 2016 09:03 PM, Vasant Hegde wrote:
> In some corner cases host may send acknowledgement without
> reading actual data (fsp_opal_elog_info -> fsp_opal_elog_ack).
> Because of this elog_read_from_fsp_head_state may be stuck in
> wrong state (ELOG_STATE_HOST_INFO) and not able to send remaining
> ELOG's to host. Hence reset ELOG state and start sending remaining
> ELOG's.
>
> Also in normal case we will ACK the logs which are already processed
> (elog_read_processed). Hence rearrange the code such that we go
> through elog_read_processed first.
>
> Finally return OPAL_PARAMETER if we are not able to find ELOG ID.
>
> Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
> ---
>   hw/fsp/fsp-elog-read.c  | 18 +++++++++++++++---
>   hw/fsp/fsp-elog-write.c |  3 +++
>   2 files changed, 18 insertions(+), 3 deletions(-)
>
> diff --git a/hw/fsp/fsp-elog-read.c b/hw/fsp/fsp-elog-read.c
> index b606236..8c01a1c 100644
> --- a/hw/fsp/fsp-elog-read.c
> +++ b/hw/fsp/fsp-elog-read.c
> @@ -416,21 +416,33 @@ static int64_t fsp_opal_elog_ack(uint64_t ack_id)
>   		return rc;
>   	}
>   	lock(&elog_read_lock);
> -	list_for_each_safe(&elog_read_pending, record, next_record, link) {
> +	list_for_each_safe(&elog_read_processed, record, next_record, link) {
>   		if (record->log_id != ack_id)
>   			continue;
>   		list_del(&record->link);
>   		list_add(&elog_read_free, &record->link);
> +		unlock(&elog_read_lock);
> +		return rc;
>   	}
> -	list_for_each_safe(&elog_read_processed, record, next_record, link) {
> +	list_for_each_safe(&elog_read_pending, record, next_record, link) {

Is it really necessary to check the ack_id in the pending list?
As this function will be called only for id which is already in the 
processed list.
Same we do it for opal errorlogs.
>   		if (record->log_id != ack_id)
>   			continue;
> +		/* It means host has sent ACK without reading actual data.
> +		 * Because of this elog_read_from_fsp_head_state may be
> +		 * stuck in wrong state (ELOG_STATE_HOST_INFO) and not able
> +		 * to send remaning ELOG's to host. Hence reset ELOG state

s/remaning/remaining

> +		 * and start sending remaining ELOG's.
> +		 */
>   		list_del(&record->link);
>   		list_add(&elog_read_free, &record->link);
> +		elog_reject_head();
> +		unlock(&elog_read_lock);
> +		fsp_elog_check_and_fetch_head();
> +		return rc;
>   	}
>   	unlock(&elog_read_lock);
>   
> -	return rc;
> +	return OPAL_PARAMETER;
>   }
>   
>   /*
> diff --git a/hw/fsp/fsp-elog-write.c b/hw/fsp/fsp-elog-write.c
> index 80a0a39..b78bc20 100644
> --- a/hw/fsp/fsp-elog-write.c
> +++ b/hw/fsp/fsp-elog-write.c
> @@ -248,6 +248,9 @@ bool opal_elog_ack(uint64_t ack_id)
>   			list_del(&record->link);
>   			opal_elog_complete(record, true);
>   			rc = true;
> +			unlock(&elog_write_to_host_lock);
> +			opal_commit_elog_in_host();
> +			return rc;
>   		}
>   	}
>   	unlock(&elog_write_to_host_lock);
Vasant Hegde July 13, 2016, 9:23 a.m. UTC | #2
On 07/04/2016 03:18 PM, Mukesh Ojha wrote:
> Few questions.
>
> On Saturday 02 July 2016 09:03 PM, Vasant Hegde wrote:
>> In some corner cases host may send acknowledgement without
>> reading actual data (fsp_opal_elog_info -> fsp_opal_elog_ack).
>> Because of this elog_read_from_fsp_head_state may be stuck in
>> wrong state (ELOG_STATE_HOST_INFO) and not able to send remaining
>> ELOG's to host. Hence reset ELOG state and start sending remaining
>> ELOG's.
>>
>> Also in normal case we will ACK the logs which are already processed
>> (elog_read_processed). Hence rearrange the code such that we go
>> through elog_read_processed first.
>>
>> Finally return OPAL_PARAMETER if we are not able to find ELOG ID.
>>
>> Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
>> ---
>>   hw/fsp/fsp-elog-read.c  | 18 +++++++++++++++---
>>   hw/fsp/fsp-elog-write.c |  3 +++
>>   2 files changed, 18 insertions(+), 3 deletions(-)
>>
>> diff --git a/hw/fsp/fsp-elog-read.c b/hw/fsp/fsp-elog-read.c
>> index b606236..8c01a1c 100644
>> --- a/hw/fsp/fsp-elog-read.c
>> +++ b/hw/fsp/fsp-elog-read.c
>> @@ -416,21 +416,33 @@ static int64_t fsp_opal_elog_ack(uint64_t ack_id)
>>           return rc;
>>       }
>>       lock(&elog_read_lock);
>> -    list_for_each_safe(&elog_read_pending, record, next_record, link) {
>> +    list_for_each_safe(&elog_read_processed, record, next_record, link) {
>>           if (record->log_id != ack_id)
>>               continue;
>>           list_del(&record->link);
>>           list_add(&elog_read_free, &record->link);
>> +        unlock(&elog_read_lock);
>> +        return rc;
>>       }
>> -    list_for_each_safe(&elog_read_processed, record, next_record, link) {
>> +    list_for_each_safe(&elog_read_pending, record, next_record, link) {
>
> Is it really necessary to check the ack_id in the pending list?
> As this function will be called only for id which is already in the processed list.
> Same we do it for opal errorlogs.

Its required. If host ACKs without reading it.. then it will be in pending list.


>>           if (record->log_id != ack_id)
>>               continue;
>> +        /* It means host has sent ACK without reading actual data.
>> +         * Because of this elog_read_from_fsp_head_state may be
>> +         * stuck in wrong state (ELOG_STATE_HOST_INFO) and not able
>> +         * to send remaning ELOG's to host. Hence reset ELOG state
>
> s/remaning/remaining

Ah yes.. will fix .

-Vasant
Vasant Hegde July 13, 2016, 9:27 a.m. UTC | #3
On 07/13/2016 02:53 PM, Vasant Hegde wrote:
> On 07/04/2016 03:18 PM, Mukesh Ojha wrote:
>> Few questions.
>>
>> On Saturday 02 July 2016 09:03 PM, Vasant Hegde wrote:
>>> In some corner cases host may send acknowledgement without
>>> reading actual data (fsp_opal_elog_info -> fsp_opal_elog_ack).
>>> Because of this elog_read_from_fsp_head_state may be stuck in
>>> wrong state (ELOG_STATE_HOST_INFO) and not able to send remaining
>>> ELOG's to host. Hence reset ELOG state and start sending remaining
>>> ELOG's.
>>>
>>> Also in normal case we will ACK the logs which are already processed
>>> (elog_read_processed). Hence rearrange the code such that we go
>>> through elog_read_processed first.
>>>
>>> Finally return OPAL_PARAMETER if we are not able to find ELOG ID.
>>>
>>> Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
>>> ---
>>>   hw/fsp/fsp-elog-read.c  | 18 +++++++++++++++---
>>>   hw/fsp/fsp-elog-write.c |  3 +++
>>>   2 files changed, 18 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/hw/fsp/fsp-elog-read.c b/hw/fsp/fsp-elog-read.c
>>> index b606236..8c01a1c 100644
>>> --- a/hw/fsp/fsp-elog-read.c
>>> +++ b/hw/fsp/fsp-elog-read.c
>>> @@ -416,21 +416,33 @@ static int64_t fsp_opal_elog_ack(uint64_t ack_id)
>>>           return rc;
>>>       }
>>>       lock(&elog_read_lock);
>>> -    list_for_each_safe(&elog_read_pending, record, next_record, link) {
>>> +    list_for_each_safe(&elog_read_processed, record, next_record, link) {
>>>           if (record->log_id != ack_id)
>>>               continue;
>>>           list_del(&record->link);
>>>           list_add(&elog_read_free, &record->link);
>>> +        unlock(&elog_read_lock);
>>> +        return rc;
>>>       }
>>> -    list_for_each_safe(&elog_read_processed, record, next_record, link) {
>>> +    list_for_each_safe(&elog_read_pending, record, next_record, link) {
>>
>> Is it really necessary to check the ack_id in the pending list?
>> As this function will be called only for id which is already in the processed
>> list.
>> Same we do it for opal errorlogs.
>
> Its required. If host ACKs without reading it.. then it will be in pending list.
>
>
>>>           if (record->log_id != ack_id)
>>>               continue;
>>> +        /* It means host has sent ACK without reading actual data.
>>> +         * Because of this elog_read_from_fsp_head_state may be
>>> +         * stuck in wrong state (ELOG_STATE_HOST_INFO) and not able
>>> +         * to send remaning ELOG's to host. Hence reset ELOG state
>>
>> s/remaning/remaining
>
> Ah yes.. will fix .

@Stewart,
   If you don't have any other comment/suggestions, can you fix this spell 
mistake before merging ?

-Vasant
Stewart Smith July 21, 2016, 6:49 a.m. UTC | #4
Vasant Hegde <hegdevasant@linux.vnet.ibm.com> writes:
> On 07/13/2016 02:53 PM, Vasant Hegde wrote:
>> On 07/04/2016 03:18 PM, Mukesh Ojha wrote:
>>> Few questions.
>>>
>>> On Saturday 02 July 2016 09:03 PM, Vasant Hegde wrote:
>>>> In some corner cases host may send acknowledgement without
>>>> reading actual data (fsp_opal_elog_info -> fsp_opal_elog_ack).
>>>> Because of this elog_read_from_fsp_head_state may be stuck in
>>>> wrong state (ELOG_STATE_HOST_INFO) and not able to send remaining
>>>> ELOG's to host. Hence reset ELOG state and start sending remaining
>>>> ELOG's.
>>>>
>>>> Also in normal case we will ACK the logs which are already processed
>>>> (elog_read_processed). Hence rearrange the code such that we go
>>>> through elog_read_processed first.
>>>>
>>>> Finally return OPAL_PARAMETER if we are not able to find ELOG ID.
>>>>
>>>> Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
>>>> ---
>>>>   hw/fsp/fsp-elog-read.c  | 18 +++++++++++++++---
>>>>   hw/fsp/fsp-elog-write.c |  3 +++
>>>>   2 files changed, 18 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/hw/fsp/fsp-elog-read.c b/hw/fsp/fsp-elog-read.c
>>>> index b606236..8c01a1c 100644
>>>> --- a/hw/fsp/fsp-elog-read.c
>>>> +++ b/hw/fsp/fsp-elog-read.c
>>>> @@ -416,21 +416,33 @@ static int64_t fsp_opal_elog_ack(uint64_t ack_id)
>>>>           return rc;
>>>>       }
>>>>       lock(&elog_read_lock);
>>>> -    list_for_each_safe(&elog_read_pending, record, next_record, link) {
>>>> +    list_for_each_safe(&elog_read_processed, record, next_record, link) {
>>>>           if (record->log_id != ack_id)
>>>>               continue;
>>>>           list_del(&record->link);
>>>>           list_add(&elog_read_free, &record->link);
>>>> +        unlock(&elog_read_lock);
>>>> +        return rc;
>>>>       }
>>>> -    list_for_each_safe(&elog_read_processed, record, next_record, link) {
>>>> +    list_for_each_safe(&elog_read_pending, record, next_record, link) {
>>>
>>> Is it really necessary to check the ack_id in the pending list?
>>> As this function will be called only for id which is already in the processed
>>> list.
>>> Same we do it for opal errorlogs.
>>
>> Its required. If host ACKs without reading it.. then it will be in pending list.
>>
>>
>>>>           if (record->log_id != ack_id)
>>>>               continue;
>>>> +        /* It means host has sent ACK without reading actual data.
>>>> +         * Because of this elog_read_from_fsp_head_state may be
>>>> +         * stuck in wrong state (ELOG_STATE_HOST_INFO) and not able
>>>> +         * to send remaning ELOG's to host. Hence reset ELOG state
>>>
>>> s/remaning/remaining
>>
>> Ah yes.. will fix .
>
> @Stewart,
>    If you don't have any other comment/suggestions, can you fix this spell 
> mistake before merging ?

Sure, spelling fixed when I merged.
diff mbox

Patch

diff --git a/hw/fsp/fsp-elog-read.c b/hw/fsp/fsp-elog-read.c
index b606236..8c01a1c 100644
--- a/hw/fsp/fsp-elog-read.c
+++ b/hw/fsp/fsp-elog-read.c
@@ -416,21 +416,33 @@  static int64_t fsp_opal_elog_ack(uint64_t ack_id)
 		return rc;
 	}
 	lock(&elog_read_lock);
-	list_for_each_safe(&elog_read_pending, record, next_record, link) {
+	list_for_each_safe(&elog_read_processed, record, next_record, link) {
 		if (record->log_id != ack_id)
 			continue;
 		list_del(&record->link);
 		list_add(&elog_read_free, &record->link);
+		unlock(&elog_read_lock);
+		return rc;
 	}
-	list_for_each_safe(&elog_read_processed, record, next_record, link) {
+	list_for_each_safe(&elog_read_pending, record, next_record, link) {
 		if (record->log_id != ack_id)
 			continue;
+		/* It means host has sent ACK without reading actual data.
+		 * Because of this elog_read_from_fsp_head_state may be
+		 * stuck in wrong state (ELOG_STATE_HOST_INFO) and not able
+		 * to send remaning ELOG's to host. Hence reset ELOG state
+		 * and start sending remaining ELOG's.
+		 */
 		list_del(&record->link);
 		list_add(&elog_read_free, &record->link);
+		elog_reject_head();
+		unlock(&elog_read_lock);
+		fsp_elog_check_and_fetch_head();
+		return rc;
 	}
 	unlock(&elog_read_lock);
 
-	return rc;
+	return OPAL_PARAMETER;
 }
 
 /*
diff --git a/hw/fsp/fsp-elog-write.c b/hw/fsp/fsp-elog-write.c
index 80a0a39..b78bc20 100644
--- a/hw/fsp/fsp-elog-write.c
+++ b/hw/fsp/fsp-elog-write.c
@@ -248,6 +248,9 @@  bool opal_elog_ack(uint64_t ack_id)
 			list_del(&record->link);
 			opal_elog_complete(record, true);
 			rc = true;
+			unlock(&elog_write_to_host_lock);
+			opal_commit_elog_in_host();
+			return rc;
 		}
 	}
 	unlock(&elog_write_to_host_lock);