diff mbox series

[13/51] libflash/ipmi-hiomap: Don't remove acked events unless we recover

Message ID 20190215065708.6086-14-andrew@aj.id.au
State Changes Requested
Headers show
Series ipmi-hiomap: Tests and fixes for event handling | expand

Commit Message

Andrew Jeffery Feb. 15, 2019, 6:56 a.m. UTC
Whilst we send the acks to the BMC, if they are removed before the
recovery is complete we may wedge the host if the error was transient.
There's no harm in acking already-acked events, and we'll need to send
an ack anyway if another ack-able event has occurred in the meantime.

Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
---
 libflash/ipmi-hiomap.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

Comments

Andrew Jeffery Feb. 15, 2019, 10:55 p.m. UTC | #1
On Fri, 15 Feb 2019, at 17:28, Andrew Jeffery wrote:
> Whilst we send the acks to the BMC, if they are removed before the
> recovery is complete we may wedge the host if the error was transient.
> There's no harm in acking already-acked events, and we'll need to send
> an ack anyway if another ack-able event has occurred in the meantime.
> 
> Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
> ---
>  libflash/ipmi-hiomap.c | 9 ++++++++-
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/libflash/ipmi-hiomap.c b/libflash/ipmi-hiomap.c
> index 86e47396efba..97d2a602af96 100644
> --- a/libflash/ipmi-hiomap.c
> +++ b/libflash/ipmi-hiomap.c
> @@ -637,7 +637,6 @@ static int ipmi_hiomap_handle_events(struct 
> ipmi_hiomap *ctx)
>  
>  	lock(&ctx->lock);
>  	status = ctx->bmc_state;
> -	ctx->bmc_state &= ~HIOMAP_E_ACK_MASK;
>  	unlock(&ctx->lock);
>  
>  	if (!(status & HIOMAP_E_DAEMON_READY)) {
> @@ -681,6 +680,10 @@ static int ipmi_hiomap_handle_events(struct 
> ipmi_hiomap *ctx)
>  			return rc;
>  		}
>  
> +		lock(&ctx->lock);
> +		ctx->bmc_state &= ~HIOMAP_E_PROTOCOL_RESET;
> +		unlock(&ctx->lock);
> +
>  		prlog(PR_INFO, "Restored window state after protocol reset\n");
>  		return 0;
>  	}
> @@ -694,6 +697,10 @@ static int ipmi_hiomap_handle_events(struct 
> ipmi_hiomap *ctx)
>  			return rc;
>  		}
>  
> +		lock(&ctx->lock);
> +		ctx->bmc_state &= ~HIOMAP_E_WINDOW_RESET;
> +		unlock(&ctx->lock);
> +
>  		prlog(PR_INFO, "Restored window state after window reset\n");
>  	}
>  

I came up with a test case that I think will break this change. If we get a reset after recovering the window but before the clear the new reset will be lost. It's not a problem up to and including the window recovery because we'll get an error response and bail out before the clear.

I'll fix this in a v2 after any other comments have been made.

> -- 
> 2.19.1
> 
>
Stewart Smith Feb. 18, 2019, 1:08 a.m. UTC | #2
"Andrew Jeffery" <andrew@aj.id.au> writes:
> On Fri, 15 Feb 2019, at 17:28, Andrew Jeffery wrote:
>> Whilst we send the acks to the BMC, if they are removed before the
>> recovery is complete we may wedge the host if the error was transient.
>> There's no harm in acking already-acked events, and we'll need to send
>> an ack anyway if another ack-able event has occurred in the meantime.
>> 
>> Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
>> ---
>>  libflash/ipmi-hiomap.c | 9 ++++++++-
>>  1 file changed, 8 insertions(+), 1 deletion(-)
>> 
>> diff --git a/libflash/ipmi-hiomap.c b/libflash/ipmi-hiomap.c
>> index 86e47396efba..97d2a602af96 100644
>> --- a/libflash/ipmi-hiomap.c
>> +++ b/libflash/ipmi-hiomap.c
>> @@ -637,7 +637,6 @@ static int ipmi_hiomap_handle_events(struct 
>> ipmi_hiomap *ctx)
>>  
>>  	lock(&ctx->lock);
>>  	status = ctx->bmc_state;
>> -	ctx->bmc_state &= ~HIOMAP_E_ACK_MASK;
>>  	unlock(&ctx->lock);
>>  
>>  	if (!(status & HIOMAP_E_DAEMON_READY)) {
>> @@ -681,6 +680,10 @@ static int ipmi_hiomap_handle_events(struct 
>> ipmi_hiomap *ctx)
>>  			return rc;
>>  		}
>>  
>> +		lock(&ctx->lock);
>> +		ctx->bmc_state &= ~HIOMAP_E_PROTOCOL_RESET;
>> +		unlock(&ctx->lock);
>> +
>>  		prlog(PR_INFO, "Restored window state after protocol reset\n");
>>  		return 0;
>>  	}
>> @@ -694,6 +697,10 @@ static int ipmi_hiomap_handle_events(struct 
>> ipmi_hiomap *ctx)
>>  			return rc;
>>  		}
>>  
>> +		lock(&ctx->lock);
>> +		ctx->bmc_state &= ~HIOMAP_E_WINDOW_RESET;
>> +		unlock(&ctx->lock);
>> +
>>  		prlog(PR_INFO, "Restored window state after window reset\n");
>>  	}
>>  
>
> I came up with a test case that I think will break this change. If we get a reset after recovering the window but before the clear the new reset will be lost. It's not a problem up to and including the window recovery because we'll get an error response and bail out before the clear.
>
> I'll fix this in a v2 after any other comments have been made.

Cool, I'll wait for v2, but should this patch also go to stable?
Andrew Jeffery Feb. 18, 2019, 2:35 a.m. UTC | #3
On Mon, 18 Feb 2019, at 11:38, Stewart Smith wrote:
> "Andrew Jeffery" <andrew@aj.id.au> writes:
> > On Fri, 15 Feb 2019, at 17:28, Andrew Jeffery wrote:
> >> Whilst we send the acks to the BMC, if they are removed before the
> >> recovery is complete we may wedge the host if the error was transient.
> >> There's no harm in acking already-acked events, and we'll need to send
> >> an ack anyway if another ack-able event has occurred in the meantime.
> >> 
> >> Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
> >> ---
> >>  libflash/ipmi-hiomap.c | 9 ++++++++-
> >>  1 file changed, 8 insertions(+), 1 deletion(-)
> >> 
> >> diff --git a/libflash/ipmi-hiomap.c b/libflash/ipmi-hiomap.c
> >> index 86e47396efba..97d2a602af96 100644
> >> --- a/libflash/ipmi-hiomap.c
> >> +++ b/libflash/ipmi-hiomap.c
> >> @@ -637,7 +637,6 @@ static int ipmi_hiomap_handle_events(struct 
> >> ipmi_hiomap *ctx)
> >>  
> >>  	lock(&ctx->lock);
> >>  	status = ctx->bmc_state;
> >> -	ctx->bmc_state &= ~HIOMAP_E_ACK_MASK;
> >>  	unlock(&ctx->lock);
> >>  
> >>  	if (!(status & HIOMAP_E_DAEMON_READY)) {
> >> @@ -681,6 +680,10 @@ static int ipmi_hiomap_handle_events(struct 
> >> ipmi_hiomap *ctx)
> >>  			return rc;
> >>  		}
> >>  
> >> +		lock(&ctx->lock);
> >> +		ctx->bmc_state &= ~HIOMAP_E_PROTOCOL_RESET;
> >> +		unlock(&ctx->lock);
> >> +
> >>  		prlog(PR_INFO, "Restored window state after protocol reset\n");
> >>  		return 0;
> >>  	}
> >> @@ -694,6 +697,10 @@ static int ipmi_hiomap_handle_events(struct 
> >> ipmi_hiomap *ctx)
> >>  			return rc;
> >>  		}
> >>  
> >> +		lock(&ctx->lock);
> >> +		ctx->bmc_state &= ~HIOMAP_E_WINDOW_RESET;
> >> +		unlock(&ctx->lock);
> >> +
> >>  		prlog(PR_INFO, "Restored window state after window reset\n");
> >>  	}
> >>  
> >
> > I came up with a test case that I think will break this change. If we get a reset after recovering the window but before the clear the new reset will be lost. It's not a problem up to and including the window recovery because we'll get an error response and bail out before the clear.
> >
> > I'll fix this in a v2 after any other comments have been made.
> 
> Cool, I'll wait for v2, but should this patch also go to stable?

As discussed out of band, the whole series probably should. I'll
Cc stable for all patches in v2.

> 
> -- 
> Stewart Smith
> OPAL Architect, IBM.
>
diff mbox series

Patch

diff --git a/libflash/ipmi-hiomap.c b/libflash/ipmi-hiomap.c
index 86e47396efba..97d2a602af96 100644
--- a/libflash/ipmi-hiomap.c
+++ b/libflash/ipmi-hiomap.c
@@ -637,7 +637,6 @@  static int ipmi_hiomap_handle_events(struct ipmi_hiomap *ctx)
 
 	lock(&ctx->lock);
 	status = ctx->bmc_state;
-	ctx->bmc_state &= ~HIOMAP_E_ACK_MASK;
 	unlock(&ctx->lock);
 
 	if (!(status & HIOMAP_E_DAEMON_READY)) {
@@ -681,6 +680,10 @@  static int ipmi_hiomap_handle_events(struct ipmi_hiomap *ctx)
 			return rc;
 		}
 
+		lock(&ctx->lock);
+		ctx->bmc_state &= ~HIOMAP_E_PROTOCOL_RESET;
+		unlock(&ctx->lock);
+
 		prlog(PR_INFO, "Restored window state after protocol reset\n");
 		return 0;
 	}
@@ -694,6 +697,10 @@  static int ipmi_hiomap_handle_events(struct ipmi_hiomap *ctx)
 			return rc;
 		}
 
+		lock(&ctx->lock);
+		ctx->bmc_state &= ~HIOMAP_E_WINDOW_RESET;
+		unlock(&ctx->lock);
+
 		prlog(PR_INFO, "Restored window state after window reset\n");
 	}