[v3,06/13] hiomap: Enable Async IPMI messaging

Message ID 1573060953-6464-7-git-send-email-debmc@linux.ibm.com
State Changes Requested
Series ipmi-hiomap: Enablement for Async opal_flash_op's

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch master (d75e82dbfbb9443efeb3f9a5921ac23605aab469)
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot fail Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot-dco success Signed-off-by present

Commit Message

Deb McLemore Nov. 6, 2019, 5:22 p.m. UTC
To let higher layers perform async operations on
the target flash, enable hiomap to do async
IPMI messaging for call paths through opal_flash_op.

Special consideration is given to preventing
recursive locking and polling inside OPAL calls.

The objective is to remove hiomap_queue_msg_sync
from window moves (which already use pollers) to
allow async requests to perform their functions.

Call paths through opal_flash_op will determine if
the requested operation needs to be re-queued, to
allow skiboot to jump back out to Linux and prevent
RCU or scheduler issues.

For example, if a flash window move operation
is needed to erase a very large flash segment,
the time spent in OPAL firmware would be long
enough that Linux would stall.  To avoid this,
the work is divided into smaller chunks so that
time spent in OPAL firmware does not accumulate.
The return codes distinguish cases where a
re-queue of the transaction is needed from true
failure return codes.

PR_TRACE is used since PR_DEBUG seems to always
trigger; unable to figure out why.

Redefining the sync versus async capabilities
makes this patch larger than desired, because
the changes to functionality are interrelated.

Signed-off-by: Deb McLemore <debmc@linux.ibm.com>
---
 core/flash.c           | 154 +++++++-
 libflash/blocklevel.h  |   5 +
 libflash/errors.h      |   1 +
 libflash/ipmi-hiomap.c | 955 ++++++++++++++++++++++++++++++++++++++-----------
 libflash/ipmi-hiomap.h |  41 ++-
 5 files changed, 936 insertions(+), 220 deletions(-)

Comments

Oliver O'Halloran Nov. 25, 2019, 3:12 a.m. UTC | #1
On Thu, Nov 7, 2019 at 4:25 AM Deb McLemore <debmc@linux.ibm.com> wrote:
>
> To let higher layers perform async operations on
> the target flash, enable hiomap to do async
> IPMI messaging for call paths through opal_flash_op.
>
> Special consideration is given to preventing
> recursive locking and polling inside OPAL calls.
>
> The objective is to remove hiomap_queue_msg_sync
> from window moves (which already use pollers) to
> allow async requests to perform their functions.
>
> Call paths through opal_flash_op will determine if
> the requested operation needs to be re-queued, to
> allow skiboot to jump back out to Linux and prevent
> RCU or scheduler issues.
>
> For example, if a flash window move operation
> is needed to erase a very large flash segment,
> the time spent in OPAL firmware would be long
> enough that Linux would stall.  To avoid this,
> the work is divided into smaller chunks so that
> time spent in OPAL firmware does not accumulate.
> The return codes distinguish cases where a
> re-queue of the transaction is needed from true
> failure return codes.
>
> PR_TRACE is used since PR_DEBUG seems to always
> trigger; unable to figure out why.
>
> Redefining the sync versus async capabilities
> makes this patch larger than desired, because
> the changes to functionality are interrelated.
>
> Signed-off-by: Deb McLemore <debmc@linux.ibm.com>
> ---
>  core/flash.c           | 154 +++++++-
>  libflash/blocklevel.h  |   5 +
>  libflash/errors.h      |   1 +
>  libflash/ipmi-hiomap.c | 955 ++++++++++++++++++++++++++++++++++++++-----------
>  libflash/ipmi-hiomap.h |  41 ++-
>  5 files changed, 936 insertions(+), 220 deletions(-)
>
> diff --git a/core/flash.c b/core/flash.c
> index e98c8e0..98614f7 100644
> --- a/core/flash.c
> +++ b/core/flash.c
> @@ -28,6 +28,14 @@
>  #include <timer.h>
>  #include <timebase.h>
>
> +/*
> + * need to keep this under the BT queue limit
> + * causes are when ipmi to bmc gets bogged down
> + * delay allows for congestion to clear
> + */
> +#define FLASH_RETRY_LIMIT 10
> +#define FLASH_SCHEDULE_DELAY 200
> +
>  enum flash_op {
>         FLASH_OP_READ,
>         FLASH_OP_WRITE,
> @@ -41,6 +49,9 @@ struct flash_async_info {
>         uint64_t pos;
>         uint64_t len;
>         uint64_t buf;
> +       int retry_counter;
> +       int transaction_id;
> +       int in_progress_schedule_delay;
>  };
>
>  struct flash {
> @@ -80,13 +91,63 @@ static u32 nvram_offset, nvram_size;
>  static bool flash_reserve(struct flash *flash)
>  {
>         bool rc = false;
> +       int lock_try_counter = 10;
> +       uint64_t now;
> +       uint64_t start_time;
> +       uint64_t wait_time;
> +       uint64_t flash_reserve_ticks = 10;
> +       uint64_t timeout_counter;
> +
> +       start_time = mftb();
> +       now = mftb();
> +       wait_time = tb_to_msecs(now - start_time);

wait_time is always going to be zero since the two mftb() calls are
going to return pretty similar values and tb_to_msecs() divides the
difference by 512000 (tb_hz / 1000).
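
For reference, a sketch of the arithmetic (with tb_hz at 512MHz):

start_time = mftb();
now = mftb();                    /* differs by a handful of ticks */
wait_time = tb_to_msecs(now - start_time);  /* ~10 / 512000 == 0 */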

> +       timeout_counter = 0;
> +
> +
> +       while (wait_time < flash_reserve_ticks) {
> +               ++timeout_counter;
> +               if (timeout_counter % 4 == 0) {
> +                       now = mftb();
> +                       wait_time = tb_to_msecs(now - start_time);
> +               }
> +               if (flash->busy == false) {
> +                       break;
> +               }
> +       }

This doesn't make any sense to me. What are you doing here and why?

> -       if (!try_lock(&flash_lock))
> +       while (!try_lock(&flash_lock)) {
> +               --lock_try_counter;
> +               if (lock_try_counter == 0) {
> +                       break;
> +               }
> +       }

If you're going to do this sort of thing then use a for loop so the
iteration count is set in the same place it's being used.

> +       if (lock_try_counter == 0) {
>                 return false;
> +       }
You should return from inside the loop instead of breaking out and
re-checking the break condition.
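
Something like this untested sketch (FLASH_LOCK_TRIES is a made-up
name) keeps the retry count in one place and fails from inside the
loop:

int i;

for (i = 0; !try_lock(&flash_lock); i++) {
	if (i == FLASH_LOCK_TRIES)
		return false;	/* never got the lock */
}
/* flash_lock is held from here on */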

> +
> +       /* we have the lock if we got here */
>
>         if (!flash->busy) {
>                 flash->busy = true;
>                 rc = true;
> +       } else {
> +               /* probably beat a flash_release and grabbed the lock */
> +               unlock(&flash_lock);
> +               while (!try_lock(&flash_lock)) {
> +                       --lock_try_counter;
> +                       if (lock_try_counter == 0) {
> +                               break;
> +                       }
> +               }
> +               if (lock_try_counter == 0) {
> +                       return false;
> +               }
> +               /* we have the lock if we are here */
> +               if (!flash->busy) {
> +                       flash->busy = true;
> +                       rc = true;
> +               }

This doesn't make much sense to me either. Why is replicating all the
previous logic necessary?

>         }
>         unlock(&flash_lock);

I'm not sure that we even need to use try_lock() with patch 03/13
applied. With that patch the flash's lock should only be held briefly
so we should be able to use the normal (blocking) lock() since we
don't need to deal with waiting for a 64MB flash read to complete.
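
i.e. with 03/13 applied I'd expect the whole thing to collapse back to
something like this (untested sketch):

static bool flash_reserve(struct flash *flash)
{
	bool rc = false;

	lock(&flash_lock);
	if (!flash->busy) {
		flash->busy = true;
		rc = true;
	}
	unlock(&flash_lock);

	return rc;
}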

> @@ -279,12 +340,31 @@ static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unus
>                 assert(0);
>         }
>
> -       if (rc)
> -               rc = OPAL_HARDWARE;
> +       if (rc == 0) {
> +               /*
> +                * if we are good to proceed forward
> +                * otherwise we may have to try again
> +                */
> +               flash->async.pos += len;
> +               flash->async.buf += len;
> +               flash->async.len -= len;
> +               /* if we good clear */
> +               flash->async.retry_counter = 0;
> +               /*
> +                * clear the IN_PROGRESS flags
> +                * we only need IN_PROGRESS active on missed_cc
> +                */
> +               flash->bl->flags &= IN_PROGRESS_MASK;
> +               /* reset time for scheduling gap */
> +               flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;

The default delay should come from the blocklevel rather than being
assumed by the user of libflash. The bl has a better idea about what
sort of interval is required between ops and what a reasonable timeout
is.
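
e.g. something along these lines (the field name is made up for
illustration):

/* libflash/blocklevel.h */
struct blocklevel_device {
	...
	/* delay between async retries, filled in by the backing bl */
	uint32_t async_retry_delay_ms;
};

/* core/flash.c then consumes it rather than assuming one: */
flash->async.in_progress_schedule_delay =
	flash->bl->async_retry_delay_ms;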

> +       }
> +
> +       /*
> +        * corner cases if the move window misses and
> +        * the requested window is split (needing adjustment down) problem
> +        * if timing good on move_cb the world is good
> +        */
I can't parse this, but it seems like it's all blocklevel-specific
stuff that should be documented there. From the perspective of a
libflash user we don't want to know or care about *why* we need to do
an async delay, just that we have to.

> -       flash->async.pos += len;
> -       flash->async.buf += len;
> -       flash->async.len -= len;
>         if (!rc && flash->async.len) {
>                 /*
>                  * We want to get called pretty much straight away, just have
> @@ -293,10 +373,38 @@ static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unus
>                  */
>                 schedule_timer(&flash->async.poller, 0);
>                 return;
> +       } else {
> +               if (rc == FLASH_ERR_MISSED_CC) {

Try to avoid stacking indentation levels. It causes us to run into the
80col limit even faster and tracing through nested if chains gets
tedious pretty quickly.

You can avoid it using linear if() ... else if() ... else chains, e.g.

if (rc == FLASH_ERR_MISSED_CC) {
   /* new stuff goes here */
} else if (rc) {
   /* old stuff goes here */
}

> +                       ++flash->async.retry_counter;
> +                       flash->async.in_progress_schedule_delay += FLASH_SCHEDULE_DELAY;
> +                       if (flash->async.retry_counter >= FLASH_RETRY_LIMIT) {
> +                               rc = OPAL_HARDWARE;
> +                               prlog(PR_TRACE, "flash_poll PROBLEM FLASH_RETRY_LIMIT of %i reached on transaction_id=%i\n",
> +                                       FLASH_RETRY_LIMIT,
> +                                       flash->async.transaction_id);
> +                       } else {
> +                               /*
> +                                * give the BT time to work and receive response
> +                                * throttle back to allow for congestion to clear
> +                                * cases observed were complete lack of ipmi response until very late
> +                                * or cases immediately following an unaligned window read/move (so slow)
> +                                */
> +                               flash->bl->flags |= IN_PROGRESS;
> +                               schedule_timer(&flash->async.poller, msecs_to_tb(flash->async.in_progress_schedule_delay));
> +                               return;
> +                       }
> +               } else {
> +                       if (rc != 0) {
> +                               rc = OPAL_HARDWARE;
> +                       }
> +               }
>         }
>
> -       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
> +       flash->bl->flags &= IN_PROGRESS_MASK;
> +       flash->bl->flags &= ASYNC_REQUIRED_MASK;

Masks are for defining bit fields not the inverse of a specific bit.
Use the normal pattern for clearing a flag:
     flash->bl->flags &= ~IN_PROGRESS;
     flash->bl->flags &= ~ASYNC_REQUIRED;

> +       /* release the flash before we allow next opal entry */
>         flash_release(flash);
> +       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
>  }
>
>  static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
> @@ -454,6 +562,7 @@ int flash_register(struct blocklevel_device *bl)
>         flash->size = size;
>         flash->block_size = block_size;
>         flash->id = num_flashes();
> +       flash->async.transaction_id = 0;
>         init_timer(&flash->async.poller, flash_poll, flash);
>
>         rc = ffs_init(0, flash->size, bl, &ffs, 1);
> @@ -487,7 +596,7 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>                 uint64_t buf, uint64_t size, uint64_t token)
>  {
>         struct flash *flash = NULL;
> -       uint64_t len;
> +       uint64_t len = 0;
Is that needed?

>         int rc;
>
>         list_for_each(&flashes, flash, list)
> @@ -516,6 +625,10 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>         flash->async.buf = buf + len;
>         flash->async.len = size - len;
>         flash->async.pos = offset + len;
> +       flash->async.retry_counter = 0;
> +       flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
> +       flash->bl->flags |= ASYNC_REQUIRED;
> +       ++flash->async.transaction_id;
Use post-increment unless you absolutely have to use pre-increment.
There are only a few instances where pre-increment is used in skiboot
and I'd like to keep it that way.

>
>         /*
>          * These ops intentionally have no smarts (ecc correction or erase
> @@ -539,8 +652,27 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>         }
>
>         if (rc) {
> -               prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
> -               rc = OPAL_HARDWARE;
> +               if (rc == FLASH_ERR_MISSED_CC) {

same comment about stacking indents

> +                       ++flash->async.retry_counter;
> +                       flash->async.buf = buf;
> +                       flash->async.len = size;
> +                       flash->async.pos = offset;
> +                       /* for completeness, opal_flash_op is first time pass so unless the retry limit set to 1 */
> +                       if (flash->async.retry_counter >= FLASH_RETRY_LIMIT) {
> +                               rc = OPAL_HARDWARE;
> +                               prlog(PR_TRACE, "opal_flash_op PROBLEM FLASH_RETRY_LIMIT of %i reached on transaction_id=%i\n",
> +                                       FLASH_RETRY_LIMIT,
> +                                       flash->async.transaction_id);
> +                               goto out;
> +                       }
> +                       flash->bl->flags |= IN_PROGRESS;
> +                       schedule_timer(&flash->async.poller, msecs_to_tb(FLASH_SCHEDULE_DELAY));
> +                       /* Don't release the flash */
explain why

> +                       return OPAL_ASYNC_COMPLETION;
> +               } else {
> +                       prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
> +                       rc = OPAL_HARDWARE;
> +               }
>                 goto out;
>         }
>
> @@ -564,6 +696,8 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>         rc = OPAL_ASYNC_COMPLETION;
>  out:
>         flash_release(flash);
> +       flash->bl->flags &= IN_PROGRESS_MASK;
> +       flash->bl->flags &= ASYNC_REQUIRED_MASK;

>         return rc;
>  }
>
> diff --git a/libflash/blocklevel.h b/libflash/blocklevel.h
> index 492918e..63d8690 100644
> --- a/libflash/blocklevel.h
> +++ b/libflash/blocklevel.h
> @@ -18,8 +18,13 @@ struct blocklevel_range {
>         int total_prot;
>  };
>
> +#define ASYNC_REQUIRED_MASK 0xFFFB
> +#define IN_PROGRESS_MASK 0xFFF7
these can go.

> +
>  enum blocklevel_flags {
>         WRITE_NEED_ERASE = 1,
> +       ASYNC_REQUIRED = 4,
> +       IN_PROGRESS = 8,
>  };
>
>  /*
> diff --git a/libflash/errors.h b/libflash/errors.h
> index c800ada..c24166d 100644
> --- a/libflash/errors.h
> +++ b/libflash/errors.h
> @@ -21,6 +21,7 @@
>  #define FLASH_ERR_BAD_READ             15
>  #define FLASH_ERR_DEVICE_GONE  16
>  #define FLASH_ERR_AGAIN        17
> +#define FLASH_ERR_MISSED_CC    18

The libflash return codes are supposed to be independent of the
backing blocklevel. As far as I can tell this is being used as a
generic "async continuation required" return code, similar to how we
use OPAL_ASYNC_COMPLETION, so rename it to something that reflects that.
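
e.g. (the name is only a suggestion):

#define FLASH_ERR_ASYNC_WORK	18	/* queued, call again later */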

>  #ifdef __SKIBOOT__
>  #include <skiboot.h>

*snip*

I'll send comments about the HIOMAP bits in a separate mail since this
is getting unwieldy.
Oliver O'Halloran Nov. 25, 2019, 6:59 a.m. UTC | #2
On Thu, Nov 7, 2019 at 4:25 AM Deb McLemore <debmc@linux.ibm.com> wrote:
>
> +static int hiomap_wait_for_cc(struct ipmi_hiomap *ctx, int *cc, uint8_t *seq, uint64_t ticks)
> +{
> +       uint64_t now;
> +       uint64_t start_time;
> +       uint64_t wait_time;
> +       uint64_t ipmi_hiomap_ticks;
> +       uint64_t timeout_counter;
> +       int rc;
> +
> +       prlog(PR_TRACE, "Start wait for cc ipmi seq=%i *cc=%i ticks=%llu\n", *seq, *cc, ticks);
> +       rc = 0;
> +       if (this_cpu()->tb_invalid) {
> +               /*
> +                * SYNC paths already have *cc success
> +                * ASYNC will RE-QUEUE and retry
> +                * we just need to skip the tb logic handling
> +                * we need to close the window to have the logic try the move again
> +                */
> +               if (*cc != IPMI_CC_NO_ERROR) {
> +                       lock(&ctx->lock);
> +                       ctx->window_state = closed_window;
> +                       ++ctx->missed_cc_count;
> +                       prlog(PR_NOTICE, "%s tb_invalid, CLOSING WINDOW for cc "
> +                               "ipmi seq=%i ctx->missed_cc_count=%i\n",
> +                               __func__, *seq, ctx->missed_cc_count);
> +                       unlock(&ctx->lock);
> +                       rc = FLASH_ERR_MISSED_CC;
> +               }
> +               prlog(PR_NOTICE, "%s tb_invalid, hopefully this will "
> +                       "retry/recover rc=%i\n",
> +                       __func__, rc);
> +               return rc;
> +       }

Mahesh, what's the correct way to deal with the timebase going invalid
at runtime if we need to do a time_wait()? You added a hack in
1764f2452565 ("opal: Fix hang in time_wait* calls on HMI for TB
errors.") to no-op the timebase if tb_invalid is set so we don't get
stuck in the HMI recovery path, but I guess it breaks anything else
that needs a wait? I would prefer we didn't start open-coding
tb_invalid checks.
Oliver O'Halloran Nov. 25, 2019, 9:10 a.m. UTC | #3
On Wed, 2019-11-06 at 11:22 -0600, Deb McLemore wrote:
> To let higher layers perform async operations on
> the target flash, enable hiomap to do async
> IPMI messaging for call paths through opal_flash_op.
> 
> Special consideration is given to preventing
> recursive locking and polling inside OPAL calls.
> 
> The objective is to remove hiomap_queue_msg_sync
> from window moves (which already use pollers) to
> allow async requests to perform their functions.
> 
> Call paths through opal_flash_op will determine if
> the requested operation needs to be re-queued, to
> allow skiboot to jump back out to Linux and prevent
> RCU or scheduler issues.
> 
> For example, if a flash window move operation
> is needed to erase a very large flash segment,
> the time spent in OPAL firmware would be long
> enough that Linux would stall.  To avoid this,
> the work is divided into smaller chunks so that
> time spent in OPAL firmware does not accumulate.
> The return codes distinguish cases where a
> re-queue of the transaction is needed from true
> failure return codes.


> PR_TRACE is used since PR_DEBUG seems to always
> trigger; unable to figure out why.

https://lists.ozlabs.org/pipermail/skiboot/2019-October/015645.html

> Redefining the sync versus async capabilities
> makes this patch larger than desired, because
> the changes to functionality are interrelated.

Yeah, nah.

The usual pattern for sync->async conversions is to re-implement the
sync operations on top of the async implementation, with the "sync"
functions waiting on a flag variable internally. Doing that then
allows you to expand the async bits gradually into the callers. See
i2c_request_sync() for an example.
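
The shape is roughly this (untested sketch, the hiomap names below are
made up for illustration):

struct hiomap_sync_comp {
	bool done;
	int rc;
};

static void hiomap_sync_cb(int rc, void *data)
{
	struct hiomap_sync_comp *comp = data;

	comp->rc = rc;
	comp->done = true;
}

static int hiomap_queue_msg_sync(struct ipmi_hiomap *ctx,
				 struct ipmi_msg *msg)
{
	struct hiomap_sync_comp comp = { .done = false };

	/* hypothetical async entry point taking a completion callback */
	hiomap_queue_msg(ctx, msg, hiomap_sync_cb, &comp);

	/* time_wait_ms() runs pollers, so IPMI responses keep flowing */
	while (!comp.done)
		time_wait_ms(10);

	return comp.rc;
}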

Even after this patch there are a lot of HIOMAP commands that are still
required to be sync, so what I'd do is:

1. Keep hiomap_queue_msg_sync() around and continue using it for
commands that need to be sync rather than faking it. That'll remove a
pile of churn to begin with.

2. Split the transaction lock() changes into a separate patch,
preferably with a commit message explaining why it's needed.

3. Re-implement the window movement using the new async methods. Keep
the external interface the same.

4. Now convert flash.c to use the new async interface. This might
require moving around where changes occur so that Cyril's async-lite
patch comes after the changes above.

5. Move some of the debug prints out of this patch and into a
follow-up. Pruning them while reviewing took out ~200 lines, so it'll
help.

Doing that should let you get this down to a few hundred lines at most.
That's still too large, but whatever, at least it's manageable.

I'm not asking you to do this for the sake of being a pain. I spent
all day trying to unpack what this patch actually does, and it only
took all day because it's over a thousand lines of random stuff
happening for random reasons. We don't ask for large changes to be
split into smaller ones because we demand the platonic ideal of a
patch. We do it so that it's possible to review them without sending
the reviewer insane.

> Signed-off-by: Deb McLemore <debmc@linux.ibm.com>
> ---
>  core/flash.c           | 154 +++++++-
>  libflash/blocklevel.h  |   5 +
>  libflash/errors.h      |   1 +
>  libflash/ipmi-hiomap.c | 955 ++++++++++++++++++++++++++++++++++++++-----------
>  libflash/ipmi-hiomap.h |  41 ++-
>  5 files changed, 936 insertions(+), 220 deletions(-)

flash changes are in the other mail

> diff --git a/libflash/ipmi-hiomap.c b/libflash/ipmi-hiomap.c
> index 7327b83..08af005 100644
> --- a/libflash/ipmi-hiomap.c
> +++ b/libflash/ipmi-hiomap.c
> @@ -11,6 +11,10 @@
>  #include <stdbool.h>
>  #include <stdint.h>
>  #include <string.h>
> +#include <lock.h>
> +#include <debug_descriptor.h>
> +#include <timebase.h>
> +#include <cpu.h>
>  
>  #include <ccan/container_of/container_of.h>
>  
> @@ -24,7 +28,7 @@ struct ipmi_hiomap_result {
>  	int16_t cc;
>  };
>  
> -#define RESULT_INIT(_name, _ctx) struct ipmi_hiomap_result _name = { _ctx, -1 }
> +static struct hiomap_v2_create_window *window_parms;
>  
>  static inline uint32_t blocks_to_bytes(struct ipmi_hiomap *ctx, uint16_t blocks)
>  {
> @@ -62,9 +66,20 @@ static int hiomap_protocol_ready(struct ipmi_hiomap *ctx)
>  	return 0;
>  }
>  
> -static int hiomap_queue_msg_sync(struct ipmi_hiomap *ctx, struct ipmi_msg *msg)
> +static int hiomap_queue_msg(struct ipmi_hiomap *ctx, struct ipmi_msg *msg)
>  {
>  	int rc;
> +	int bl_flags;
> +
> +	lock(&ctx->lock);
> +	bl_flags = ctx->bl.flags;
> +	unlock(&ctx->lock);
> +

> +	/*
> +	 * during boot caution to stay duration within skiboot/
> +	 * no exit re-entry due to poller conflicts with synchronous window moves
> +	 * asynchronous usage intended for opal_flash_op and flash_poll paths
> +	 */
doesn't parse


>  	/*
>  	 * There's an unavoidable TOCTOU race here with the BMC sending an
> @@ -73,17 +88,23 @@ static int hiomap_queue_msg_sync(struct ipmi_hiomap *ctx, struct ipmi_msg *msg)
>  	 * hiomap_queue_msg_sync() exists to capture the race in a single
>  	 * location.
>  	 */
> -	lock(&ctx->lock);
> -	rc = hiomap_protocol_ready(ctx);
> -	unlock(&ctx->lock);
> -	if (rc) {
> -		ipmi_free_msg(msg);
> -		return rc;
> -	}
>  
> -	ipmi_queue_msg_sync(msg);
> +	if ((opal_booting()) || (!(bl_flags & ASYNC_REQUIRED))) {
> +		lock(&ctx->lock);
> +		rc = hiomap_protocol_ready(ctx);
> +		unlock(&ctx->lock);
> +		if (rc) {
> +			ipmi_free_msg(msg);
> +			return rc;
> +		}
> +		prlog(PR_TRACE, "%s SENDING SYNC\n", __func__);
> +		ipmi_queue_msg_sync(msg);
> +	} else {
> +		prlog(PR_TRACE, "%s SENDING ASYNC\n", __func__);
> +		rc = ipmi_queue_msg(msg);

Is the hiomap_protocol_ready() check required in the async branch too?

> +	}
>  
> -	return 0;
> +	return rc;
>  }
>  
>  /* Call under ctx->lock */
> @@ -100,12 +121,178 @@ static int hiomap_window_valid(struct ipmi_hiomap *ctx, uint64_t pos,
>  		return FLASH_ERR_PARM_ERROR;
>  	if (pos < ctx->current.cur_pos)
>  		return FLASH_ERR_PARM_ERROR;
> -	if ((pos + len) > (ctx->current.cur_pos + ctx->current.size))
> -		return FLASH_ERR_PARM_ERROR;
> +	if ((pos + len) > (ctx->current.cur_pos + ctx->current.size)) {

> +		/* we will compensate the proper values in caller */
compensate?

> +		if ((pos + ctx->current.size) <= (ctx->current.cur_pos + ctx->current.size)) {
> +			prlog(PR_TRACE, "%s OK pos=%llu "
> +				"ctx->current.size=0x%x "
> +				"ctx->current.cur_pos=0x%x\n",
> +				__func__,
> +				pos,
> +				ctx->current.size,
> +				ctx->current.cur_pos);
> +		} else {
> +			prlog(PR_TRACE, "%s CHECKING further pos=%llu "
> +				"for len=%llu ctx->current.size=0x%x "
> +				"ctx->current.cur_pos=0x%x\n",
> +				__func__,
> +				pos,
> +				len,
> +				ctx->current.size,
> +				ctx->current.cur_pos);
> +			if ((pos + ctx->current.adjusted_window_size) <= (ctx->current.cur_pos + ctx->current.size)) {
> +				prlog(PR_TRACE, "%s OK use ADJUSTED pos=%llu "
> +					"adjusted_len=%i for len=%llu "
> +					"ctx->current.size=0x%x "
> +					"ctx->current.cur_pos=0x%x\n",
> +					__func__,
> +					pos,
> +					ctx->current.adjusted_window_size,
> +					len,
> +					ctx->current.size,
> +					ctx->current.cur_pos);
> +			} else {
> +				prlog(PR_TRACE, "%s we need to MOVE the window\n", __func__);
> +				return FLASH_ERR_PARM_ERROR;
> +			}
> +		}
> +	}
>  
> +	prlog(PR_TRACE, "%s ALL GOOD, no move needed\n", __func__);
>  	return 0;
>  }
>  

> +static void move_cb(struct ipmi_msg *msg)
> +{
> +	/*
> +	 * we leave the move_cb outside of the ipmi_hiomap_cmd_cb
> +	 * based on the command we need to special close the window
> +	 */
I don't see why this can't be handled in-line with the rest of the
hiomap commands. Why is closing the window required? Does the existing
sync path do that?

> +	struct ipmi_hiomap_result *res = msg->user_data;
> +	struct ipmi_hiomap *ctx = res->ctx;
> +	/*
> +	 * only a few iterations to try for lock/
> +	 * contention is probably hiomap_window_move trying to setup again
> +	 */
> +	int lock_try_counter = 10;
> +
> +	if ((msg->resp_size != 8) || (msg->cc != IPMI_CC_NO_ERROR) || (msg->data[1] != ctx->inflight_seq)) {
> +		lock(&ctx->lock);
> +		ctx->cc = OPAL_HARDWARE;
> +		ctx->window_state = closed_window;
> +		goto out;
> +	}
> +
> +	window_parms = (struct hiomap_v2_create_window *)&msg->data[2];

> +	while (!try_lock(&ctx->lock)) {
> +		--lock_try_counter;
> +		if (lock_try_counter == 0) {
> +			break;
> +		}
> +	}
> +	if (lock_try_counter == 0) {
> +		/*
> +		 * we cannot get the lock, but update anyway
> +		 * because we cannot communicate this completion
> +		 * and someone will need to retry
> +		 * contention usually with handle_events or window_move
> +		 * this code path is the critical path that will open the window
> +		 */
> +		ctx->window_state = closed_window;
> +		ctx->cc = OPAL_PARAMETER;
> +		goto out2;
> +	}
This lock retry stuff seems fundamentally sketchy and I don't see why
it's needed.

> +	/* If here, we got the lock, cc consumed higher up so need in ctx */
> +
> +	ctx->cc = IPMI_CC_NO_ERROR;
> +	ctx->current.lpc_addr =
> +		blocks_to_bytes(ctx, le16_to_cpu(window_parms->lpc_addr));
> +	ctx->current.size =
> +		blocks_to_bytes(ctx, le16_to_cpu(window_parms->size));
> +	ctx->current.cur_pos =
> +		blocks_to_bytes(ctx, le16_to_cpu(window_parms->offset));

> +	/* refresh to current */
> +	ctx->current.adjusted_window_size = ctx->current.size;
> +
> +	/* now that we have moved stuff the values */
> +	*ctx->active_size = ctx->requested_len;

> +	/*
> +	 * Is length past the end of the window?
> +	 * if this condition happens it can cause the async.retry_counter to fail

Again, explain why. The fact it can happen is interesting, but comments
should be answering more questions than they raise.

> +	if ((ctx->requested_pos + ctx->requested_len) > (ctx->current.cur_pos + ctx->current.size)) {
> +		/*
> +		 * Adjust size to meet current window
> +		 * active_size goes back to caller,
> +		 * but caller may expire and we need to store for future use
> +		 */
> +		*ctx->active_size = (ctx->current.cur_pos + ctx->current.size) - ctx->requested_pos;
> +		ctx->current.adjusted_window_size = (ctx->current.cur_pos + ctx->current.size) - ctx->requested_pos;
> +	}

Both of these variables have no business being set here. How much of
the current window we should read is only interesting at the point
where we do the actual read (i.e. in ipmi_hiomap_read()) and it should
be handled there rather than passing around a pointer (to a static
local variable?) and setting it in some random callback. 

hiomap_window_move() already returns the size of the mapped area so
truncate it there.
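
i.e. keep the clamp the old sync path did (the code this patch deletes
from hiomap_window_move()) next to where the size is consumed:

lock(&ctx->lock);
*size = len;
/* Is length past the end of the window? */
if ((pos + len) > (ctx->current.cur_pos + ctx->current.size))
	/* Adjust size to meet current window */
	*size = (ctx->current.cur_pos + ctx->current.size) - pos;
unlock(&ctx->lock);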

> +	if (ctx->requested_len != 0 && *ctx->active_size == 0) {
> +		prlog(PR_NOTICE, "%s Invalid window properties: len: %llu, size: %llu\n",
> +			__func__, ctx->requested_len, *ctx->active_size);
> +		ctx->cc = OPAL_PARAMETER;
> +		ctx->window_state = closed_window;
> +		goto out;
> +	}
> +
> +	if (msg->data[0] == HIOMAP_C_CREATE_READ_WINDOW)
> +		ctx->window_state = read_window;
> +	else
> +		ctx->window_state = write_window;
> +
> +out:	prlog(PR_TRACE, "Exiting the move window callback "
> +		"transaction ipmi seq=%i\n",
> +		ctx->inflight_seq);
> +	unlock(&ctx->lock);
> +out2:	ipmi_free_msg(msg);
> +}
> +
>  static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
>  {
>  	struct ipmi_hiomap_result *res = msg->user_data;
> @@ -125,9 +312,9 @@ static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
>  		return;
>  	}
>  
> -	if (msg->data[1] != ctx->seq) {
> +	if (msg->data[1] != ctx->inflight_seq) {
>  		prerror("Unmatched sequence number: wanted %u got %u\n",
> -			ctx->seq, msg->data[1]);
> +			ctx->inflight_seq, msg->data[1]);
>  		res->cc = IPMI_ERR_UNSPECIFIED;
>  		ipmi_free_msg(msg);
>  		return;
> @@ -138,6 +325,7 @@ static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
>  	{
>  		struct hiomap_v2_info *parms;
>  
> +		ctx->cc = IPMI_CC_NO_ERROR;
>  		if (msg->resp_size != 6) {
>  			prerror("%u: Unexpected response size: %u\n", msg->data[0],
>  				msg->resp_size);
> @@ -162,6 +350,7 @@ static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
>  	{
>  		struct hiomap_v2_flash_info *parms;
>  
> +		ctx->cc = IPMI_CC_NO_ERROR;
>  		if (msg->resp_size != 6) {
>  			prerror("%u: Unexpected response size: %u\n", msg->data[0],
>  				msg->resp_size);
> @@ -176,36 +365,6 @@ static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
>  			blocks_to_bytes(ctx, le16_to_cpu(parms->erase_granule));
>  		break;
>  	}
> -	case HIOMAP_C_CREATE_READ_WINDOW:
> -	case HIOMAP_C_CREATE_WRITE_WINDOW:
> -	{
> -		struct hiomap_v2_create_window *parms;
> -
> -		if (msg->resp_size != 8) {
> -			prerror("%u: Unexpected response size: %u\n", msg->data[0],
> -				msg->resp_size);
> -			res->cc = IPMI_ERR_UNSPECIFIED;
> -			break;
> -		}
> -
> -		parms = (struct hiomap_v2_create_window *)&msg->data[2];
> -
> -		ctx->current.lpc_addr =
> -			blocks_to_bytes(ctx, le16_to_cpu(parms->lpc_addr));
> -		ctx->current.size =
> -			blocks_to_bytes(ctx, le16_to_cpu(parms->size));
> -		ctx->current.cur_pos =
> -			blocks_to_bytes(ctx, le16_to_cpu(parms->offset));
> -
> -		lock(&ctx->lock);
> -		if (msg->data[0] == HIOMAP_C_CREATE_READ_WINDOW)
> -			ctx->window_state = read_window;
> -		else
> -			ctx->window_state = write_window;
> -		unlock(&ctx->lock);
> -
> -		break;
> -	}
>  	case HIOMAP_C_MARK_DIRTY:
>  	case HIOMAP_C_FLUSH:
>  	case HIOMAP_C_ACK:
> @@ -215,7 +374,15 @@ static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
>  			prerror("%u: Unexpected response size: %u\n", msg->data[0],
>  				msg->resp_size);
>  			res->cc = IPMI_ERR_UNSPECIFIED;
> +			ctx->cc = OPAL_HARDWARE;
>  			break;
> +		} else {
> +			prlog(PR_TRACE, "%s Command=%u 1=RESET 7=DIRTY 8=FLUSH 9=ACK 10=ERASE ipmi seq=%u ctx->inflight_seq=%u\n",\
> +				__func__,
> +				msg->data[0],
> +				msg->data[1],
> +				ctx->inflight_seq);
> +			ctx->cc = IPMI_CC_NO_ERROR;
>  		}
>  		break;
>  	default:
> @@ -237,57 +404,179 @@ static void hiomap_init(struct ipmi_hiomap *ctx)
>  	unlock(&ctx->lock);
>  }
>  
> +static int hiomap_wait_for_cc(struct ipmi_hiomap *ctx, int *cc, uint8_t *seq, uint64_t ticks)
> +{
> +	uint64_t now;
> +	uint64_t start_time;
> +	uint64_t wait_time;
> +	uint64_t ipmi_hiomap_ticks;
> +	uint64_t timeout_counter;
> +	int rc;
> +
> +	prlog(PR_TRACE, "Start wait for cc ipmi seq=%i *cc=%i ticks=%llu\n", *seq, *cc, ticks);
> +	rc = 0;
> +	if (this_cpu()->tb_invalid) {
> +		/*
> +		 * SYNC paths already have *cc success
> +		 * ASYNC will RE-QUEUE and retry
> +		 * we just need to skip the tb logic handling
> +		 * we need to close the window to have the logic try the move again
> +		 */
> +		if (*cc != IPMI_CC_NO_ERROR) {
> +			lock(&ctx->lock);
> +			ctx->window_state = closed_window;
> +			++ctx->missed_cc_count;
> +			prlog(PR_NOTICE, "%s tb_invalid, CLOSING WINDOW for cc "
> +				"ipmi seq=%i ctx->missed_cc_count=%i\n",
> +				__func__, *seq, ctx->missed_cc_count);
> +			unlock(&ctx->lock);
> +			rc = FLASH_ERR_MISSED_CC;
> +		}
> +		prlog(PR_NOTICE, "%s tb_invalid, hopefully this will "
> +			"retry/recover rc=%i\n",
> +			__func__, rc);
> +		return rc;
> +	}
Open-coding checks of tb_invalid is something that no one should be
doing unless they're dealing with the timebase directly. Why do we need
it here?

> +	start_time = mftb();
> +	now = mftb();
> +	wait_time = tb_to_msecs(now - start_time);
> +	timeout_counter = 0;
wait_time is zero.

> +
> +	if (ticks != 0) {
> +		ipmi_hiomap_ticks = ticks;
> +	} else {
> +		ipmi_hiomap_ticks = IPMI_HIOMAP_TICKS;
> +	}
skiboot coding style is to omit the braces on single-line blocks, so
drop them. This could also just be:

if (!ticks)
	ipmi_hiomap_ticks = IPMI_HIOMAP_TICKS;

> +	while (wait_time < ipmi_hiomap_ticks) {
> +		++timeout_counter;
> +		if (timeout_counter % IPMI_SKIP_INC == 0) {
> +			now = mftb();
> +			wait_time = tb_to_msecs(now - start_time);
> +		}

Why do we need to open-code a delay loop rather than using
time_wait_ms() or its non-polling version?
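
e.g. (untested sketch):

uint64_t deadline = mftb() + msecs_to_tb(ipmi_hiomap_ticks);

while (*cc != IPMI_CC_NO_ERROR) {
	if (tb_compare(mftb(), deadline) == TB_AAFTERB)
		break;
	time_wait_ms(1);	/* runs pollers while we wait */
}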

> +		if (*cc == IPMI_CC_NO_ERROR) {
> +			prlog(PR_TRACE, "Break cc ipmi seq=%i "
> +				"ctx->missed_cc_count=%i\n",
> +				*seq, ctx->missed_cc_count);
> +			break;
> +		}
> +	}
> +	if (*cc != IPMI_CC_NO_ERROR) {
> +		lock(&ctx->lock);
> +		ctx->window_state = closed_window;
> +		++ctx->missed_cc_count;
> +		unlock(&ctx->lock);
> +		rc = FLASH_ERR_MISSED_CC;
> +	}
> +
> +	prlog(PR_TRACE, "Stop wait for *cc=%i ipmi seq=%i "
> +		"ctx->missed_cc_count=%i\n",
> +		*cc, *seq, ctx->missed_cc_count);
> +	return rc;
> +
> +}
> +

>  static int hiomap_get_info(struct ipmi_hiomap *ctx)
>  {
> -	RESULT_INIT(res, ctx);
> +	static struct ipmi_hiomap_result info_res;
That should be part of the context structure rather than being a static
local.
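
i.e. something like:

/* libflash/ipmi-hiomap.h */
struct ipmi_hiomap {
	...
	struct ipmi_hiomap_result info_res;
};

and then use ctx->info_res here instead of the function-local static.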

>  	unsigned char req[3];
>  	struct ipmi_msg *msg;
> +	uint8_t info_seq;
> +	int orig_flags;
> +	int tmp_sync_flags;
>  	int rc;
>  
> +	info_res.ctx = ctx;
> +	info_res.cc = -1;
> +
> +	lock(&ctx->lock);
> +	orig_flags = ctx->bl.flags;
> +	/* clear out async to always do sync */
> +	tmp_sync_flags = ctx->bl.flags &= ASYNC_REQUIRED_MASK;
> +	ctx->bl.flags = tmp_sync_flags;
> +	ctx->cc = -1;
> +	info_seq = ++ctx->seq;
> +	ctx->inflight_seq = info_seq;
> +	unlock(&ctx->lock);
> +
>  	/* Negotiate protocol version 2 */
>  	req[0] = HIOMAP_C_GET_INFO;
> -	req[1] = ++ctx->seq;
> +	req[1] = info_seq;
>  	req[2] = HIOMAP_V2;
>  
>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 6);
> +			 ipmi_hiomap_cmd_cb, &info_res, req, sizeof(req), 6);
>  
> -	rc = hiomap_queue_msg_sync(ctx, msg);
> +	rc = hiomap_queue_msg(ctx, msg);
> +	lock(&ctx->lock);
> +	ctx->bl.flags = orig_flags;
> +	unlock(&ctx->lock);
>  	if (rc)
>  		return rc;
>  
> -	if (res.cc != IPMI_CC_NO_ERROR) {
> -		prerror("%s failed: %d\n", __func__, res.cc);
> -		return FLASH_ERR_PARM_ERROR; /* XXX: Find something better? */
> +	rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_ACK_DEFAULT);
> +
> +	if (rc) {
> +		prlog(PR_TRACE, "%s hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
> +			__func__, rc, IPMI_ACK_DEFAULT);
>  	}
>  
> -	return 0;
> +	return rc;
>  }
>  
>  static int hiomap_get_flash_info(struct ipmi_hiomap *ctx)
>  {
> -	RESULT_INIT(res, ctx);
> +	static struct ipmi_hiomap_result flash_info_res;
same

>  	unsigned char req[2];
>  	struct ipmi_msg *msg;
> +	uint8_t flash_info_seq;
> +	int orig_flags;
> +	int tmp_sync_flags;
>  	int rc;
>  
> +	flash_info_res.ctx = ctx;
> +	flash_info_res.cc = -1;
> +
> +	lock(&ctx->lock);
> +	orig_flags = ctx->bl.flags;
> +	/* clear out async to always do sync */
> +	tmp_sync_flags = ctx->bl.flags &= ASYNC_REQUIRED_MASK;
> +	ctx->bl.flags = tmp_sync_flags;
> +	ctx->cc = -1;
> +	flash_info_seq = ++ctx->seq;
> +	ctx->inflight_seq = flash_info_seq;
> +	unlock(&ctx->lock);
> +
>  	req[0] = HIOMAP_C_GET_FLASH_INFO;
> -	req[1] = ++ctx->seq;
> +	req[1] = flash_info_seq;
>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2 + 2 + 2);
> +			 ipmi_hiomap_cmd_cb, &flash_info_res, req, sizeof(req), 2 + 2 + 2);
>  
> -	rc = hiomap_queue_msg_sync(ctx, msg);
> +	rc = hiomap_queue_msg(ctx, msg);
> +	lock(&ctx->lock);
> +	ctx->bl.flags = orig_flags;
> +	unlock(&ctx->lock);
>  	if (rc)
>  		return rc;
>  
> -	if (res.cc != IPMI_CC_NO_ERROR) {
> -		prerror("%s failed: %d\n", __func__, res.cc);
> -		return FLASH_ERR_PARM_ERROR; /* XXX: Find something better? */
> +	rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_ACK_DEFAULT);
> +	if (rc) {
> +		prlog(PR_TRACE, "%s hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
> +			__func__, rc, IPMI_ACK_DEFAULT);
>  	}
>  
> -	return 0;
> +	return rc;
>  }
>  
>  static int hiomap_window_move(struct ipmi_hiomap *ctx, uint8_t command,
> @@ -295,32 +584,65 @@ static int hiomap_window_move(struct ipmi_hiomap *ctx, uint8_t command,
>  {
>  	enum lpc_window_state want_state;
>  	struct hiomap_v2_range *range;
> -	RESULT_INIT(res, ctx);
> +	static struct ipmi_hiomap_result move_res;
same

>  	unsigned char req[6];
>  	struct ipmi_msg *msg;
> +	uint8_t move_seq;
>  	bool valid_state;
>  	bool is_read;
>  	int rc;
>  
> +	move_res.ctx = ctx;
> +	move_res.cc = -1;
>  	is_read = (command == HIOMAP_C_CREATE_READ_WINDOW);
>  	want_state = is_read ? read_window : write_window; 

> +	/* there will be lock contention between hiomap_window_move and move_cb */
If by contention you mean "they take the same lock" then sure, but I
don't see why it matters.

>  	lock(&ctx->lock);
> +	ctx->cc = -1;
> +
> +	if (ctx->bl.flags & IN_PROGRESS) {
> +		pos = ctx->tracking_pos;
> +		len = ctx->tracking_len;
> +	} else {
> +		ctx->tracking_pos = pos;
> +		ctx->tracking_len = len;
> +	}
>  
>  	valid_state = want_state == ctx->window_state;
>  	rc = hiomap_window_valid(ctx, pos, len);
> +
>  	if (valid_state && !rc) {
> +		/* if its valid stuff the proper maybe modified size */
> +		if ((pos + len) > (ctx->current.cur_pos + ctx->current.size)) {
> +			/* if we had bumped the adjusted_window_size down in move_cb */
> +			if ((ctx->current.adjusted_window_size != ctx->current.size)) {
> +				*size = ctx->current.adjusted_window_size;
> +			} else {
> +				*size = (ctx->current.cur_pos + ctx->current.size) - pos;
> +			}
Braces; and as I said above, move the calculation of the "adjusted"
size to here rather than having it in the callback.


> +		} else {
> +			*size = len;
> +		}
> +		ctx->cc = IPMI_CC_NO_ERROR;
>  		unlock(&ctx->lock);
> -		*size = len;
>  		return 0;
>  	}
>  
> -	ctx->window_state = closed_window;
> +	ctx->window_state = moving_window;
>  
> -	unlock(&ctx->lock);
> +	ctx->active_size = size;
> +	ctx->requested_pos = pos;
> +	ctx->requested_len = len;
> +
> +	move_seq = ++ctx->seq;
> +	ctx->inflight_seq = move_seq;
>  
>  	req[0] = command;
> -	req[1] = ++ctx->seq;
> +	req[1] = move_seq;
> +
> +	unlock(&ctx->lock);
>  
>  	range = (struct hiomap_v2_range *)&req[2];
>  	range->offset = cpu_to_le16(bytes_to_blocks(ctx, pos));
> @@ -328,38 +650,14 @@ static int hiomap_window_move(struct ipmi_hiomap *ctx, uint8_t command,
>  
>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req),
> +			 move_cb, &move_res, req, sizeof(req),
>  			 2 + 2 + 2 + 2);
>  
> -	rc = hiomap_queue_msg_sync(ctx, msg);
> +	rc = hiomap_queue_msg(ctx, msg);
> +
>  	if (rc)
>  		return rc;
>  
> -	if (res.cc != IPMI_CC_NO_ERROR) {
> -		prlog(PR_INFO, "%s failed: %d\n", __func__, res.cc);
> -		return FLASH_ERR_PARM_ERROR; /* XXX: Find something better? */
> -	}
> -
> -	lock(&ctx->lock);
> -	*size = len;
> -	/* Is length past the end of the window? */
> -	if ((pos + len) > (ctx->current.cur_pos + ctx->current.size))
> -		/* Adjust size to meet current window */
> -		*size = (ctx->current.cur_pos + ctx->current.size) - pos;
> -
> -	if (len != 0 && *size == 0) {
> -		unlock(&ctx->lock);
> -		prerror("Invalid window properties: len: %"PRIu64", size: %"PRIu64"\n",
> -			len, *size);
> -		return FLASH_ERR_PARM_ERROR;
> -	}
> -
> -	prlog(PR_DEBUG, "Opened %s window from 0x%x for %u bytes at 0x%x\n",
> -	      (command == HIOMAP_C_CREATE_READ_WINDOW) ? "read" : "write",
> -	      ctx->current.cur_pos, ctx->current.size, ctx->current.lpc_addr);
> -
> -	unlock(&ctx->lock);
> -
>  	return 0;
>  }
>  
> @@ -368,21 +666,27 @@ static int hiomap_mark_dirty(struct ipmi_hiomap *ctx, uint64_t offset,
>  {
>  	struct hiomap_v2_range *range;
>  	enum lpc_window_state state;
> -	RESULT_INIT(res, ctx);
> +	static struct ipmi_hiomap_result dirty_res;
>  	unsigned char req[6];
>  	struct ipmi_msg *msg;
> +	uint8_t dirty_seq;
>  	uint32_t pos;
>  	int rc;
>  
> +	dirty_res.ctx = ctx;
> +	dirty_res.cc = -1;
>  	lock(&ctx->lock);
>  	state = ctx->window_state;
> +	dirty_seq = ++ctx->seq;
> +	ctx->inflight_seq = dirty_seq;
> +	ctx->cc = -1;
>  	unlock(&ctx->lock);
>  
>  	if (state != write_window)
>  		return FLASH_ERR_PARM_ERROR;
>  
>  	req[0] = HIOMAP_C_MARK_DIRTY;
> -	req[1] = ++ctx->seq;
> +	req[1] = dirty_seq;
>  
>  	pos = offset - ctx->current.cur_pos;
>  	range = (struct hiomap_v2_range *)&req[2];
> @@ -391,19 +695,15 @@ static int hiomap_mark_dirty(struct ipmi_hiomap *ctx, uint64_t offset,
>  
>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
> +			 ipmi_hiomap_cmd_cb, &dirty_res, req, sizeof(req), 2);
> +
> +	rc = hiomap_queue_msg(ctx, msg);
>  
> -	rc = hiomap_queue_msg_sync(ctx, msg);
>  	if (rc)
>  		return rc;
>  
> -	if (res.cc != IPMI_CC_NO_ERROR) {
> -		prerror("%s failed: %d\n", __func__, res.cc);
> -		return FLASH_ERR_PARM_ERROR;
> -	}
> -
>  	prlog(PR_DEBUG, "Marked flash dirty at 0x%" PRIx64 " for %" PRIu64 "\n",
> -	      offset, size);
> +		offset, size);
>  
>  	return 0;
>  }
> @@ -411,34 +711,36 @@ static int hiomap_mark_dirty(struct ipmi_hiomap *ctx, uint64_t offset,
>  static int hiomap_flush(struct ipmi_hiomap *ctx)
>  {
>  	enum lpc_window_state state;
> -	RESULT_INIT(res, ctx);
> +	static struct ipmi_hiomap_result flush_res;
>  	unsigned char req[2];
>  	struct ipmi_msg *msg;
> +	uint8_t flush_seq;
>  	int rc;
>  
> +	flush_res.ctx = ctx;
> +	flush_res.cc = -1;
>  	lock(&ctx->lock);
>  	state = ctx->window_state;
> +	flush_seq = ++ctx->seq;
> +	ctx->inflight_seq = flush_seq;
> +	ctx->cc = -1;
>  	unlock(&ctx->lock);
>  
>  	if (state != write_window)
>  		return FLASH_ERR_PARM_ERROR;
>  
>  	req[0] = HIOMAP_C_FLUSH;
> -	req[1] = ++ctx->seq;
> +	req[1] = flush_seq;
>  
>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
> +			 ipmi_hiomap_cmd_cb, &flush_res, req, sizeof(req), 2);
> +
> +	rc = hiomap_queue_msg(ctx, msg);
>  
> -	rc = hiomap_queue_msg_sync(ctx, msg);
>  	if (rc)
>  		return rc;
>  
> -	if (res.cc != IPMI_CC_NO_ERROR) {
> -		prerror("%s failed: %d\n", __func__, res.cc);
> -		return FLASH_ERR_PARM_ERROR;
> -	}
> -
>  	prlog(PR_DEBUG, "Flushed writes\n");
>  
>  	return 0;
> @@ -446,26 +748,47 @@ static int hiomap_flush(struct ipmi_hiomap *ctx)
>  
>  static int hiomap_ack(struct ipmi_hiomap *ctx, uint8_t ack)
>  {
> -	RESULT_INIT(res, ctx);
> +	static struct ipmi_hiomap_result ack_res;
here too

>  	unsigned char req[3];
>  	struct ipmi_msg *msg;
> +	uint8_t ack_seq;
> +	int orig_flags;
> +	int tmp_sync_flags;
>  	int rc;
>  
> +	ack_res.ctx = ctx;
> +	ack_res.cc = -1;
> +
> +	lock(&ctx->lock);
> +	orig_flags = ctx->bl.flags;
> +	/* clear out async to always do sync */
> +	tmp_sync_flags = ctx->bl.flags &= ASYNC_REQUIRED_MASK;
> +	ctx->bl.flags = tmp_sync_flags;
> +	ctx->cc = -1;
> +	ack_seq = ++ctx->seq;
> +	ctx->inflight_seq = ack_seq;
> +	unlock(&ctx->lock);
> +
>  	req[0] = HIOMAP_C_ACK;
> -	req[1] = ++ctx->seq;
> +	req[1] = ack_seq;
>  	req[2] = ack;
>  
>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
> +			 ipmi_hiomap_cmd_cb, &ack_res, req, sizeof(req), 2);
>  
> -	rc = hiomap_queue_msg_sync(ctx, msg);
> +	rc = hiomap_queue_msg(ctx, msg);
> +	lock(&ctx->lock);
> +	ctx->bl.flags = orig_flags;
> +	unlock(&ctx->lock);
>  	if (rc)
>  		return rc;
>  
> -	if (res.cc != IPMI_CC_NO_ERROR) {
> -		prlog(PR_DEBUG, "%s failed: %d\n", __func__, res.cc);
> -		return FLASH_ERR_PARM_ERROR;
> +	rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_ACK_DEFAULT);
> +	if (rc) {
> +		prlog(PR_TRACE, "%s hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
> +			__func__, rc, IPMI_ACK_DEFAULT);
> +		return rc;
>  	}
>  
>  	prlog(PR_DEBUG, "Acked events: 0x%x\n", ack);
> @@ -478,21 +801,27 @@ static int hiomap_erase(struct ipmi_hiomap *ctx, uint64_t offset,
>  {
>  	struct hiomap_v2_range *range;
>  	enum lpc_window_state state;
> -	RESULT_INIT(res, ctx);
> +	static struct ipmi_hiomap_result erase_res;
here too
>  	unsigned char req[6];
>  	struct ipmi_msg *msg;
> +	uint8_t erase_seq;
>  	uint32_t pos;
>  	int rc;
>  
> +	erase_res.ctx = ctx;
> +	erase_res.cc = -1;
>  	lock(&ctx->lock);
>  	state = ctx->window_state;
> +	erase_seq = ++ctx->seq;
> +	ctx->inflight_seq = erase_seq;
> +	ctx->cc = -1;
>  	unlock(&ctx->lock);
>  
>  	if (state != write_window)
>  		return FLASH_ERR_PARM_ERROR;
>  
>  	req[0] = HIOMAP_C_ERASE;
> -	req[1] = ++ctx->seq;
> +	req[1] = erase_seq;
>  
>  	pos = offset - ctx->current.cur_pos;
>  	range = (struct hiomap_v2_range *)&req[2];
> @@ -501,16 +830,13 @@ static int hiomap_erase(struct ipmi_hiomap *ctx, uint64_t offset,
>  
>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
> -	rc = hiomap_queue_msg_sync(ctx, msg);
> +			 ipmi_hiomap_cmd_cb, &erase_res, req, sizeof(req), 2);
> +
> +	rc = hiomap_queue_msg(ctx, msg);
> +
>  	if (rc)
>  		return rc;
>  
> -	if (res.cc != IPMI_CC_NO_ERROR) {
> -		prerror("%s failed: %d\n", __func__, res.cc);
> -		return FLASH_ERR_PARM_ERROR;
> -	}
> -
>  	prlog(PR_DEBUG, "Erased flash at 0x%" PRIx64 " for %" PRIu64 "\n",
>  	      offset, size);
>  
> @@ -519,24 +845,53 @@ static int hiomap_erase(struct ipmi_hiomap *ctx, uint64_t offset,
>  
>  static bool hiomap_reset(struct ipmi_hiomap *ctx)
>  {
> -	RESULT_INIT(res, ctx);
> +	static struct ipmi_hiomap_result reset_res;
same

>  	unsigned char req[2];
>  	struct ipmi_msg *msg;
> +	uint8_t reset_seq;
> +	int orig_flags;
> +	int tmp_sync_flags;
> +	int rc;
>  
> -	prlog(PR_NOTICE, "Reset\n");
> +	prlog(PR_NOTICE, "%s Reset ENTRY\n", __func__);
> +	reset_res.ctx = ctx;
> +	reset_res.cc = -1;
> +
> +	lock(&ctx->lock);
> +	orig_flags = ctx->bl.flags;
> +	/* clear out async to always do sync */
> +	tmp_sync_flags = ctx->bl.flags &= ASYNC_REQUIRED_MASK;
> +	ctx->bl.flags = tmp_sync_flags;
> +	reset_seq = ++ctx->seq;
> +	ctx->cc = -1;
> +	ctx->inflight_seq = reset_seq;
> +	unlock(&ctx->lock);
>  
>  	req[0] = HIOMAP_C_RESET;
> -	req[1] = ++ctx->seq;
> +	req[1] = reset_seq;
>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
> -	ipmi_queue_msg_sync(msg);
> +			 ipmi_hiomap_cmd_cb, &reset_res, req, sizeof(req), 2);
> +
> +	rc = hiomap_queue_msg(ctx, msg);
> +	lock(&ctx->lock);
> +	ctx->bl.flags = orig_flags;
> +	unlock(&ctx->lock);
> +
> +	if (rc) {
> +		prlog(PR_NOTICE, "%s reset queue msg failed: rc=%d\n", __func__, rc);
> +		return false;
> +	}
>  
> -	if (res.cc != IPMI_CC_NO_ERROR) {
> -		prlog(PR_ERR, "%s failed: %d\n", __func__, res.cc);
> +	rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_ACK_DEFAULT);
> +
> +	if (rc) {
> +		prlog(PR_NOTICE, "%s hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
> +			__func__, rc, IPMI_ACK_DEFAULT);
>  		return false;
>  	}
>  
> +	prlog(PR_NOTICE, "%s Reset EXIT\n", __func__);
>  	return true;
>  }
>  
> @@ -666,6 +1021,7 @@ static int ipmi_hiomap_handle_events(struct ipmi_hiomap *ctx)
>  	 * Therefore it is enough to mark the window as closed to consider it
>  	 * recovered.
>  	 */
> +
>  	if (status & (HIOMAP_E_PROTOCOL_RESET | HIOMAP_E_WINDOW_RESET))
>  		ctx->window_state = closed_window;
>  
> @@ -737,8 +1093,9 @@ static int ipmi_hiomap_read(struct blocklevel_device *bl, uint64_t pos,
>  			    void *buf, uint64_t len)
>  {
>  	struct ipmi_hiomap *ctx;
> -	uint64_t size;
> -	int rc = 0;
> +	enum lpc_window_state state;
> +	static uint64_t size;
> +	int rc;
>  
>  	/* LPC is only 32bit */
>  	if (pos > UINT_MAX || len > UINT_MAX)
> @@ -746,88 +1103,208 @@ static int ipmi_hiomap_read(struct blocklevel_device *bl, uint64_t pos,
>  
>  	ctx = container_of(bl, struct ipmi_hiomap, bl);
>  
> +	lock(&ctx->transaction_lock);
> +
>  	rc = ipmi_hiomap_handle_events(ctx);
>  	if (rc)
> -		return rc;
> +		goto out;
> +
> +	lock(&ctx->lock);
> +	if (ctx->bl.flags & IN_PROGRESS) {
> +		buf = ctx->tracking_buf;
> +		pos = ctx->tracking_pos;
> +		len = ctx->tracking_len;
> +	} else {
> +		ctx->tracking_buf = buf;
> +		ctx->tracking_pos = 0;
> +		ctx->tracking_len = 0;
> +	}
> +	unlock(&ctx->lock);
>  
>  	prlog(PR_TRACE, "Flash read at %#" PRIx64 " for %#" PRIx64 "\n", pos,
>  	      len);
>  	while (len > 0) {
> -		/* Move window and get a new size to read */
> -		rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_READ_WINDOW, pos,
> -				        len, &size);
> -		if (rc)
> -			return rc;
> -
> -		/* Perform the read for this window */
> -		rc = lpc_window_read(ctx, pos, buf, size);
> -		if (rc)
> -			return rc;
> -
> -		/* Check we can trust what we read */
>  		lock(&ctx->lock);
> -		rc = hiomap_window_valid(ctx, pos, size);
> +		state = ctx->window_state;
>  		unlock(&ctx->lock);
> -		if (rc)
> -			return rc;
> +		if (state != moving_window) {
> +			/* Move window and get a new size to read */
> +			rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_READ_WINDOW, pos,
> +				len, &size);
> +			if (rc) {
> +				prlog(PR_NOTICE, "%s hiomap_window_move failed: rc=%d\n",
> +					__func__, rc);
> +				goto out;
> +			}
> +		} else {
> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_HIOMAP_TICKS_DEFAULT);
> +			if (rc) {
> +				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
> +					__func__, rc, IPMI_HIOMAP_TICKS_DEFAULT);
> +				goto out;
> +			}
> +		}
>  
> -		len -= size;
> -		pos += size;
> -		buf += size;
> +		lock(&ctx->lock);
> +		state = ctx->window_state;
> +		unlock(&ctx->lock);
> +		if (state == read_window) {
> +			/*
> +			 * don't lock in case move_cb in progress
> +			 * if we get here the state is good
> +			 * just double-checking
> +			 */
> +			if (ctx->cc != IPMI_CC_NO_ERROR) {
> +				prlog(PR_NOTICE, "%s failed: cc=%d\n", __func__, ctx->cc);
> +				rc = OPAL_HARDWARE;
> +				goto out;
> +			}
> +			/* Perform the read for this window */
> +			rc = lpc_window_read(ctx, pos, buf, size);
> +			if (rc) {
> +				prlog(PR_NOTICE, "%s lpc_window_read failed: rc=%d\n", __func__, rc);
> +				goto out;
> +			}
> +
> +			/* Check we can trust what we read */
> +			lock(&ctx->lock);
> +			rc = hiomap_window_valid(ctx, pos, size);
> +			unlock(&ctx->lock);
> +			if (rc) {
> +				prlog(PR_NOTICE, "%s hiomap_window_valid failed: rc=%d\n", __func__, rc);
> +				goto out;
> +			}
> +
> +			len -= size;
> +			pos += size;
> +			buf += size;
> +			lock(&ctx->lock);
> +			ctx->tracking_len = len;
> +			ctx->tracking_pos = pos;
> +			ctx->tracking_buf = buf;
> +			unlock(&ctx->lock);
> +		}
>  	}
> -	return rc;
>  
> +out:	unlock(&ctx->transaction_lock);
> +	return rc;
>  }
>  
>  static int ipmi_hiomap_write(struct blocklevel_device *bl, uint64_t pos,
>  			     const void *buf, uint64_t len)
>  {
>  	struct ipmi_hiomap *ctx;
> -	uint64_t size;
> -	int rc = 0;
> +	enum lpc_window_state state;
> +	static uint64_t size;
> +	int rc;
>  
>  	/* LPC is only 32bit */
>  	if (pos > UINT_MAX || len > UINT_MAX)
>  		return FLASH_ERR_PARM_ERROR;
>  
>  	ctx = container_of(bl, struct ipmi_hiomap, bl);
> +	lock(&ctx->transaction_lock);
>  
>  	rc = ipmi_hiomap_handle_events(ctx);
>  	if (rc)
> -		return rc;
> +		goto out;
> +
> +	lock(&ctx->lock);
> +	if (ctx->bl.flags & IN_PROGRESS) {
> +		buf = ctx->tracking_buf;
> +		pos = ctx->tracking_pos;
> +		len = ctx->tracking_len;
> +	} else {
> +		ctx->tracking_buf = (void *) buf;
> +		ctx->tracking_pos = 0;
> +		ctx->tracking_len = 0;
> +	}
> +	unlock(&ctx->lock);
>  
>  	prlog(PR_TRACE, "Flash write at %#" PRIx64 " for %#" PRIx64 "\n", pos,
>  	      len);
>  	while (len > 0) {
> -		/* Move window and get a new size to read */
> -		rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_WRITE_WINDOW, pos,
> -				        len, &size);
> -		if (rc)
> -			return rc;
> -
> -		/* Perform the write for this window */
> -		rc = lpc_window_write(ctx, pos, buf, size);
> -		if (rc)
> -			return rc;
> -
> -		rc = hiomap_mark_dirty(ctx, pos, size);
> -		if (rc)
> -			return rc;
> +		lock(&ctx->lock);
> +		state = ctx->window_state;
> +		unlock(&ctx->lock);
> +		if (state != moving_window) {
> +			/* Move window and get a new size to read */
> +			rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_WRITE_WINDOW, pos,
> +					        len, &size);
> +			if (rc) {
> +				prlog(PR_NOTICE, "%s hiomap_window_move failed: rc=%d\n",
> +					__func__, rc);
> +				goto out;
> +			}
> +		} else {
> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
> +			if (rc) {
> +				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
> +					__func__, rc, IPMI_LONG_TICKS);
> +				goto out;
> +			}
> +		}
>  
> -		/*
> -		 * The BMC *should* flush if the window is implicitly closed,
> -		 * but do an explicit flush here to be sure.
> -		 *
> -		 * XXX: Removing this could improve performance
> -		 */
> -		rc = hiomap_flush(ctx);
> -		if (rc)
> -			return rc;
> +		lock(&ctx->lock);
> +		state = ctx->window_state;
> +		unlock(&ctx->lock);
>  
> -		len -= size;
> -		pos += size;
> -		buf += size;
> +		if (state == write_window) {
> +			if (ctx->cc != IPMI_CC_NO_ERROR) {
> +				prlog(PR_NOTICE, "%s failed: cc=%d\n", __func__, ctx->cc);
> +				rc = OPAL_HARDWARE;
> +				goto out;
> +			}
> +
> +			/* Perform the write for this window */
> +			rc = lpc_window_write(ctx, pos, buf, size);
> +			if (rc) {
> +				prlog(PR_NOTICE, "%s lpc_window_write failed: rc=%d\n", __func__, rc);
> +				goto out;
> +			}
> +
> +			rc = hiomap_mark_dirty(ctx, pos, size);
> +			if (rc) {
> +				prlog(PR_NOTICE, "%s hiomap_mark_dirty failed: rc=%d\n", __func__, rc);
> +				goto out;
> +			}
> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
> +			if (rc) {
> +				prlog(PR_TRACE, "%s dirty hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
> +					__func__, rc, IPMI_LONG_TICKS);
> +				goto out;
> +			}
> +
> +			/*
> +			 * The BMC *should* flush if the window is implicitly closed,
> +			 * but do an explicit flush here to be sure.
> +			 *
> +			 * XXX: Removing this could improve performance
> +			 */
> +			rc = hiomap_flush(ctx);
> +			if (rc) {
> +				prlog(PR_NOTICE, "%s hiomap_flush failed: rc=%d\n", __func__, rc);
> +				goto out;
> +			}
> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
> +			if (rc) {
> +				prlog(PR_TRACE, "%s flush hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
> +					__func__, rc, IPMI_LONG_TICKS);
> +				goto out;
> +			}
> +
> +			len -= size;
> +			pos += size;
> +			buf += size;
> +			lock(&ctx->lock);
> +			ctx->tracking_len = len;
> +			ctx->tracking_pos = pos;
> +			ctx->tracking_buf = (void *) buf;
> +			unlock(&ctx->lock);
> +		}
>  	}
> +
> +out:	unlock(&ctx->transaction_lock);
>  	return rc;
>  }
>  
> @@ -835,6 +1312,8 @@ static int ipmi_hiomap_erase(struct blocklevel_device *bl, uint64_t pos,
>  			     uint64_t len)
>  {
>  	struct ipmi_hiomap *ctx;
> +	enum lpc_window_state state;
> +	static uint64_t size;
>  	int rc;
>  
>  	/* LPC is only 32bit */
> @@ -842,39 +1321,94 @@ static int ipmi_hiomap_erase(struct blocklevel_device *bl, uint64_t pos,
>  		return FLASH_ERR_PARM_ERROR;
>  
>  	ctx = container_of(bl, struct ipmi_hiomap, bl);
> +	lock(&ctx->transaction_lock);
>  
>  	rc = ipmi_hiomap_handle_events(ctx);
>  	if (rc)
> -		return rc;
> +		goto out;
> +
> +	lock(&ctx->lock);
> +	if (ctx->bl.flags & IN_PROGRESS) {
> +		pos = ctx->tracking_pos;
> +		len = ctx->tracking_len;
> +	} else {
> +		ctx->tracking_pos = 0;
> +		ctx->tracking_len = 0;
> +	}
> +	unlock(&ctx->lock);
>  
>  	prlog(PR_TRACE, "Flash erase at 0x%08x for 0x%08x\n", (u32) pos,
>  	      (u32) len);
>  	while (len > 0) {
> -		uint64_t size;
> -
> -		/* Move window and get a new size to erase */
> -		rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_WRITE_WINDOW, pos,
> -				        len, &size);
> -		if (rc)
> -			return rc;
> -
> -		rc = hiomap_erase(ctx, pos, size);
> -		if (rc)
> -			return rc;
> -
> -		/*
> -		 * Flush directly, don't mark that region dirty otherwise it
> -		 * isn't clear if a write happened there or not
> -		 */
> -		rc = hiomap_flush(ctx);
> -		if (rc)
> -			return rc;
> +		lock(&ctx->lock);
> +		state = ctx->window_state;
> +		unlock(&ctx->lock);
> +		if (state != moving_window) {
> +			/* Move window and get a new size to erase */
> +			rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_WRITE_WINDOW, pos,
> +					        len, &size);
> +			if (rc) {
> +				prlog(PR_NOTICE, "%s hiomap_window_move failed: rc=%d\n",
> +					__func__, rc);
> +				goto out;
> +			}
> +		} else {
> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
> +			if (rc) {
> +				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
> +					__func__, rc, IPMI_LONG_TICKS);
> +				goto out;
> +			}
> +		}
>  
> -		len -= size;
> -		pos += size;
> +		lock(&ctx->lock);
> +		state = ctx->window_state;
> +		unlock(&ctx->lock);
> +		if (state == write_window) {
> +			if (ctx->cc != IPMI_CC_NO_ERROR) {
> +				prlog(PR_NOTICE, "%s failed: cc=%d\n", __func__, ctx->cc);
> +				rc = OPAL_HARDWARE;
> +				goto out;
> +			}
> +			rc = hiomap_erase(ctx, pos, size);
> +			if (rc) {
> +				prlog(PR_NOTICE, "%s hiomap_erase failed: rc=%d\n", __func__, rc);
> +				goto out;
> +			}
> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
> +			if (rc) {
> +				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
> +					__func__, rc, IPMI_LONG_TICKS);
> +				goto out;
> +			}
> +
> +			/*
> +			 * Flush directly, don't mark that region dirty otherwise it
> +			 * isn't clear if a write happened there or not
> +			 */
> +			rc = hiomap_flush(ctx);
> +			if (rc) {
> +				prlog(PR_NOTICE, "%s hiomap_flush failed: rc=%d\n", __func__, rc);
> +				goto out;
> +			}
> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
> +			if (rc) {
> +				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
> +					__func__, rc, IPMI_LONG_TICKS);
> +				goto out;
> +			}
> +
> +			len -= size;
> +			pos += size;
> +			lock(&ctx->lock);
> +			ctx->tracking_len = len;
> +			ctx->tracking_pos = pos;
> +			unlock(&ctx->lock);
> +		}
>  	}
>  
> -	return 0;
> +out:	unlock(&ctx->transaction_lock);
> +	return rc;
>  }
>  
>  static int ipmi_hiomap_get_flash_info(struct blocklevel_device *bl,
> @@ -885,6 +1419,7 @@ static int ipmi_hiomap_get_flash_info(struct blocklevel_device *bl,
>  	int rc;
>  
>  	ctx = container_of(bl, struct ipmi_hiomap, bl);
> +	lock(&ctx->transaction_lock);
>  
>  	rc = ipmi_hiomap_handle_events(ctx);
>  	if (rc)
> @@ -903,6 +1438,7 @@ static int ipmi_hiomap_get_flash_info(struct blocklevel_device *bl,
>  	if (erase_granule)
>  		*erase_granule = ctx->erase_granule;
>  
> +	unlock(&ctx->transaction_lock);
What situations is the transaction lock required to protect against?
I'm willing to believe it's necessary, but I'd like to know why, since we
don't have one currently. Does the flash_lock handle everything for us
at the moment?

>  	return 0;
>  }
>  
> @@ -925,6 +1461,7 @@ int ipmi_hiomap_init(struct blocklevel_device **bl)
>  		return FLASH_ERR_MALLOC_FAILED;
>  
>  	init_lock(&ctx->lock);
> +	init_lock(&ctx->transaction_lock);
>  
>  	ctx->bl.read = &ipmi_hiomap_read;
>  	ctx->bl.write = &ipmi_hiomap_write;
> diff --git a/libflash/ipmi-hiomap.h b/libflash/ipmi-hiomap.h
> index 489d55e..4870fc5 100644
> --- a/libflash/ipmi-hiomap.h
> +++ b/libflash/ipmi-hiomap.h
> @@ -10,12 +10,36 @@
>  
>  #include "blocklevel.h"
>  
> -enum lpc_window_state { closed_window, read_window, write_window };
> +/*
> + * we basically check for a quick response
> + * otherwise we catch the updated window in the next cycle
> + */
> +#define IPMI_HIOMAP_TICKS 5

I did some testing on a witherspoon to see how long a
hiomap_send_msg_sync() usually takes; the fastest times were:

	count   wait time (ms)
	1 	6
	18	7
	52	8
	24	9
	19	10
	4	11
	8	12
	<long tail of larger waits>

So, is 5ms really enough to catch the common response times? It seems a
bit low to me.

> +#define IPMI_HIOMAP_TICKS_DEFAULT 0
> +
> +/* time to wait for write/erase/dirty ops */
> +#define IPMI_LONG_TICKS 500
> +
> +/*
> + * default for ack'ing typically 1-10 wait_time's
> + * allow upper bounds because if we cannot ack
> + * we make no forward progress post protocol reset
> + * async paths will retry
> + * sync paths always hit with zero wait_time elapsed
> + * with ASYNC_REQUIRED_MASK'd out, this is not used
> + */
> +#define IPMI_ACK_DEFAULT 500
> +
> +/* increment to skip the waiting loop */
> +#define IPMI_SKIP_INC 2
> +
> +enum lpc_window_state { closed_window, read_window, write_window, moving_window };
>  
>  struct lpc_window {
>  	uint32_t lpc_addr; /* Offset into LPC space */
>  	uint32_t cur_pos;  /* Current position of the window in the flash */
>  	uint32_t size;     /* Size of the window into the flash */
> +	uint32_t adjusted_window_size; /* store adjusted window size */
>  };
>  
>  struct ipmi_hiomap {
> @@ -35,6 +59,21 @@ struct ipmi_hiomap {
>  	 * three variables are protected by lock to avoid conflict.
>  	 */
>  	struct lock lock;
> +	struct lock transaction_lock;
> +
> +	/* callers transaction info */
> +	uint64_t *active_size;
> +	uint64_t requested_len;
> +	uint64_t requested_pos;
> +	uint64_t tracking_len;
> +	uint64_t tracking_pos;
> +	void *tracking_buf;
> +
> +	int missed_cc_count;
> +	int cc;
> +	/* inflight_seq used to aid debug  */
> +	/* with other OPAL ipmi msg's      */
> +	uint8_t inflight_seq;
>  	uint8_t bmc_state;
>  	enum lpc_window_state window_state;
>  };
Deb McLemore Dec. 13, 2019, 12:32 p.m. UTC | #4
Comments below:

On 11/24/19 9:12 PM, Oliver O'Halloran wrote:
> On Thu, Nov 7, 2019 at 4:25 AM Deb McLemore <debmc@linux.ibm.com> wrote:
>> To provide higher layer async operations to access
>> the target flash, enable hiomap to perform async
>> ipmi messaging for call paths thru opal_flash_op.
>>
>> Special considerations and constraints are to prevent
>> recursive locking and/or polling inside OPAL calls.
>>
>> Objective is to remove the hiomap_queue_msg_sync for
>> moving windows (which already uses pollers) to allow
>> async requests to perform their functions.
>>
>> Call paths thru opal_flash_op will determine if the
>> requested operation needs to be re-queued, to allow
>> skiboot to jump back out to Linux to prevent RCU or
>> scheduler issues.
>>
>> For example, if a flash window move operation
>> is needed to erase a very large flash segment,
>> the time spent in opal firmware would be long
>> enough that Linux would stall.  To avoid the
>> long running duration of various aspects of this
>> inherent long running operation we divide up
>> the tasks in smaller chunks to avoid cumulating
>> time spent in opal firmware.  The return codes
>> are used to distinguish differences between cases
>> where a re-queue of the transaction would be
>> needed versus a true failure return code.
>>
>> PR_TRACE used since PR_DEBUG seems to always trigger,
>> unable to figure out why.
>>
>> Due to the nature of redefining the sync versus
>> async capabilities, this patch is larger than
>> desired due to interrelationships of changes
>> to functionality.
>>
>> Signed-off-by: Deb McLemore <debmc@linux.ibm.com>
>> ---
>>  core/flash.c           | 154 +++++++-
>>  libflash/blocklevel.h  |   5 +
>>  libflash/errors.h      |   1 +
>>  libflash/ipmi-hiomap.c | 955 ++++++++++++++++++++++++++++++++++++++-----------
>>  libflash/ipmi-hiomap.h |  41 ++-
>>  5 files changed, 936 insertions(+), 220 deletions(-)
>>
>> diff --git a/core/flash.c b/core/flash.c
>> index e98c8e0..98614f7 100644
>> --- a/core/flash.c
>> +++ b/core/flash.c
>> @@ -28,6 +28,14 @@
>>  #include <timer.h>
>>  #include <timebase.h>
>>
>> +/*
>> + * need to keep this under the BT queue limit
>> + * causes are when ipmi to bmc gets bogged down
>> + * delay allows for congestion to clear
>> + */
>> +#define FLASH_RETRY_LIMIT 10
>> +#define FLASH_SCHEDULE_DELAY 200
>> +
>>  enum flash_op {
>>         FLASH_OP_READ,
>>         FLASH_OP_WRITE,
>> @@ -41,6 +49,9 @@ struct flash_async_info {
>>         uint64_t pos;
>>         uint64_t len;
>>         uint64_t buf;
>> +       int retry_counter;
>> +       int transaction_id;
>> +       int in_progress_schedule_delay;
>>  };
>>
>>  struct flash {
>> @@ -80,13 +91,63 @@ static u32 nvram_offset, nvram_size;
>>  static bool flash_reserve(struct flash *flash)
>>  {
>>         bool rc = false;
>> +       int lock_try_counter = 10;
>> +       uint64_t now;
>> +       uint64_t start_time;
>> +       uint64_t wait_time;
>> +       uint64_t flash_reserve_ticks = 10;
>> +       uint64_t timeout_counter;
>> +
>> +       start_time = mftb();
>> +       now = mftb();
>> +       wait_time = tb_to_msecs(now - start_time);


From libflash/ipmi-hiomap.h

/*
 * we basically check for a quick response
 * otherwise we catch the updated window in the next cycle
 */
#define IPMI_HIOMAP_TICKS 5

Unless the window is immediately ready (so we have to stall doing some
logic which is not optimized out), this allows a short period of time
for a quick response; otherwise we always kick back out to Linux, and
that delay gives the window time to be moved.
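
As a rough sketch of the intent (hiomap_check_cc() is a hypothetical
name here; mftb()/tb_to_msecs() are skiboot's timebase helpers):

    /* sketch only: poll briefly for the completion code; if the BMC
     * answers within the budget we continue inline, otherwise we
     * return and catch the updated window on the next cycle */
    static int hiomap_check_cc(struct ipmi_hiomap *ctx)
    {
            uint64_t start = mftb();

            while (tb_to_msecs(mftb() - start) < IPMI_HIOMAP_TICKS) {
                    if (ctx->cc == IPMI_CC_NO_ERROR)
                            return 0;           /* quick response */
            }
            return FLASH_ERR_MISSED_CC;         /* re-queue later */
    }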

> wait_time is always going to be zero since the two mftb() calls are
> going to return pretty similar values and tb_to_msecs() divides the
> difference by 512000 (tb_hz / 1000).
>
>> +       timeout_counter = 0;
>> +
>> +
>> +       while (wait_time < flash_reserve_ticks) {
>> +               ++timeout_counter;
>> +               if (timeout_counter % 4 == 0) {
>> +                       now = mftb();
>> +                       wait_time = tb_to_msecs(now - start_time);
>> +               }
>> +               if (flash->busy == false) {
>> +                       break;
>> +               }
>> +       }

Again, due to compiler-optimization issues we want to stall and
actually check the value a fixed number of times, allowing a miss or
two on the lock to happen; other formulations were optimized away.
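
For comparison, the for-loop shape suggested below would look
something like this (LOCK_TRY_LIMIT is a made-up name for the count):

    /* sketch: bounded try_lock() attempts, count kept at the loop */
    for (i = 0; i < LOCK_TRY_LIMIT; i++) {
            if (try_lock(&flash_lock))
                    break;
    }
    if (i == LOCK_TRY_LIMIT)
            return false;           /* never took the lock */

    if (!flash->busy) {
            flash->busy = true;
            rc = true;
    }
    unlock(&flash_lock);
    return rc;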

> This doesn't make any sense to me. What are you doing here and why?
>
>> -       if (!try_lock(&flash_lock))
>> +       while (!try_lock(&flash_lock)) {
>> +               --lock_try_counter;
>> +               if (lock_try_counter == 0) {
>> +                       break;
>> +               }
>> +       }
> If you're going to do this sort of thing then use a for loop so the
> iteration count is set in the same place it's being used.
>
>> +       if (lock_try_counter == 0) {
>>                 return false;
>> +       }
> you should return from inside the loop instead of breaking out and
> re-checking the break condition
>
>> +
>> +       /* we have the lock if we got here */
>>
>>         if (!flash->busy) {
>>                 flash->busy = true;
>>                 rc = true;
>> +       } else {
>> +               /* probably beat a flash_release and grabbed the lock */
>> +               unlock(&flash_lock);
>> +               while (!try_lock(&flash_lock)) {
>> +                       --lock_try_counter;
>> +                       if (lock_try_counter == 0) {
>> +                               break;
>> +                       }
>> +               }
>> +               if (lock_try_counter == 0) {
>> +                       return false;
>> +               }
>> +               /* we have the lock if we are here */
>> +               if (!flash->busy) {
>> +                       flash->busy = true;
>> +                       rc = true;
>> +               }
> This doesn't make much sense to me either. Why is replicating all the
> previous logic necessary?
>
>>         }
>>         unlock(&flash_lock);
> I'm not sure that we even need to use try_lock() with patch 03/13
> applied. With that patch the flash's lock should only be held briefly
> so we should be able to use the normal (blocking) lock() since we
> don't need to deal with waiting for a 64MB flash read to complete.
>
>> @@ -279,12 +340,31 @@ static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unus
>>                 assert(0);
>>         }
>>
>> -       if (rc)
>> -               rc = OPAL_HARDWARE;
>> +       if (rc == 0) {
>> +               /*
>> +                * if we are good to proceed forward
>> +                * otherwise we may have to try again
>> +                */
>> +               flash->async.pos += len;
>> +               flash->async.buf += len;
>> +               flash->async.len -= len;
>> +               /* if we good clear */
>> +               flash->async.retry_counter = 0;
>> +               /*
>> +                * clear the IN_PROGRESS flags
>> +                * we only need IN_PROGRESS active on missed_cc
>> +                */
>> +               flash->bl->flags &= IN_PROGRESS_MASK;
>> +               /* reset time for scheduling gap */
>> +               flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;

The async flash layer is the one that is managing the work; the block
layer is a worker which does not know how the work is being managed.
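
Concretely, the per-transaction bookkeeping in flash_poll() is a retry
counter plus an escalating delay, roughly (condensed from the patch
above):

    /* sketch: back off a little more on each missed completion and
     * give up after FLASH_RETRY_LIMIT attempts */
    if (rc == FLASH_ERR_MISSED_CC) {
            flash->async.retry_counter++;
            flash->async.in_progress_schedule_delay += FLASH_SCHEDULE_DELAY;
            if (flash->async.retry_counter >= FLASH_RETRY_LIMIT) {
                    rc = OPAL_HARDWARE;   /* reported via opal_queue_msg() */
            } else {
                    flash->bl->flags |= IN_PROGRESS;
                    schedule_timer(&flash->async.poller,
                            msecs_to_tb(flash->async.in_progress_schedule_delay));
                    return;
            }
    }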

> The default delay should come from the blocklevel rather than being
> assumed by the user of libflash. The bl has a better idea about what
> sort of interval is required between ops and what a reasonable timeout
> is.
>
>> +       }
>> +
>> +       /*
>> +        * corner cases if the move window misses and
>> +        * the requested window is split (needing adjustment down) problem
>> +        * if timing good on move_cb the world is good
>> +        */

Problem cases are when the async layer has split the work up into
chunks.  If a chunk desired to be read straddles the window (meaning
that only a subset of the chunk is available in the current window) we
have to bump the async chunk down so that the read will succeed.  Then
we can acquire a new window from which we can do the subsequent reads
(again at the async flash layer).
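
In code terms the bump-down is just clamping the chunk against the
window edge, e.g. (sketch using the same fields as the patch):

    /* sketch: trim the requested chunk to what the current window
     * actually maps; the remainder is read after the next window move */
    uint64_t win_end = ctx->current.cur_pos + ctx->current.size;

    if (pos + len > win_end)
            len = win_end - pos;    /* bump the async chunk down */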

> I can't parse this, but it seems like it's all blocklevel specific
> stuff that should be documented there. From the perspective of a
> libflash user we don't want to know or care about *why* we need to do
> a async delay, just that we have to.
>
>> -       flash->async.pos += len;
>> -       flash->async.buf += len;
>> -       flash->async.len -= len;
>>         if (!rc && flash->async.len) {
>>                 /*
>>                  * We want to get called pretty much straight away, just have
>> @@ -293,10 +373,38 @@ static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unus
>>                  */
>>                 schedule_timer(&flash->async.poller, 0);
>>                 return;
>> +       } else {
>> +               if (rc == FLASH_ERR_MISSED_CC) {
Yes will update
> Try to avoid stacking indentation levels. It causes us to run into the
> 80col limit even faster and tracing through nested if chains gets
> tedious pretty quickly.
>
> You can avoid it using linear if() ... else if() ... else chains, e.g.
>
> if (rc == FLASH_ERR_MISSED_CC) {
>    /* new stuff goes here */
> } else if (rc) {
>    /* old stuff goes here */
> }
>
>> +                       ++flash->async.retry_counter;
>> +                       flash->async.in_progress_schedule_delay += FLASH_SCHEDULE_DELAY;
>> +                       if (flash->async.retry_counter >= FLASH_RETRY_LIMIT) {
>> +                               rc = OPAL_HARDWARE;
>> +                               prlog(PR_TRACE, "flash_poll PROBLEM FLASH_RETRY_LIMIT of %i reached on transaction_id=%i\n",
>> +                                       FLASH_RETRY_LIMIT,
>> +                                       flash->async.transaction_id);
>> +                       } else {
>> +                               /*
>> +                                * give the BT time to work and receive response
>> +                                * throttle back to allow for congestion to clear
>> +                                * cases observed were complete lack of ipmi response until very late
>> +                                * or cases immediately following an unaligned window read/move (so slow)
>> +                                */
>> +                               flash->bl->flags |= IN_PROGRESS;
>> +                               schedule_timer(&flash->async.poller, msecs_to_tb(flash->async.in_progress_schedule_delay));
>> +                               return;
>> +                       }
>> +               } else {
>> +                       if (rc != 0) {
>> +                               rc = OPAL_HARDWARE;
>> +                       }
>> +               }
>>         }
>>
>> -       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
>> +       flash->bl->flags &= IN_PROGRESS_MASK;
>> +       flash->bl->flags &= ASYNC_REQUIRED_MASK;
Yes will update
> Masks are for defining bit fields not the inverse of a specific bit.
> Use the normal pattern for clearing a flag:
>      flash->bl->flags &= ~IN_PROGRESS;
>      flash->bl->flags &= ~ASYNC_REQUIRED;
>
>> +       /* release the flash before we allow next opal entry */
>>         flash_release(flash);
>> +       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
>>  }
>>
>>  static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
>> @@ -454,6 +562,7 @@ int flash_register(struct blocklevel_device *bl)
>>         flash->size = size;
>>         flash->block_size = block_size;
>>         flash->id = num_flashes();
>> +       flash->async.transaction_id = 0;
>>         init_timer(&flash->async.poller, flash_poll, flash);
>>
>>         rc = ffs_init(0, flash->size, bl, &ffs, 1);
>> @@ -487,7 +596,7 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>>                 uint64_t buf, uint64_t size, uint64_t token)
>>  {
>>         struct flash *flash = NULL;
>> -       uint64_t len;
>> +       uint64_t len = 0;
Yes
> Is that needed?
>
>>         int rc;
>>
>>         list_for_each(&flashes, flash, list)
>> @@ -516,6 +625,10 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>>         flash->async.buf = buf + len;
>>         flash->async.len = size - len;
>>         flash->async.pos = offset + len;
>> +       flash->async.retry_counter = 0;
>> +       flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
>> +       flash->bl->flags |= ASYNC_REQUIRED;
>> +       ++flash->async.transaction_id;
This is needed.
> Use post-increment unless you absolutely have to use pre-increment.
> There's only a few instances where pre-increment is used in skiboot
> and I'd like to keep it that way.
>
>>         /*
>>          * These ops intentionally have no smarts (ecc correction or erase
>> @@ -539,8 +652,27 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>>         }
>>
>>         if (rc) {
>> -               prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
>> -               rc = OPAL_HARDWARE;
>> +               if (rc == FLASH_ERR_MISSED_CC) {
Yes
> same comment about stacking indents
>
>> +                       ++flash->async.retry_counter;
>> +                       flash->async.buf = buf;
>> +                       flash->async.len = size;
>> +                       flash->async.pos = offset;
>> +                       /* for completeness, opal_flash_op is first time pass so unless the retry limit set to 1 */
>> +                       if (flash->async.retry_counter >= FLASH_RETRY_LIMIT) {
>> +                               rc = OPAL_HARDWARE;
>> +                               prlog(PR_TRACE, "opal_flash_op PROBLEM FLASH_RETRY_LIMIT of %i reached on transaction_id=%i\n",
>> +                                       FLASH_RETRY_LIMIT,
>> +                                       flash->async.transaction_id);
>> +                               goto out;
>> +                       }
>> +                       flash->bl->flags |= IN_PROGRESS;
>> +                       schedule_timer(&flash->async.poller, msecs_to_tb(FLASH_SCHEDULE_DELAY));
>> +                       /* Don't release the flash */
Yes will update
> explain why
>
>> +                       return OPAL_ASYNC_COMPLETION;
>> +               } else {
>> +                       prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
>> +                       rc = OPAL_HARDWARE;
>> +               }
>>                 goto out;
>>         }
>>
>> @@ -564,6 +696,8 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>>         rc = OPAL_ASYNC_COMPLETION;
>>  out:
>>         flash_release(flash);
>> +       flash->bl->flags &= IN_PROGRESS_MASK;
>> +       flash->bl->flags &= ASYNC_REQUIRED_MASK;
>>         return rc;
>>  }
>>
>> diff --git a/libflash/blocklevel.h b/libflash/blocklevel.h
>> index 492918e..63d8690 100644
>> --- a/libflash/blocklevel.h
>> +++ b/libflash/blocklevel.h
>> @@ -18,8 +18,13 @@ struct blocklevel_range {
>>         int total_prot;
>>  };
>>
>> +#define ASYNC_REQUIRED_MASK 0xFFFB
>> +#define IN_PROGRESS_MASK 0xFFF7
Yes
> these can go.
>
>> +
>>  enum blocklevel_flags {
>>         WRITE_NEED_ERASE = 1,
>> +       ASYNC_REQUIRED = 4,
>> +       IN_PROGRESS = 8,
>>  };
>>
>>  /*
>> diff --git a/libflash/errors.h b/libflash/errors.h
>> index c800ada..c24166d 100644
>> --- a/libflash/errors.h
>> +++ b/libflash/errors.h
>> @@ -21,6 +21,7 @@
>>  #define FLASH_ERR_BAD_READ             15
>>  #define FLASH_ERR_DEVICE_GONE  16
>>  #define FLASH_ERR_AGAIN        17
>> +#define FLASH_ERR_MISSED_CC    18
Yes will update
> The libflash return codes are supposed to be independent of the
> backing blocklevel. As far as I can tell this is being used as a
> generic "async continuation required" return code, similar to how we
> use OPAL_ASYNC_COMPLETION, so rename it something that reflects that.
>
>>  #ifdef __SKIBOOT__
>>  #include <skiboot.h>
> *snip*
>
> I'll send comments about the HIOMAP bits in a separate mail since this
> is getting unwieldy.
Deb McLemore Dec. 13, 2019, 1:01 p.m. UTC | #5
Comments below:

On 11/25/19 3:10 AM, Oliver O'Halloran wrote:
> On Wed, 2019-11-06 at 11:22 -0600, Deb McLemore wrote:
>> To provide higher layer async operations to access
>> the target flash, enable hiomap to perform async
>> ipmi messaging for call paths thru opal_flash_op.
>>
>> Special considerations and constraints are to prevent
>> recursive locking and/or polling inside OPAL calls.
>>
>> Objective is to remove the hiomap_queue_msg_sync for
>> moving windows (which already uses pollers) to allow
>> async requests to perform their functions.
>>
>> Call paths thru opal_flash_op will determine if the
>> requested operation needs to be re-queued, to allow
>> skiboot to jump back out to Linux to prevent RCU or
>> scheduler issues.
>>
>> For example, if a flash window move operation
>> is needed to erase a very large flash segment,
>> the time spent in opal firmware would be long
>> enough that Linux would stall.  To avoid the
>> long running duration of various aspects of this
>> inherent long running operation we divide up
>> the tasks in smaller chunks to avoid cumulating
>> time spent in opal firmware.  The return codes
>> are used to distinguish differences between cases
>> where a re-queue of the transaction would be
>> needed versus a true failure return code.
>
>> PR_TRACE used since PR_DEBUG seems to always trigger,
>> unable to figure out why.
> https://lists.ozlabs.org/pipermail/skiboot/2019-October/015645.html
>
>> Due to the nature of redefining the sync versus
>> async capabilities, this patch is larger than
>> desired due to interrelationships of changes
>> to functionality.

During the evolution of the design, code and testing, your suggestions
were pursued and the iterations evolved.

The design point which took highest priority in factoring the code was
consolidating the runtime checks (like "are we booting") in a single
location, moving the other checks to the various other functions.
This design decision also made it clear to future maintainers that
consideration needs to be made for sync versus async.

One main design point which is not intuitively obvious (I'll add some
comments) is that it's necessary to know ASYNC_REQUIRED across the
layers, and the only way to communicate this is via the blocklevel
flags (unless we wanted to change the interfaces, which is not
desired).

When the boot is in progress and the transition is made where the
kernel immediately comes in with an async flash request, we must
differentiate the call so that the appropriate move-window semantics
occur.  Various other alternative methods were pursued and the chosen
design is the cleanest.

When the functionality is implemented it's necessary to have the
appropriate debug/trace instrumentation in place; otherwise the design
decision lacks some nuance if in the future someone were to bisect
this patch and find that it's not functional.  Therefore it was deemed
best to leave it as a whole.

Will update comments to better describe this in appropriate places.
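
In other words, the cross-layer contract is carried entirely in
bl->flags; schematically (bits as defined in the patch):

    /* sketch: the only channel between core/flash.c and libflash */
    flash->bl->flags |= ASYNC_REQUIRED;   /* opal_flash_op: op starts */
    flash->bl->flags |= IN_PROGRESS;      /* flash_poll: chunk re-queued */
    flash->bl->flags &= ~(ASYNC_REQUIRED | IN_PROGRESS);  /* op complete */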

> Yeah, nah.
>
> The usual pattern for sync->async conversions is to re-implement the
> sync parts using the async implementation internally and by having the
> "sync" functions wait on a flag variable internally. Doing that then
> allows you to expand the async bits gradually into the callers. See
> i2c_request_sync() for an example.
>
> Even after this patch there's a lot of HIOMAP commands that are still
> required to be sync, so what I'd do is:
>
> 1. Keep hiomap_queue_msg_sync() around and continue using it for
> commands that need to be sync rather than faking it. That'll remove a
> pile of churn to begin with.
>
> 2. Split the transaction lock() changes into a separate patch,
> preferably with a commit message explaining why it's needed.
>
> 3. Re-implement the window movement using the new async methods. Keep
> the external interface the same.
>
> 4. Now convert flash.c to use the new async interface. This might
> require moving around where changes occur so that cyril's async-lite
> patch comes after the changes above.
>
> 5. Move some of the debug prints out of this patch and into a follow
> up. Pruning them while reviewing them took out ~200 lines so it'll
> help.
>
> Doing that should let you get this down to a few hundred lines at most.
> That's still too large, but whatever, at least it's manageable.
>
> I'm not asking you to do this for the sake of being a pain. I spent
> all day trying to unpack what this patch actually does and that only
> took all day because it's over a thousand lines of random stuff
> happening for random reasons. We don't ask for large changes to be
> split into smaller ones because we demand the platonic ideal of a
> patch. We do it so that it's possible to review them without sending
> the reviewer insane.
>
>> Signed-off-by: Deb McLemore <debmc@linux.ibm.com>
>> ---
>>  core/flash.c           | 154 +++++++-
>>  libflash/blocklevel.h  |   5 +
>>  libflash/errors.h      |   1 +
>>  libflash/ipmi-hiomap.c | 955 ++++++++++++++++++++++++++++++++++++++-----------
>>  libflash/ipmi-hiomap.h |  41 ++-
>>  5 files changed, 936 insertions(+), 220 deletions(-)
> flash changes are in the other mail
>
>> diff --git a/libflash/ipmi-hiomap.c b/libflash/ipmi-hiomap.c
>> index 7327b83..08af005 100644
>> --- a/libflash/ipmi-hiomap.c
>> +++ b/libflash/ipmi-hiomap.c
>> @@ -11,6 +11,10 @@
>>  #include <stdbool.h>
>>  #include <stdint.h>
>>  #include <string.h>
>> +#include <lock.h>
>> +#include <debug_descriptor.h>
>> +#include <timebase.h>
>> +#include <cpu.h>
>>  
>>  #include <ccan/container_of/container_of.h>
>>  
>> @@ -24,7 +28,7 @@ struct ipmi_hiomap_result {
>>  	int16_t cc;
>>  };
>>  
>> -#define RESULT_INIT(_name, _ctx) struct ipmi_hiomap_result _name = { _ctx, -1 }
>> +static struct hiomap_v2_create_window *window_parms;
>>  
>>  static inline uint32_t blocks_to_bytes(struct ipmi_hiomap *ctx, uint16_t blocks)
>>  {
>> @@ -62,9 +66,20 @@ static int hiomap_protocol_ready(struct ipmi_hiomap *ctx)
>>  	return 0;
>>  }
>>  
>> -static int hiomap_queue_msg_sync(struct ipmi_hiomap *ctx, struct ipmi_msg *msg)
>> +static int hiomap_queue_msg(struct ipmi_hiomap *ctx, struct ipmi_msg *msg)
>>  {
>>  	int rc;
>> +	int bl_flags;
>> +
>> +	lock(&ctx->lock);
>> +	bl_flags = ctx->bl.flags;
>> +	unlock(&ctx->lock);
>> +
>> +	/*
>> +	 * during boot caution to stay duration within skiboot/
>> +	 * no exit re-entry due to poller conflicts with synchronous window moves
>> +	 * asynchronous usage intended for opal_flash_op and flash_poll paths
>> +	 */
> doesn't parse
>
>
>>  	/*
>>  	 * There's an unavoidable TOCTOU race here with the BMC sending an
>> @@ -73,17 +88,23 @@ static int hiomap_queue_msg_sync(struct ipmi_hiomap *ctx, struct ipmi_msg *msg)
>>  	 * hiomap_queue_msg_sync() exists to capture the race in a single
>>  	 * location.
>>  	 */
>> -	lock(&ctx->lock);
>> -	rc = hiomap_protocol_ready(ctx);
>> -	unlock(&ctx->lock);
>> -	if (rc) {
>> -		ipmi_free_msg(msg);
>> -		return rc;
>> -	}
>>  
>> -	ipmi_queue_msg_sync(msg);
>> +	if ((opal_booting()) || (!(bl_flags & ASYNC_REQUIRED))) {
>> +		lock(&ctx->lock);
>> +		rc = hiomap_protocol_ready(ctx);
>> +		unlock(&ctx->lock);
>> +		if (rc) {
>> +			ipmi_free_msg(msg);
>> +			return rc;
>> +		}
>> +		prlog(PR_TRACE, "%s SENDING SYNC\n", __func__);
>> +		ipmi_queue_msg_sync(msg);
>> +	} else {
>> +		prlog(PR_TRACE, "%s SENDING ASYNC\n", __func__);
>> +		rc = ipmi_queue_msg(msg);
Yes, during a protocol reset we have some interesting corner cases.
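
One way to cover the async path as well would be to hoist the
readiness check ahead of the sync/async branch, along these lines
(sketch built from the patch's own pieces):

    /* sketch: check protocol readiness once, whichever queueing path
     * is taken afterwards */
    lock(&ctx->lock);
    rc = hiomap_protocol_ready(ctx);
    unlock(&ctx->lock);
    if (rc) {
            ipmi_free_msg(msg);
            return rc;
    }

    if (opal_booting() || !(bl_flags & ASYNC_REQUIRED))
            ipmi_queue_msg_sync(msg);
    else
            rc = ipmi_queue_msg(msg);
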
> Is the hiomap_protocol_ready() check required in the async branch too?
>
>> +	}
>>  
>> -	return 0;
>> +	return rc;
>>  }
>>  
>>  /* Call under ctx->lock */
>> @@ -100,12 +121,178 @@ static int hiomap_window_valid(struct ipmi_hiomap *ctx, uint64_t pos,
>>  		return FLASH_ERR_PARM_ERROR;
>>  	if (pos < ctx->current.cur_pos)
>>  		return FLASH_ERR_PARM_ERROR;
>> -	if ((pos + len) > (ctx->current.cur_pos + ctx->current.size))
>> -		return FLASH_ERR_PARM_ERROR;
>> +	if ((pos + len) > (ctx->current.cur_pos + ctx->current.size)) {
>> +		/* we will compensate the proper values in caller */

See the other patch comments on when the async chunk straddles the
current window sizes, etc.

> compensate?
>
>> +		if ((pos + ctx->current.size) <= (ctx->current.cur_pos + ctx->current.size)) {
>> +			prlog(PR_TRACE, "%s OK pos=%llu "
>> +				"ctx->current.size=0x%x "
>> +				"ctx->current.cur_pos=0x%x\n",
>> +				__func__,
>> +				pos,
>> +				ctx->current.size,
>> +				ctx->current.cur_pos);
>> +		} else {
>> +			prlog(PR_TRACE, "%s CHECKING further pos=%llu "
>> +				"for len=%llu ctx->current.size=0x%x "
>> +				"ctx->current.cur_pos=0x%x\n",
>> +				__func__,
>> +				pos,
>> +				len,
>> +				ctx->current.size,
>> +				ctx->current.cur_pos);
>> +			if ((pos + ctx->current.adjusted_window_size) <= (ctx->current.cur_pos + ctx->current.size)) {
>> +				prlog(PR_TRACE, "%s OK use ADJUSTED pos=%llu "
>> +					"adjusted_len=%i for len=%llu "
>> +					"ctx->current.size=0x%x "
>> +					"ctx->current.cur_pos=0x%x\n",
>> +					__func__,
>> +					pos,
>> +					ctx->current.adjusted_window_size,
>> +					len,
>> +					ctx->current.size,
>> +					ctx->current.cur_pos);
>> +			} else {
>> +				prlog(PR_TRACE, "%s we need to MOVE the window\n", __func__);
>> +				return FLASH_ERR_PARM_ERROR;
>> +			}
>> +		}
>> +	}
>>  
>> +	prlog(PR_TRACE, "%s ALL GOOD, no move needed\n", __func__);
>>  	return 0;
>>  }
>>  
>> +static void move_cb(struct ipmi_msg *msg)
>> +{
>> +	/*
>> +	 * we leave the move_cb outside of the ipmi_hiomap_cmd_cb
>> +	 * based on the command we need to special close the window
>> +	 */

Closing the window allows the asynchronous call, which may take
minutes to complete, to force the next iteration to properly set up
the window.
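
The failure path in move_cb() is what forces that re-setup;
schematically:

    /* sketch: a bad or garbled response closes the window, so the
     * next read/write/erase pass sees closed_window rather than
     * moving_window and issues hiomap_window_move() again */
    if (msg->resp_size != 8 || msg->cc != IPMI_CC_NO_ERROR) {
            ctx->cc = OPAL_HARDWARE;
            ctx->window_state = closed_window;
    }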

> I don't see why this can't be handled in-line with the rest of the
> hiomap commands. Why is closing the window required? Does the existing
> sync path do that?
>
>> +	struct ipmi_hiomap_result *res = msg->user_data;
>> +	struct ipmi_hiomap *ctx = res->ctx;
>> +	/*
>> +	 * only a few iterations to try for lock/
>> +	 * contention is probably hiomap_window_move trying to setup again
>> +	 */
>> +	int lock_try_counter = 10;
>> +
>> +	if ((msg->resp_size != 8) || (msg->cc != IPMI_CC_NO_ERROR) || (msg->data[1] != ctx->inflight_seq)) {
>> +		lock(&ctx->lock);
>> +		ctx->cc = OPAL_HARDWARE;
>> +		ctx->window_state = closed_window;
>> +		goto out;
>> +	}
>> +
>> +	window_parms = (struct hiomap_v2_create_window *)&msg->data[2];
>> +	while (!try_lock(&ctx->lock)) {
>> +		--lock_try_counter;
>> +		if (lock_try_counter == 0) {
>> +			break;
>> +		}
>> +	}
>> +	if (lock_try_counter == 0) {
>> +		/*
>> +		 * we cannot get the lock, but update anyway
>> +		 * because we cannot communicate this completion
>> +		 * and someone will need to retry
>> +		 * contention usually with handle_events or window_move
>> +		 * this code path is the critical path that will open the window
>> +		 */
>> +		ctx->window_state = closed_window;
>> +		ctx->cc = OPAL_PARAMETER;
>> +		goto out2;
>> +	}
See other patch comments on logic being optimized away.
> This lock retry stuff seems fundamentally sketchy and I don't see why
> it's needed.
>
>> +	/* If here, we got the lock, cc consumed higher up so need in ctx */
>> +
>> +	ctx->cc = IPMI_CC_NO_ERROR;
>> +	ctx->current.lpc_addr =
>> +		blocks_to_bytes(ctx, le16_to_cpu(window_parms->lpc_addr));
>> +	ctx->current.size =
>> +		blocks_to_bytes(ctx, le16_to_cpu(window_parms->size));
>> +	ctx->current.cur_pos =
>> +		blocks_to_bytes(ctx, le16_to_cpu(window_parms->offset));
>> +	/* refresh to current */
>> +	ctx->current.adjusted_window_size = ctx->current.size;
>> +
>> +	/* now that we have moved stuff the values */
>> +	*ctx->active_size = ctx->requested_len;
>> +	/*
>> +	 * Is length past the end of the window?
>> +	 * if this condition happens it can cause the async.retry_counter to fail
See other comments on async chunk and straddling current window.
> Again, explain why. The fact it can happen is interesting, but comments
> should be answering more questions than they raise.
>
>> +	if ((ctx->requested_pos + ctx->requested_len) > (ctx->current.cur_pos + ctx->current.size)) {
>> +		/*
>> +		 * Adjust size to meet current window
>> +		 * active_size goes back to caller,
>> +		 * but caller may expire and we need to store for future use
>> +		 */
>> +		*ctx->active_size = (ctx->current.cur_pos + ctx->current.size) - ctx->requested_pos;
>> +		ctx->current.adjusted_window_size = (ctx->current.cur_pos + ctx->current.size) - ctx->requested_pos;
>> +	}

See the other comments on the async chunk straddling the current
window and the subsequent schedule delays and retry.

Static variables are needed since the calls complete asynchronously
and the stack variables are gone by then.
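
The lifetime problem is simply that the IPMI callback can fire after
the requesting function has returned, e.g.:

    /* sketch: with async queuing, a stack-local result would be gone
     * by the time move_cb() runs; a static (or, as suggested in the
     * review, ctx-resident) result survives the early return */
    static struct ipmi_hiomap_result move_res;

    msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
                     bmc_platform->sw->ipmi_oem_hiomap_cmd,
                     move_cb, &move_res, req, sizeof(req),
                     2 + 2 + 2 + 2);
    rc = hiomap_queue_msg(ctx, msg);  /* may return before move_cb() */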

> Both of these variables have no business being set here. How much of
> the current window that we should read is only interesting at the point
> where we do the actual read (i.e. in ipmi_hiomap_read()) and it should
> be handled there rather than passing around a pointer (to a static
> local variable?) and setting it in some random callback.
>
> hiomap_window_move() already returns the size of the mapped area so
> truncate it there.
>
>> +	if (ctx->requested_len != 0 && *ctx->active_size == 0) {
>> +		prlog(PR_NOTICE, "%s Invalid window properties: len: %llu, size: %llu\n",
>> +			__func__, ctx->requested_len, *ctx->active_size);
>> +		ctx->cc = OPAL_PARAMETER;
>> +		ctx->window_state = closed_window;
>> +		goto out;
>> +	}
>> +
>> +	if (msg->data[0] == HIOMAP_C_CREATE_READ_WINDOW)
>> +		ctx->window_state = read_window;
>> +	else
>> +		ctx->window_state = write_window;
>> +
>> +out:	prlog(PR_TRACE, "Exiting the move window callback "
>> +		"transaction ipmi seq=%i\n",
>> +		ctx->inflight_seq);
>> +	unlock(&ctx->lock);
>> +out2:	ipmi_free_msg(msg);
>> +}
>> +
>>  static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
>>  {
>>  	struct ipmi_hiomap_result *res = msg->user_data;
>> @@ -125,9 +312,9 @@ static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
>>  		return;
>>  	}
>>  
>> -	if (msg->data[1] != ctx->seq) {
>> +	if (msg->data[1] != ctx->inflight_seq) {
>>  		prerror("Unmatched sequence number: wanted %u got %u\n",
>> -			ctx->seq, msg->data[1]);
>> +			ctx->inflight_seq, msg->data[1]);
>>  		res->cc = IPMI_ERR_UNSPECIFIED;
>>  		ipmi_free_msg(msg);
>>  		return;
>> @@ -138,6 +325,7 @@ static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
>>  	{
>>  		struct hiomap_v2_info *parms;
>>  
>> +		ctx->cc = IPMI_CC_NO_ERROR;
>>  		if (msg->resp_size != 6) {
>>  			prerror("%u: Unexpected response size: %u\n", msg->data[0],
>>  				msg->resp_size);
>> @@ -162,6 +350,7 @@ static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
>>  	{
>>  		struct hiomap_v2_flash_info *parms;
>>  
>> +		ctx->cc = IPMI_CC_NO_ERROR;
>>  		if (msg->resp_size != 6) {
>>  			prerror("%u: Unexpected response size: %u\n", msg->data[0],
>>  				msg->resp_size);
>> @@ -176,36 +365,6 @@ static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
>>  			blocks_to_bytes(ctx, le16_to_cpu(parms->erase_granule));
>>  		break;
>>  	}
>> -	case HIOMAP_C_CREATE_READ_WINDOW:
>> -	case HIOMAP_C_CREATE_WRITE_WINDOW:
>> -	{
>> -		struct hiomap_v2_create_window *parms;
>> -
>> -		if (msg->resp_size != 8) {
>> -			prerror("%u: Unexpected response size: %u\n", msg->data[0],
>> -				msg->resp_size);
>> -			res->cc = IPMI_ERR_UNSPECIFIED;
>> -			break;
>> -		}
>> -
>> -		parms = (struct hiomap_v2_create_window *)&msg->data[2];
>> -
>> -		ctx->current.lpc_addr =
>> -			blocks_to_bytes(ctx, le16_to_cpu(parms->lpc_addr));
>> -		ctx->current.size =
>> -			blocks_to_bytes(ctx, le16_to_cpu(parms->size));
>> -		ctx->current.cur_pos =
>> -			blocks_to_bytes(ctx, le16_to_cpu(parms->offset));
>> -
>> -		lock(&ctx->lock);
>> -		if (msg->data[0] == HIOMAP_C_CREATE_READ_WINDOW)
>> -			ctx->window_state = read_window;
>> -		else
>> -			ctx->window_state = write_window;
>> -		unlock(&ctx->lock);
>> -
>> -		break;
>> -	}
>>  	case HIOMAP_C_MARK_DIRTY:
>>  	case HIOMAP_C_FLUSH:
>>  	case HIOMAP_C_ACK:
>> @@ -215,7 +374,15 @@ static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
>>  			prerror("%u: Unexpected response size: %u\n", msg->data[0],
>>  				msg->resp_size);
>>  			res->cc = IPMI_ERR_UNSPECIFIED;
>> +			ctx->cc = OPAL_HARDWARE;
>>  			break;
>> +		} else {
>> +			prlog(PR_TRACE, "%s Command=%u 1=RESET 7=DIRTY 8=FLUSH 9=ACK 10=ERASE ipmi seq=%u ctx->inflight_seq=%u\n",\
>> +				__func__,
>> +				msg->data[0],
>> +				msg->data[1],
>> +				ctx->inflight_seq);
>> +			ctx->cc = IPMI_CC_NO_ERROR;
>>  		}
>>  		break;
>>  	default:
>> @@ -237,57 +404,179 @@ static void hiomap_init(struct ipmi_hiomap *ctx)
>>  	unlock(&ctx->lock);
>>  }
>>  
>> +static int hiomap_wait_for_cc(struct ipmi_hiomap *ctx, int *cc, uint8_t *seq, uint64_t ticks)
>> +{
>> +	uint64_t now;
>> +	uint64_t start_time;
>> +	uint64_t wait_time;
>> +	uint64_t ipmi_hiomap_ticks;
>> +	uint64_t timeout_counter;
>> +	int rc;
>> +
>> +	prlog(PR_TRACE, "Start wait for cc ipmi seq=%i *cc=%i ticks=%llu\n", *seq, *cc, ticks);
>> +	rc = 0;
>> +	if (this_cpu()->tb_invalid) {
>> +		/*
>> +		 * SYNC paths already have *cc success
>> +		 * ASYNC will RE-QUEUE and retry
>> +		 * we just need to skip the tb logic handling
>> +		 * we need to close the window to have the logic try the move again
>> +		 */
>> +		if (*cc != IPMI_CC_NO_ERROR) {
>> +			lock(&ctx->lock);
>> +			ctx->window_state = closed_window;
>> +			++ctx->missed_cc_count;
>> +			prlog(PR_NOTICE, "%s tb_invalid, CLOSING WINDOW for cc "
>> +				"ipmi seq=%i ctx->missed_cc_count=%i\n",
>> +				__func__, *seq, ctx->missed_cc_count);
>> +			unlock(&ctx->lock);
>> +			rc = FLASH_ERR_MISSED_CC;
>> +		}
>> +		prlog(PR_NOTICE, "%s tb_invalid, hopefully this will "
>> +			"retry/recover rc=%i\n",
>> +			__func__, rc);
>> +		return rc;
>> +	}
We are doing some logic which uses the timebase as a metric.
> open-coding checks of tb_invalid is something that no one should be
> doing unless they're dealing with the timebase directly. Why do we need
> it here?
>
>> +	start_time = mftb();
>> +	now = mftb();
>> +	wait_time = tb_to_msecs(now - start_time);
>> +	timeout_counter = 0;
> wait_time is zero.
>
>> +
>> +	if (ticks != 0) {
>> +		ipmi_hiomap_ticks = ticks;
>> +	} else {
>> +		ipmi_hiomap_ticks = IPMI_HIOMAP_TICKS;
>> +	}
Compiler optimizations caused some odd-looking code here; the shape is
what keeps the functionality from being optimized away.
> skiboot coding style is to omit the braces on single line blocks so
> drop them. this could also just be:
>
> if (!ticks)
> 	ipmi_hiomap_ticks = IPMI_HIOMAP_TICKS;
>
>> +	while (wait_time < ipmi_hiomap_ticks) {
>> +		++timeout_counter;
>> +		if (timeout_counter % IPMI_SKIP_INC == 0) {
>> +			now = mftb();
>> +			wait_time = tb_to_msecs(now - start_time);
>> +		}
Optimization considerations and polling considerations.
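
The open-coded loop is essentially a bounded poll; with the helper it
would look roughly like this, but time_wait_ms() runs pollers, which
is exactly the recursion concern on the OPAL-call paths:

    /* sketch only: wait up to ipmi_hiomap_ticks ms for the
     * completion code, 1ms granularity assumed */
    uint64_t waited_ms = 0;

    while (ctx->cc != IPMI_CC_NO_ERROR && waited_ms < ipmi_hiomap_ticks) {
            time_wait_ms(1);
            waited_ms++;
    }
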
> why do we need to open-code a delay loop rather than using
> time_wait_ms(), or its no-polling version?
>
>> +		if (*cc == IPMI_CC_NO_ERROR) {
>> +			prlog(PR_TRACE, "Break cc ipmi seq=%i "
>> +				"ctx->missed_cc_count=%i\n",
>> +				*seq, ctx->missed_cc_count);
>> +			break;
>> +		}
>> +	}
>> +	if (*cc != IPMI_CC_NO_ERROR) {
>> +		lock(&ctx->lock);
>> +		ctx->window_state = closed_window;
>> +		++ctx->missed_cc_count;
>> +		unlock(&ctx->lock);
>> +		rc = FLASH_ERR_MISSED_CC;
>> +	}
>> +
>> +	prlog(PR_TRACE, "Stop wait for *cc=%i ipmi seq=%i "
>> +		"ctx->missed_cc_count=%i\n",
>> +		*cc, *seq, ctx->missed_cc_count);
>> +	return rc;
>> +
>> +}
>> +
>>  static int hiomap_get_info(struct ipmi_hiomap *ctx)
>>  {
>> -	RESULT_INIT(res, ctx);
>> +	static struct ipmi_hiomap_result info_res;

Static variables are needed since the calls complete asynchronously
and the stack variables are gone by then.
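
Moving it into the context, as suggested below, would be along these
lines (sketch; the field name is hypothetical):

    /* sketch: result owned by the long-lived context instead of a
     * function-local static */
    struct ipmi_hiomap {
            /* ... existing fields ... */
            struct ipmi_hiomap_result res;  /* one in-flight result */
    };

    /* each command then builds its message with:
     *   ipmi_mkmsg(..., ipmi_hiomap_cmd_cb, &ctx->res, req, ...); */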

> That should be part of the context structure rather than being a static
> local.
>
>>  	unsigned char req[3];
>>  	struct ipmi_msg *msg;
>> +	uint8_t info_seq;
>> +	int orig_flags;
>> +	int tmp_sync_flags;
>>  	int rc;
>>  
>> +	info_res.ctx = ctx;
>> +	info_res.cc = -1;
>> +
>> +	lock(&ctx->lock);
>> +	orig_flags = ctx->bl.flags;
>> +	/* clear out async to always do sync */
>> +	tmp_sync_flags = ctx->bl.flags &= ASYNC_REQUIRED_MASK;
>> +	ctx->bl.flags = tmp_sync_flags;
>> +	ctx->cc = -1;
>> +	info_seq = ++ctx->seq;
>> +	ctx->inflight_seq = info_seq;
>> +	unlock(&ctx->lock);
>> +
>>  	/* Negotiate protocol version 2 */
>>  	req[0] = HIOMAP_C_GET_INFO;
>> -	req[1] = ++ctx->seq;
>> +	req[1] = info_seq;
>>  	req[2] = HIOMAP_V2;
>>  
>>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
>> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 6);
>> +			 ipmi_hiomap_cmd_cb, &info_res, req, sizeof(req), 6);
>>  
>> -	rc = hiomap_queue_msg_sync(ctx, msg);
>> +	rc = hiomap_queue_msg(ctx, msg);
>> +	lock(&ctx->lock);
>> +	ctx->bl.flags = orig_flags;
>> +	unlock(&ctx->lock);
>>  	if (rc)
>>  		return rc;
>>  
>> -	if (res.cc != IPMI_CC_NO_ERROR) {
>> -		prerror("%s failed: %d\n", __func__, res.cc);
>> -		return FLASH_ERR_PARM_ERROR; /* XXX: Find something better? */
>> +	rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_ACK_DEFAULT);
>> +
>> +	if (rc) {
>> +		prlog(PR_TRACE, "%s hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
>> +			__func__, rc, IPMI_ACK_DEFAULT);
>>  	}
>>  
>> -	return 0;
>> +	return rc;
>>  }
>>  
>>  static int hiomap_get_flash_info(struct ipmi_hiomap *ctx)
>>  {
>> -	RESULT_INIT(res, ctx);
>> +	static struct ipmi_hiomap_result flash_info_res;

Static variables are needed since the calls complete asynchronously
and the stack variables are gone by then.

> same
>
>>  	unsigned char req[2];
>>  	struct ipmi_msg *msg;
>> +	uint8_t flash_info_seq;
>> +	int orig_flags;
>> +	int tmp_sync_flags;
>>  	int rc;
>>  
>> +	flash_info_res.ctx = ctx;
>> +	flash_info_res.cc = -1;
>> +
>> +	lock(&ctx->lock);
>> +	orig_flags = ctx->bl.flags;
>> +	/* clear out async to always do sync */
>> +	tmp_sync_flags = ctx->bl.flags &= ASYNC_REQUIRED_MASK;
>> +	ctx->bl.flags = tmp_sync_flags;
>> +	ctx->cc = -1;
>> +	flash_info_seq = ++ctx->seq;
>> +	ctx->inflight_seq = flash_info_seq;
>> +	unlock(&ctx->lock);
>> +
>>  	req[0] = HIOMAP_C_GET_FLASH_INFO;
>> -	req[1] = ++ctx->seq;
>> +	req[1] = flash_info_seq;
>>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
>> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2 + 2 + 2);
>> +			 ipmi_hiomap_cmd_cb, &flash_info_res, req, sizeof(req), 2 + 2 + 2);
>>  
>> -	rc = hiomap_queue_msg_sync(ctx, msg);
>> +	rc = hiomap_queue_msg(ctx, msg);
>> +	lock(&ctx->lock);
>> +	ctx->bl.flags = orig_flags;
>> +	unlock(&ctx->lock);
>>  	if (rc)
>>  		return rc;
>>  
>> -	if (res.cc != IPMI_CC_NO_ERROR) {
>> -		prerror("%s failed: %d\n", __func__, res.cc);
>> -		return FLASH_ERR_PARM_ERROR; /* XXX: Find something better? */
>> +	rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_ACK_DEFAULT);
>> +	if (rc) {
>> +		prlog(PR_TRACE, "%s hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
>> +			__func__, rc, IPMI_ACK_DEFAULT);
>>  	}
>>  
>> -	return 0;
>> +	return rc;
>>  }
>>  
>>  static int hiomap_window_move(struct ipmi_hiomap *ctx, uint8_t command,
>> @@ -295,32 +584,65 @@ static int hiomap_window_move(struct ipmi_hiomap *ctx, uint8_t command,
>>  {
>>  	enum lpc_window_state want_state;
>>  	struct hiomap_v2_range *range;
>> -	RESULT_INIT(res, ctx);
>> +	static struct ipmi_hiomap_result move_res;

Static variables are needed since the calls complete asynchronously
and the stack variables are gone by then.

> same
>
>>  	unsigned char req[6];
>>  	struct ipmi_msg *msg;
>> +	uint8_t move_seq;
>>  	bool valid_state;
>>  	bool is_read;
>>  	int rc;
>>  
>> +	move_res.ctx = ctx;
>> +	move_res.cc = -1;
>>  	is_read = (command == HIOMAP_C_CREATE_READ_WINDOW);
>>  	want_state = is_read ? read_window : write_window; 
>> +	/* there will be lock contention between hiomap_window_move and move_cb */
Optimization considerations.
> If by contention you mean "they take the same lock" then sure, but I
> don't see why it matters.
>
>>  	lock(&ctx->lock);
>> +	ctx->cc = -1;
>> +
>> +	if (ctx->bl.flags & IN_PROGRESS) {
>> +		pos = ctx->tracking_pos;
>> +		len = ctx->tracking_len;
>> +	} else {
>> +		ctx->tracking_pos = pos;
>> +		ctx->tracking_len = len;
>> +	}
>>  
>>  	valid_state = want_state == ctx->window_state;
>>  	rc = hiomap_window_valid(ctx, pos, len);
>> +
>>  	if (valid_state && !rc) {
>> +		/* if its valid stuff the proper maybe modified size */
>> +		if ((pos + len) > (ctx->current.cur_pos + ctx->current.size)) {
>> +			/* if we had bumped the adjusted_window_size down in move_cb */
>> +			if ((ctx->current.adjusted_window_size != ctx->current.size)) {
>> +				*size = ctx->current.adjusted_window_size;
>> +			} else {
>> +				*size = (ctx->current.cur_pos + ctx->current.size) - pos;
>> +			}
See previous comments.
> braces, and as I said above move the calculation of the "adjusted" size
> to here rather than having it in the callback.
>
>
>> +		} else {
>> +			*size = len;
>> +		}
>> +		ctx->cc = IPMI_CC_NO_ERROR;
>>  		unlock(&ctx->lock);
>> -		*size = len;
>>  		return 0;
>>  	}
>>  
>> -	ctx->window_state = closed_window;
>> +	ctx->window_state = moving_window;
>>  
>> -	unlock(&ctx->lock);
>> +	ctx->active_size = size;
>> +	ctx->requested_pos = pos;
>> +	ctx->requested_len = len;
>> +
>> +	move_seq = ++ctx->seq;
>> +	ctx->inflight_seq = move_seq;
>>  
>>  	req[0] = command;
>> -	req[1] = ++ctx->seq;
>> +	req[1] = move_seq;
>> +
>> +	unlock(&ctx->lock);
>>  
>>  	range = (struct hiomap_v2_range *)&req[2];
>>  	range->offset = cpu_to_le16(bytes_to_blocks(ctx, pos));
>> @@ -328,38 +650,14 @@ static int hiomap_window_move(struct ipmi_hiomap *ctx, uint8_t command,
>>  
>>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
>> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req),
>> +			 move_cb, &move_res, req, sizeof(req),
>>  			 2 + 2 + 2 + 2);
>>  
>> -	rc = hiomap_queue_msg_sync(ctx, msg);
>> +	rc = hiomap_queue_msg(ctx, msg);
>> +
>>  	if (rc)
>>  		return rc;
>>  
>> -	if (res.cc != IPMI_CC_NO_ERROR) {
>> -		prlog(PR_INFO, "%s failed: %d\n", __func__, res.cc);
>> -		return FLASH_ERR_PARM_ERROR; /* XXX: Find something better? */
>> -	}
>> -
>> -	lock(&ctx->lock);
>> -	*size = len;
>> -	/* Is length past the end of the window? */
>> -	if ((pos + len) > (ctx->current.cur_pos + ctx->current.size))
>> -		/* Adjust size to meet current window */
>> -		*size = (ctx->current.cur_pos + ctx->current.size) - pos;
>> -
>> -	if (len != 0 && *size == 0) {
>> -		unlock(&ctx->lock);
>> -		prerror("Invalid window properties: len: %"PRIu64", size: %"PRIu64"\n",
>> -			len, *size);
>> -		return FLASH_ERR_PARM_ERROR;
>> -	}
>> -
>> -	prlog(PR_DEBUG, "Opened %s window from 0x%x for %u bytes at 0x%x\n",
>> -	      (command == HIOMAP_C_CREATE_READ_WINDOW) ? "read" : "write",
>> -	      ctx->current.cur_pos, ctx->current.size, ctx->current.lpc_addr);
>> -
>> -	unlock(&ctx->lock);
>> -
>>  	return 0;
>>  }
>>  
>> @@ -368,21 +666,27 @@ static int hiomap_mark_dirty(struct ipmi_hiomap *ctx, uint64_t offset,
>>  {
>>  	struct hiomap_v2_range *range;
>>  	enum lpc_window_state state;
>> -	RESULT_INIT(res, ctx);
>> +	static struct ipmi_hiomap_result dirty_res;
>>  	unsigned char req[6];
>>  	struct ipmi_msg *msg;
>> +	uint8_t dirty_seq;
>>  	uint32_t pos;
>>  	int rc;
>>  
>> +	dirty_res.ctx = ctx;
>> +	dirty_res.cc = -1;
>>  	lock(&ctx->lock);
>>  	state = ctx->window_state;
>> +	dirty_seq = ++ctx->seq;
>> +	ctx->inflight_seq = dirty_seq;
>> +	ctx->cc = -1;
>>  	unlock(&ctx->lock);
>>  
>>  	if (state != write_window)
>>  		return FLASH_ERR_PARM_ERROR;
>>  
>>  	req[0] = HIOMAP_C_MARK_DIRTY;
>> -	req[1] = ++ctx->seq;
>> +	req[1] = dirty_seq;
>>  
>>  	pos = offset - ctx->current.cur_pos;
>>  	range = (struct hiomap_v2_range *)&req[2];
>> @@ -391,19 +695,15 @@ static int hiomap_mark_dirty(struct ipmi_hiomap *ctx, uint64_t offset,
>>  
>>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
>> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
>> +			 ipmi_hiomap_cmd_cb, &dirty_res, req, sizeof(req), 2);
>> +
>> +	rc = hiomap_queue_msg(ctx, msg);
>>  
>> -	rc = hiomap_queue_msg_sync(ctx, msg);
>>  	if (rc)
>>  		return rc;
>>  
>> -	if (res.cc != IPMI_CC_NO_ERROR) {
>> -		prerror("%s failed: %d\n", __func__, res.cc);
>> -		return FLASH_ERR_PARM_ERROR;
>> -	}
>> -
>>  	prlog(PR_DEBUG, "Marked flash dirty at 0x%" PRIx64 " for %" PRIu64 "\n",
>> -	      offset, size);
>> +		offset, size);
>>  
>>  	return 0;
>>  }
>> @@ -411,34 +711,36 @@ static int hiomap_mark_dirty(struct ipmi_hiomap *ctx, uint64_t offset,
>>  static int hiomap_flush(struct ipmi_hiomap *ctx)
>>  {
>>  	enum lpc_window_state state;
>> -	RESULT_INIT(res, ctx);
>> +	static struct ipmi_hiomap_result flush_res;
>>  	unsigned char req[2];
>>  	struct ipmi_msg *msg;
>> +	uint8_t flush_seq;
>>  	int rc;
>>  
>> +	flush_res.ctx = ctx;
>> +	flush_res.cc = -1;
>>  	lock(&ctx->lock);
>>  	state = ctx->window_state;
>> +	flush_seq = ++ctx->seq;
>> +	ctx->inflight_seq = flush_seq;
>> +	ctx->cc = -1;
>>  	unlock(&ctx->lock);
>>  
>>  	if (state != write_window)
>>  		return FLASH_ERR_PARM_ERROR;
>>  
>>  	req[0] = HIOMAP_C_FLUSH;
>> -	req[1] = ++ctx->seq;
>> +	req[1] = flush_seq;
>>  
>>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
>> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
>> +			 ipmi_hiomap_cmd_cb, &flush_res, req, sizeof(req), 2);
>> +
>> +	rc = hiomap_queue_msg(ctx, msg);
>>  
>> -	rc = hiomap_queue_msg_sync(ctx, msg);
>>  	if (rc)
>>  		return rc;
>>  
>> -	if (res.cc != IPMI_CC_NO_ERROR) {
>> -		prerror("%s failed: %d\n", __func__, res.cc);
>> -		return FLASH_ERR_PARM_ERROR;
>> -	}
>> -
>>  	prlog(PR_DEBUG, "Flushed writes\n");
>>  
>>  	return 0;
>> @@ -446,26 +748,47 @@ static int hiomap_flush(struct ipmi_hiomap *ctx)
>>  
>>  static int hiomap_ack(struct ipmi_hiomap *ctx, uint8_t ack)
>>  {
>> -	RESULT_INIT(res, ctx);
>> +	static struct ipmi_hiomap_result ack_res;

Static variables are needed since the calls complete asynchronously
and the stack variables are gone by then.

> here too
>
>>  	unsigned char req[3];
>>  	struct ipmi_msg *msg;
>> +	uint8_t ack_seq;
>> +	int orig_flags;
>> +	int tmp_sync_flags;
>>  	int rc;
>>  
>> +	ack_res.ctx = ctx;
>> +	ack_res.cc = -1;
>> +
>> +	lock(&ctx->lock);
>> +	orig_flags = ctx->bl.flags;
>> +	/* clear out async to always do sync */
>> +	tmp_sync_flags = ctx->bl.flags &= ASYNC_REQUIRED_MASK;
>> +	ctx->bl.flags = tmp_sync_flags;
>> +	ctx->cc = -1;
>> +	ack_seq = ++ctx->seq;
>> +	ctx->inflight_seq = ack_seq;
>> +	unlock(&ctx->lock);
>> +
>>  	req[0] = HIOMAP_C_ACK;
>> -	req[1] = ++ctx->seq;
>> +	req[1] = ack_seq;
>>  	req[2] = ack;
>>  
>>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
>> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
>> +			 ipmi_hiomap_cmd_cb, &ack_res, req, sizeof(req), 2);
>>  
>> -	rc = hiomap_queue_msg_sync(ctx, msg);
>> +	rc = hiomap_queue_msg(ctx, msg);
>> +	lock(&ctx->lock);
>> +	ctx->bl.flags = orig_flags;
>> +	unlock(&ctx->lock);
>>  	if (rc)
>>  		return rc;
>>  
>> -	if (res.cc != IPMI_CC_NO_ERROR) {
>> -		prlog(PR_DEBUG, "%s failed: %d\n", __func__, res.cc);
>> -		return FLASH_ERR_PARM_ERROR;
>> +	rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_ACK_DEFAULT);
>> +	if (rc) {
>> +		prlog(PR_TRACE, "%s hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
>> +			__func__, rc, IPMI_ACK_DEFAULT);
>> +		return rc;
>>  	}
>>  
>>  	prlog(PR_DEBUG, "Acked events: 0x%x\n", ack);
>> @@ -478,21 +801,27 @@ static int hiomap_erase(struct ipmi_hiomap *ctx, uint64_t offset,
>>  {
>>  	struct hiomap_v2_range *range;
>>  	enum lpc_window_state state;
>> -	RESULT_INIT(res, ctx);
>> +	static struct ipmi_hiomap_result erase_res;

Static variables are needed since the calls complete asynchronously
and the stack variables are gone by then.

> here too
>>  	unsigned char req[6];
>>  	struct ipmi_msg *msg;
>> +	uint8_t erase_seq;
>>  	uint32_t pos;
>>  	int rc;
>>  
>> +	erase_res.ctx = ctx;
>> +	erase_res.cc = -1;
>>  	lock(&ctx->lock);
>>  	state = ctx->window_state;
>> +	erase_seq = ++ctx->seq;
>> +	ctx->inflight_seq = erase_seq;
>> +	ctx->cc = -1;
>>  	unlock(&ctx->lock);
>>  
>>  	if (state != write_window)
>>  		return FLASH_ERR_PARM_ERROR;
>>  
>>  	req[0] = HIOMAP_C_ERASE;
>> -	req[1] = ++ctx->seq;
>> +	req[1] = erase_seq;
>>  
>>  	pos = offset - ctx->current.cur_pos;
>>  	range = (struct hiomap_v2_range *)&req[2];
>> @@ -501,16 +830,13 @@ static int hiomap_erase(struct ipmi_hiomap *ctx, uint64_t offset,
>>  
>>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
>> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
>> -	rc = hiomap_queue_msg_sync(ctx, msg);
>> +			 ipmi_hiomap_cmd_cb, &erase_res, req, sizeof(req), 2);
>> +
>> +	rc = hiomap_queue_msg(ctx, msg);
>> +
>>  	if (rc)
>>  		return rc;
>>  
>> -	if (res.cc != IPMI_CC_NO_ERROR) {
>> -		prerror("%s failed: %d\n", __func__, res.cc);
>> -		return FLASH_ERR_PARM_ERROR;
>> -	}
>> -
>>  	prlog(PR_DEBUG, "Erased flash at 0x%" PRIx64 " for %" PRIu64 "\n",
>>  	      offset, size);
>>  
>> @@ -519,24 +845,53 @@ static int hiomap_erase(struct ipmi_hiomap *ctx, uint64_t offset,
>>  
>>  static bool hiomap_reset(struct ipmi_hiomap *ctx)
>>  {
>> -	RESULT_INIT(res, ctx);
>> +	static struct ipmi_hiomap_result reset_res;

Static variables are needed since the calls complete asynchronously, after the requesting function's stack variables are gone (same reason as above).

> same
>
>>  	unsigned char req[2];
>>  	struct ipmi_msg *msg;
>> +	uint8_t reset_seq;
>> +	int orig_flags;
>> +	int tmp_sync_flags;
>> +	int rc;
>>  
>> -	prlog(PR_NOTICE, "Reset\n");
>> +	prlog(PR_NOTICE, "%s Reset ENTRY\n", __func__);
>> +	reset_res.ctx = ctx;
>> +	reset_res.cc = -1;
>> +
>> +	lock(&ctx->lock);
>> +	orig_flags = ctx->bl.flags;
>> +	/* clear out async to always do sync */
>> +	tmp_sync_flags = ctx->bl.flags &= ASYNC_REQUIRED_MASK;
>> +	ctx->bl.flags = tmp_sync_flags;
>> +	reset_seq = ++ctx->seq;
>> +	ctx->cc = -1;
>> +	ctx->inflight_seq = reset_seq;
>> +	unlock(&ctx->lock);
>>  
>>  	req[0] = HIOMAP_C_RESET;
>> -	req[1] = ++ctx->seq;
>> +	req[1] = reset_seq;
>>  	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
>>  		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
>> -			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
>> -	ipmi_queue_msg_sync(msg);
>> +			 ipmi_hiomap_cmd_cb, &reset_res, req, sizeof(req), 2);
>> +
>> +	rc = hiomap_queue_msg(ctx, msg);
>> +	lock(&ctx->lock);
>> +	ctx->bl.flags = orig_flags;
>> +	unlock(&ctx->lock);
>> +
>> +	if (rc) {
>> +		prlog(PR_NOTICE, "%s reset queue msg failed: rc=%d\n", __func__, rc);
>> +		return false;
>> +	}
>>  
>> -	if (res.cc != IPMI_CC_NO_ERROR) {
>> -		prlog(PR_ERR, "%s failed: %d\n", __func__, res.cc);
>> +	rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_ACK_DEFAULT);
>> +
>> +	if (rc) {
>> +		prlog(PR_NOTICE, "%s hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
>> +			__func__, rc, IPMI_ACK_DEFAULT);
>>  		return false;
>>  	}
>>  
>> +	prlog(PR_NOTICE, "%s Reset EXIT\n", __func__);
>>  	return true;
>>  }
>>  
>> @@ -666,6 +1021,7 @@ static int ipmi_hiomap_handle_events(struct ipmi_hiomap *ctx)
>>  	 * Therefore it is enough to mark the window as closed to consider it
>>  	 * recovered.
>>  	 */
>> +
>>  	if (status & (HIOMAP_E_PROTOCOL_RESET | HIOMAP_E_WINDOW_RESET))
>>  		ctx->window_state = closed_window;
>>  
>> @@ -737,8 +1093,9 @@ static int ipmi_hiomap_read(struct blocklevel_device *bl, uint64_t pos,
>>  			    void *buf, uint64_t len)
>>  {
>>  	struct ipmi_hiomap *ctx;
>> -	uint64_t size;
>> -	int rc = 0;
>> +	enum lpc_window_state state;
>> +	static uint64_t size;
>> +	int rc;
>>  
>>  	/* LPC is only 32bit */
>>  	if (pos > UINT_MAX || len > UINT_MAX)
>> @@ -746,88 +1103,208 @@ static int ipmi_hiomap_read(struct blocklevel_device *bl, uint64_t pos,
>>  
>>  	ctx = container_of(bl, struct ipmi_hiomap, bl);
>>  
>> +	lock(&ctx->transaction_lock);
>> +
>>  	rc = ipmi_hiomap_handle_events(ctx);
>>  	if (rc)
>> -		return rc;
>> +		goto out;
>> +
>> +	lock(&ctx->lock);
>> +	if (ctx->bl.flags & IN_PROGRESS) {
>> +		buf = ctx->tracking_buf;
>> +		pos = ctx->tracking_pos;
>> +		len = ctx->tracking_len;
>> +	} else {
>> +		ctx->tracking_buf = buf;
>> +		ctx->tracking_pos = 0;
>> +		ctx->tracking_len = 0;
>> +	}
>> +	unlock(&ctx->lock);
>>  
>>  	prlog(PR_TRACE, "Flash read at %#" PRIx64 " for %#" PRIx64 "\n", pos,
>>  	      len);
>>  	while (len > 0) {
>> -		/* Move window and get a new size to read */
>> -		rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_READ_WINDOW, pos,
>> -				        len, &size);
>> -		if (rc)
>> -			return rc;
>> -
>> -		/* Perform the read for this window */
>> -		rc = lpc_window_read(ctx, pos, buf, size);
>> -		if (rc)
>> -			return rc;
>> -
>> -		/* Check we can trust what we read */
>>  		lock(&ctx->lock);
>> -		rc = hiomap_window_valid(ctx, pos, size);
>> +		state = ctx->window_state;
>>  		unlock(&ctx->lock);
>> -		if (rc)
>> -			return rc;
>> +		if (state != moving_window) {
>> +			/* Move window and get a new size to read */
>> +			rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_READ_WINDOW, pos,
>> +				len, &size);
>> +			if (rc) {
>> +				prlog(PR_NOTICE, "%s hiomap_window_move failed: rc=%d\n",
>> +					__func__, rc);
>> +				goto out;
>> +			}
>> +		} else {
>> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_HIOMAP_TICKS_DEFAULT);
>> +			if (rc) {
>> +				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
>> +					__func__, rc, IPMI_HIOMAP_TICKS_DEFAULT);
>> +				goto out;
>> +			}
>> +		}
>>  
>> -		len -= size;
>> -		pos += size;
>> -		buf += size;
>> +		lock(&ctx->lock);
>> +		state = ctx->window_state;
>> +		unlock(&ctx->lock);
>> +		if (state == read_window) {
>> +			/*
>> +			 * don't lock in case move_cb in progress
>> +			 * if we get here the state is good
>> +			 * just double-checking
>> +			 */
>> +			if (ctx->cc != IPMI_CC_NO_ERROR) {
>> +				prlog(PR_NOTICE, "%s failed: cc=%d\n", __func__, ctx->cc);
>> +				rc = OPAL_HARDWARE;
>> +				goto out;
>> +			}
>> +			/* Perform the read for this window */
>> +			rc = lpc_window_read(ctx, pos, buf, size);
>> +			if (rc) {
>> +				prlog(PR_NOTICE, "%s lpc_window_read failed: rc=%d\n", __func__, rc);
>> +				goto out;
>> +			}
>> +
>> +			/* Check we can trust what we read */
>> +			lock(&ctx->lock);
>> +			rc = hiomap_window_valid(ctx, pos, size);
>> +			unlock(&ctx->lock);
>> +			if (rc) {
>> +				prlog(PR_NOTICE, "%s hiomap_window_valid failed: rc=%d\n", __func__, rc);
>> +				goto out;
>> +			}
>> +
>> +			len -= size;
>> +			pos += size;
>> +			buf += size;
>> +			lock(&ctx->lock);
>> +			ctx->tracking_len = len;
>> +			ctx->tracking_pos = pos;
>> +			ctx->tracking_buf = buf;
>> +			unlock(&ctx->lock);
>> +		}
>>  	}
>> -	return rc;
>>  
>> +out:	unlock(&ctx->transaction_lock);
>> +	return rc;
>>  }
>>  
>>  static int ipmi_hiomap_write(struct blocklevel_device *bl, uint64_t pos,
>>  			     const void *buf, uint64_t len)
>>  {
>>  	struct ipmi_hiomap *ctx;
>> -	uint64_t size;
>> -	int rc = 0;
>> +	enum lpc_window_state state;
>> +	static uint64_t size;
>> +	int rc;
>>  
>>  	/* LPC is only 32bit */
>>  	if (pos > UINT_MAX || len > UINT_MAX)
>>  		return FLASH_ERR_PARM_ERROR;
>>  
>>  	ctx = container_of(bl, struct ipmi_hiomap, bl);
>> +	lock(&ctx->transaction_lock);
>>  
>>  	rc = ipmi_hiomap_handle_events(ctx);
>>  	if (rc)
>> -		return rc;
>> +		goto out;
>> +
>> +	lock(&ctx->lock);
>> +	if (ctx->bl.flags & IN_PROGRESS) {
>> +		buf = ctx->tracking_buf;
>> +		pos = ctx->tracking_pos;
>> +		len = ctx->tracking_len;
>> +	} else {
>> +		ctx->tracking_buf = (void *) buf;
>> +		ctx->tracking_pos = 0;
>> +		ctx->tracking_len = 0;
>> +	}
>> +	unlock(&ctx->lock);
>>  
>>  	prlog(PR_TRACE, "Flash write at %#" PRIx64 " for %#" PRIx64 "\n", pos,
>>  	      len);
>>  	while (len > 0) {
>> -		/* Move window and get a new size to read */
>> -		rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_WRITE_WINDOW, pos,
>> -				        len, &size);
>> -		if (rc)
>> -			return rc;
>> -
>> -		/* Perform the write for this window */
>> -		rc = lpc_window_write(ctx, pos, buf, size);
>> -		if (rc)
>> -			return rc;
>> -
>> -		rc = hiomap_mark_dirty(ctx, pos, size);
>> -		if (rc)
>> -			return rc;
>> +		lock(&ctx->lock);
>> +		state = ctx->window_state;
>> +		unlock(&ctx->lock);
>> +		if (state != moving_window) {
>> +			/* Move window and get a new size to read */
>> +			rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_WRITE_WINDOW, pos,
>> +					        len, &size);
>> +			if (rc) {
>> +				prlog(PR_NOTICE, "%s hiomap_window_move failed: rc=%d\n",
>> +					__func__, rc);
>> +				goto out;
>> +			}
>> +		} else {
>> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
>> +			if (rc) {
>> +				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
>> +					__func__, rc, IPMI_LONG_TICKS);
>> +				goto out;
>> +			}
>> +		}
>>  
>> -		/*
>> -		 * The BMC *should* flush if the window is implicitly closed,
>> -		 * but do an explicit flush here to be sure.
>> -		 *
>> -		 * XXX: Removing this could improve performance
>> -		 */
>> -		rc = hiomap_flush(ctx);
>> -		if (rc)
>> -			return rc;
>> +		lock(&ctx->lock);
>> +		state = ctx->window_state;
>> +		unlock(&ctx->lock);
>>  
>> -		len -= size;
>> -		pos += size;
>> -		buf += size;
>> +		if (state == write_window) {
>> +			if (ctx->cc != IPMI_CC_NO_ERROR) {
>> +				prlog(PR_NOTICE, "%s failed: cc=%d\n", __func__, ctx->cc);
>> +				rc = OPAL_HARDWARE;
>> +				goto out;
>> +			}
>> +
>> +			/* Perform the write for this window */
>> +			rc = lpc_window_write(ctx, pos, buf, size);
>> +			if (rc) {
>> +				prlog(PR_NOTICE, "%s lpc_window_write failed: rc=%d\n", __func__, rc);
>> +				goto out;
>> +			}
>> +
>> +			rc = hiomap_mark_dirty(ctx, pos, size);
>> +			if (rc) {
>> +				prlog(PR_NOTICE, "%s hiomap_mark_dirty failed: rc=%d\n", __func__, rc);
>> +				goto out;
>> +			}
>> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
>> +			if (rc) {
>> +				prlog(PR_TRACE, "%s dirty hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
>> +					__func__, rc, IPMI_LONG_TICKS);
>> +				goto out;
>> +			}
>> +
>> +			/*
>> +			 * The BMC *should* flush if the window is implicitly closed,
>> +			 * but do an explicit flush here to be sure.
>> +			 *
>> +			 * XXX: Removing this could improve performance
>> +			 */
>> +			rc = hiomap_flush(ctx);
>> +			if (rc) {
>> +				prlog(PR_NOTICE, "%s hiomap_flush failed: rc=%d\n", __func__, rc);
>> +				goto out;
>> +			}
>> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
>> +			if (rc) {
>> +				prlog(PR_TRACE, "%s flush hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
>> +					__func__, rc, IPMI_LONG_TICKS);
>> +				goto out;
>> +			}
>> +
>> +			len -= size;
>> +			pos += size;
>> +			buf += size;
>> +			lock(&ctx->lock);
>> +			ctx->tracking_len = len;
>> +			ctx->tracking_pos = pos;
>> +			ctx->tracking_buf = (void *) buf;
>> +			unlock(&ctx->lock);
>> +		}
>>  	}
>> +
>> +out:	unlock(&ctx->transaction_lock);
>>  	return rc;
>>  }
>>  
>> @@ -835,6 +1312,8 @@ static int ipmi_hiomap_erase(struct blocklevel_device *bl, uint64_t pos,
>>  			     uint64_t len)
>>  {
>>  	struct ipmi_hiomap *ctx;
>> +	enum lpc_window_state state;
>> +	static uint64_t size;
>>  	int rc;
>>  
>>  	/* LPC is only 32bit */
>> @@ -842,39 +1321,94 @@ static int ipmi_hiomap_erase(struct blocklevel_device *bl, uint64_t pos,
>>  		return FLASH_ERR_PARM_ERROR;
>>  
>>  	ctx = container_of(bl, struct ipmi_hiomap, bl);
>> +	lock(&ctx->transaction_lock);
>>  
>>  	rc = ipmi_hiomap_handle_events(ctx);
>>  	if (rc)
>> -		return rc;
>> +		goto out;
>> +
>> +	lock(&ctx->lock);
>> +	if (ctx->bl.flags & IN_PROGRESS) {
>> +		pos = ctx->tracking_pos;
>> +		len = ctx->tracking_len;
>> +	} else {
>> +		ctx->tracking_pos = 0;
>> +		ctx->tracking_len = 0;
>> +	}
>> +	unlock(&ctx->lock);
>>  
>>  	prlog(PR_TRACE, "Flash erase at 0x%08x for 0x%08x\n", (u32) pos,
>>  	      (u32) len);
>>  	while (len > 0) {
>> -		uint64_t size;
>> -
>> -		/* Move window and get a new size to erase */
>> -		rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_WRITE_WINDOW, pos,
>> -				        len, &size);
>> -		if (rc)
>> -			return rc;
>> -
>> -		rc = hiomap_erase(ctx, pos, size);
>> -		if (rc)
>> -			return rc;
>> -
>> -		/*
>> -		 * Flush directly, don't mark that region dirty otherwise it
>> -		 * isn't clear if a write happened there or not
>> -		 */
>> -		rc = hiomap_flush(ctx);
>> -		if (rc)
>> -			return rc;
>> +		lock(&ctx->lock);
>> +		state = ctx->window_state;
>> +		unlock(&ctx->lock);
>> +		if (state != moving_window) {
>> +			/* Move window and get a new size to erase */
>> +			rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_WRITE_WINDOW, pos,
>> +					        len, &size);
>> +			if (rc) {
>> +				prlog(PR_NOTICE, "%s hiomap_window_move failed: rc=%d\n",
>> +					__func__, rc);
>> +				goto out;
>> +			}
>> +		} else {
>> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
>> +			if (rc) {
>> +				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
>> +					__func__, rc, IPMI_LONG_TICKS);
>> +				goto out;
>> +			}
>> +		}
>>  
>> -		len -= size;
>> -		pos += size;
>> +		lock(&ctx->lock);
>> +		state = ctx->window_state;
>> +		unlock(&ctx->lock);
>> +		if (state == write_window) {
>> +			if (ctx->cc != IPMI_CC_NO_ERROR) {
>> +				prlog(PR_NOTICE, "%s failed: cc=%d\n", __func__, ctx->cc);
>> +				rc = OPAL_HARDWARE;
>> +				goto out;
>> +			}
>> +			rc = hiomap_erase(ctx, pos, size);
>> +			if (rc) {
>> +				prlog(PR_NOTICE, "%s hiomap_erase failed: rc=%d\n", __func__, rc);
>> +				goto out;
>> +			}
>> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
>> +			if (rc) {
>> +				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
>> +					__func__, rc, IPMI_LONG_TICKS);
>> +				goto out;
>> +			}
>> +
>> +			/*
>> +			 * Flush directly, don't mark that region dirty otherwise it
>> +			 * isn't clear if a write happened there or not
>> +			 */
>> +			rc = hiomap_flush(ctx);
>> +			if (rc) {
>> +				prlog(PR_NOTICE, "%s hiomap_flush failed: rc=%d\n", __func__, rc);
>> +				goto out;
>> +			}
>> +			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
>> +			if (rc) {
>> +				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
>> +					__func__, rc, IPMI_LONG_TICKS);
>> +				goto out;
>> +			}
>> +
>> +			len -= size;
>> +			pos += size;
>> +			lock(&ctx->lock);
>> +			ctx->tracking_len = len;
>> +			ctx->tracking_pos = pos;
>> +			unlock(&ctx->lock);
>> +		}
>>  	}
>>  
>> -	return 0;
>> +out:	unlock(&ctx->transaction_lock);
>> +	return rc;
>>  }
>>  
>>  static int ipmi_hiomap_get_flash_info(struct blocklevel_device *bl,
>> @@ -885,6 +1419,7 @@ static int ipmi_hiomap_get_flash_info(struct blocklevel_device *bl,
>>  	int rc;
>>  
>>  	ctx = container_of(bl, struct ipmi_hiomap, bl);
>> +	lock(&ctx->transaction_lock);
>>  
>>  	rc = ipmi_hiomap_handle_events(ctx);
>>  	if (rc)
>> @@ -903,6 +1438,7 @@ static int ipmi_hiomap_get_flash_info(struct blocklevel_device *bl,
>>  	if (erase_granule)
>>  		*erase_granule = ctx->erase_granule;
>>  
>> +	unlock(&ctx->transaction_lock);

The transaction lock is needed to ensure the entire async flash operation has run to completion.
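
A rough sketch of the intended nesting, with pthreads standing in for
skiboot's lock primitives:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t transaction_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static int window_state;

static int flash_op(void)
{
	/* Held across the whole multi-message operation, so a second
	 * flash op cannot interleave with the window moves below. */
	pthread_mutex_lock(&transaction_lock);

	/* The inner lock only guards brief state updates. */
	pthread_mutex_lock(&state_lock);
	window_state = 1;		/* e.g. moving_window */
	pthread_mutex_unlock(&state_lock);

	/* ... queue IPMI messages, wait, read the LPC window ... */

	pthread_mutex_unlock(&transaction_lock);
	return 0;
}

int main(void)
{
	flash_op();
	printf("state=%d\n", window_state);
	return 0;
}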

> What situations is the transaction lock required to protect against?
> I'm willing to believe it's necessary, but I'd like to know why since we
> don't have one currently. Does the flash_lock handle everything for us
> at the moment?
>
>>  	return 0;
>>  }
>>  
>> @@ -925,6 +1461,7 @@ int ipmi_hiomap_init(struct blocklevel_device **bl)
>>  		return FLASH_ERR_MALLOC_FAILED;
>>  
>>  	init_lock(&ctx->lock);
>> +	init_lock(&ctx->transaction_lock);
>>  
>>  	ctx->bl.read = &ipmi_hiomap_read;
>>  	ctx->bl.write = &ipmi_hiomap_write;
>> diff --git a/libflash/ipmi-hiomap.h b/libflash/ipmi-hiomap.h
>> index 489d55e..4870fc5 100644
>> --- a/libflash/ipmi-hiomap.h
>> +++ b/libflash/ipmi-hiomap.h
>> @@ -10,12 +10,36 @@
>>  
>>  #include "blocklevel.h"
>>  
>> -enum lpc_window_state { closed_window, read_window, write_window };
>> +/*
>> + * we basically check for a quick response
>> + * otherwise we catch the updated window in the next cycle
>> + */
>> +#define IPMI_HIOMAP_TICKS 5

See the previous comment on the methodology: we only wait for an immediate hit; otherwise the design pops back out to Linux to allow the completion.
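
A sketch of that behaviour, with illustrative names and delays:

#include <stdio.h>

#define IPMI_HIOMAP_TICKS 5		/* short grace period */

static volatile int cc = -1;		/* set to 0 by the IPMI callback */

/* Spin just long enough to catch an immediate response; otherwise
 * report "missed" so the caller can bounce back out to Linux and
 * pick up the completion on the next poll cycle. */
static int wait_for_cc(int ticks)
{
	int i;

	for (i = 0; i < ticks; i++) {
		if (cc == 0)
			return 0;	/* quick hit */
		/* run_pollers()/a short delay would go here in skiboot */
	}
	return -1;			/* missed: requeue, not a failure */
}

int main(void)
{
	printf("rc=%d\n", wait_for_cc(IPMI_HIOMAP_TICKS));	/* -1 */
	cc = 0;							/* late completion */
	printf("rc=%d\n", wait_for_cc(IPMI_HIOMAP_TICKS));	/* 0 */
	return 0;
}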

> I did some testing on a witherspoon to see how long a
> hiomap_send_msg_sync() usually takes, and in the fastest times were:
>
> 	count   wait time (ms)
> 	1 	6
> 	18	7
> 	52	8
> 	24	9
> 	19	10
> 	4	11
> 	8	12
> 	<long tail of larger waits>
>
> So, is 5ms really enough to catch the common response times? It seems a
> bit low to me.
>
>> +#define IPMI_HIOMAP_TICKS_DEFAULT 0
>> +
>> +/* time to wait for write/erase/dirty ops */
>> +#define IPMI_LONG_TICKS 500
>> +
>> +/*
>> + * default for ack'ing typically 1-10 wait_time's
>> + * allow upper bounds because if we cannot ack
>> + * we make no forward progress post protocol reset
>> + * async paths will retry
>> + * sync paths always hit with zero wait_time elapsed
>> + * with ASYNC_REQUIRED_MASK'd out, this is not used
>> + */
>> +#define IPMI_ACK_DEFAULT 500
>> +
>> +/* increment to skip the waiting loop */
>> +#define IPMI_SKIP_INC 2
>> +
>> +enum lpc_window_state { closed_window, read_window, write_window, moving_window };
>>  
>>  struct lpc_window {
>>  	uint32_t lpc_addr; /* Offset into LPC space */
>>  	uint32_t cur_pos;  /* Current position of the window in the flash */
>>  	uint32_t size;     /* Size of the window into the flash */
>> +	uint32_t adjusted_window_size; /* store adjusted window size */
>>  };
>>  
>>  struct ipmi_hiomap {
>> @@ -35,6 +59,21 @@ struct ipmi_hiomap {
>>  	 * three variables are protected by lock to avoid conflict.
>>  	 */
>>  	struct lock lock;
>> +	struct lock transaction_lock;
>> +
>> +	/* callers transaction info */
>> +	uint64_t *active_size;
>> +	uint64_t requested_len;
>> +	uint64_t requested_pos;
>> +	uint64_t tracking_len;
>> +	uint64_t tracking_pos;
>> +	void *tracking_buf;
>> +
>> +	int missed_cc_count;
>> +	int cc;
>> +	/* inflight_seq used to aide debug */
>> +	/* with other OPAL ipmi msg's      */
>> +	uint8_t inflight_seq;
>>  	uint8_t bmc_state;
>>  	enum lpc_window_state window_state;
>>  };
Oliver O'Halloran Dec. 17, 2019, 1:53 a.m. UTC | #6
On Fri, 2019-12-13 at 06:32 -0600, Deb McLemore wrote:
> Comments below:
> 
> On 11/24/19 9:12 PM, Oliver O'Halloran wrote:
> > On Thu, Nov 7, 2019 at 4:25 AM Deb McLemore <debmc@linux.ibm.com> wrote:
> > > *snip*
> From libflash/ipmi-hiomap.h
> 
> /*
>  * we basically check for a quick response
>  * otherwise we catch the updated window in the next cycle
>  */
> #define IPMI_HIOMAP_TICKS 5
> 
> Unless the window is immediately ready (so we have to stall
> doing some logic which is not optimized out), this allows a short
> period of time to catch a quick response; otherwise we always kick
> back out to Linux, and that delay gives the window time to move.
> 
> > wait_time is always going to be zero since the two mftb() calls are
> > going to return pretty similar values and tb_to_msecs() divides the
> > difference by 512000 (tb_hz / 1000).
> > 
> > > +       timeout_counter = 0;
> > > +
> > > +
> > > +       while (wait_time < flash_reserve_ticks) {
> > > +               ++timeout_counter;
> > > +               if (timeout_counter % 4 == 0) {
> > > +                       now = mftb();
> > > +                       wait_time = tb_to_msecs(now - start_time);
> > > +               }
> > > +               if (flash->busy == false) {
> > > +                       break;
> > > +               }
> > > +       }

Please follow convention and put your responses below the thing being
responded to rather than above. Mixing responses above and below just
makes the thread a pain to follow.

> Again, due to compiler optimization we want to stall and actually
> check the value "x" number of times to allow a miss or two
> on the lock to happen; other logic was optimized away.
> 
> > This doesn't make any sense to me. What are you doing here and why?
> > 
> > > -       if (!try_lock(&flash_lock))
> > > +       while (!try_lock(&flash_lock)) {
> > > +               --lock_try_counter;
> > > +               if (lock_try_counter == 0) {
> > > +                       break;
> > > +               }
> > > +       }
> > If you're going to do this sort of thing then use a for loop. That
> > way the iteration count is set in the same place it's being used.
> > 
> > > +       if (lock_try_counter == 0) {
> > >                 return false;
> > > +       }
> > you should return from inside the loop instead of breaking out and
> > re-checking the break condition
> > 
> > > +
> > > +       /* we have the lock if we got here */
> > > 
> > >         if (!flash->busy) {
> > >                 flash->busy = true;
> > >                 rc = true;
> > > +       } else {
> > > +               /* probably beat a flash_release and grabbed the lock */
> > > +               unlock(&flash_lock);
> > > +               while (!try_lock(&flash_lock)) {
> > > +                       --lock_try_counter;
> > > +                       if (lock_try_counter == 0) {
> > > +                               break;
> > > +                       }
> > > +               }
> > > +               if (lock_try_counter == 0) {
> > > +                       return false;
> > > +               }
> > > +               /* we have the lock if we are here */
> > > +               if (!flash->busy) {
> > > +                       flash->busy = true;
> > > +                       rc = true;
> > > +               }
> > This doesn't make much sense to me either. Why is replicating all the
> > previous logic necessary?
> > 
> > >         }
> > >         unlock(&flash_lock);
> > I'm not sure that we even need to use try_lock() with patch 03/13
> > applied. With that patch the flash's lock should only be held briefly
> > so we should be able to use the normal (blocking) lock() since we
> > don't need to deal with waiting for a 64MB flash read to complete.

^^^^^^^^^^^^^^^^^^

This is what I really wanted a response to.

All the games above with try_lock() and checking the busy flag outside
the lock only make sense to me if we need to avoid being stuck
waiting for a lock in skiboot. The changes in 3/13 result in the lock
only being held briefly since the only thing that needs to be done
under the lock is checking the busy flag. Why not do this instead:

diff --git a/core/flash.c b/core/flash.c
index d97d88edbdd5..8fcc9d46ca59 100644
--- a/core/flash.c
+++ b/core/flash.c
@@ -64,8 +64,7 @@ static bool flash_reserve(struct flash *flash)
 {
 	bool rc = false;
 
-	if (!try_lock(&flash_lock))
-		return false;
+	lock(&flash_lock);
 
 	if (!flash->busy) {
 		flash->busy = true;

That change probably should have been in Cyril's patch. Oh well, people
write bugs sometimes.


> > > @@ -279,12 +340,31 @@ static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unus
> > >                 assert(0);
> > >         }
> > > 
> > > -       if (rc)
> > > -               rc = OPAL_HARDWARE;
> > > +       if (rc == 0) {
> > > +               /*
> > > +                * if we are good to proceed forward
> > > +                * otherwise we may have to try again
> > > +                */
> > > +               flash->async.pos += len;
> > > +               flash->async.buf += len;
> > > +               flash->async.len -= len;
> > > +               /* if we good clear */
> > > +               flash->async.retry_counter = 0;
> > > +               /*
> > > +                * clear the IN_PROGRESS flags
> > > +                * we only need IN_PROGRESS active on missed_cc
> > > +                */
> > > +               flash->bl->flags &= IN_PROGRESS_MASK;
> > > +               /* reset time for scheduling gap */
> > > +               flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
>
> async flash layer is the one that is managing the work,
> block layer is a worker which does not know how the
> work is being managed.
> 
> > The default delay should come from the blocklevel rather than being
> > assumed by the user of libflash. The bl has a better idea about what
> > sort of interval is required between ops and what a reasonable timeout
> > is.

What do you mean by "managing"? The general pattern for async IO in
skiboot is:

upper layer:
	1. Sets up a job
	2. Submit job it to the worker
	3. Periodically polls the worker until completed.

worker:
	1. Do all the work you can until a wait is required.
	2. Return a wait time to the upper layer.

All the details of how the job is handled are supposed to be owned by
the worker. The upper layer does not, and should not, know any of the
details of how the job is actually handled. All the upper layer really
wants is a completion code from the worker. This does mean you need to
keep a bit more of the transaction state in the blocklayer, but I think
that's fine.
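
A minimal sketch of that split, with invented names:

#include <stdio.h>

struct job {
	int remaining;			/* worker-owned state */
};

/* Worker: does what it can now; returns 0 when complete, or 1 plus a
 * suggested wait when the upper layer should poll again. */
static int worker_poll(struct job *j, int *wait_ms)
{
	if (j->remaining == 0)
		return 0;
	j->remaining--;			/* one chunk's worth of work */
	*wait_ms = 10;			/* the worker knows the device timing */
	return 1;
}

int main(void)
{
	struct job j = { .remaining = 3 };
	int wait_ms;

	/* Upper layer: submit, then poll; it never sees the details. */
	while (worker_poll(&j, &wait_ms))
		printf("not done, re-poll in %d ms\n", wait_ms);
	printf("done\n");
	return 0;
}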

> Problem cases are when the async has split the work up into
> chunks.  If a chunk desired to be read straddles the window
> (meaning that only a subset of the chunk is available in the
> current window) we have to bump down the async chunk
> desired to be read so that it will succeed.  Then we can
>> acquire a new window from which we can then do subsequent reads
> (again at the async flash layer).
> 
> > I can't parse this, but it seems like it's all blocklevel specific
> > stuff that should be documented there. From the perspective of a
> > libflash user we don't want to know or care about *why* we need to do
> > a async delay, just that we have to.

As mentioned above, I don't see why this can't be handled in the
blocklayer. Splitting a single read because it crosses a window
boundary is an implementation detail of the blocklayer driver and
should be handled there.
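
For example, clamping a request to the current window is something the
driver can do internally (the window geometry below is made up):

#include <stdint.h>
#include <stdio.h>

struct window {
	uint32_t cur_pos;		/* flash offset the window maps */
	uint32_t size;
};

/* How much of [pos, pos+len) the current window can satisfy; zero
 * means a window move is needed before any data can be copied. */
static uint32_t chunk_in_window(const struct window *w, uint32_t pos,
				uint32_t len)
{
	uint32_t end = w->cur_pos + w->size;

	if (pos < w->cur_pos || pos >= end)
		return 0;
	return (len < end - pos) ? len : end - pos;
}

int main(void)
{
	struct window w = { .cur_pos = 0x1000, .size = 0x1000 };

	/* An 8KB read at 0x1800 straddles the window: 0x800 bytes now,
	 * the rest after the driver moves the window. */
	printf("usable: 0x%x\n", (unsigned)chunk_in_window(&w, 0x1800, 0x2000));
	return 0;
}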

> > > @@ -487,7 +596,7 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> > >                 uint64_t buf, uint64_t size, uint64_t token)
> > >  {
> > >         struct flash *flash = NULL;
> > > -       uint64_t len;
> > > +       uint64_t len = 0;
> Yes
> > Is that needed?

Doesn't appear to be. Adding a bit more context to the hunk below:

        len = MIN(size, flash->block_size*10);
        printf("Flash op %d len %llu\n", op, len);
        flash->async.op = op;
        flash->async.token = token;
        flash->async.buf = buf + len; 
        flash->async.len = size - len; 

So it's already explicitly initialised.


> > > @@ -516,6 +625,10 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> > >         flash->async.buf = buf + len;
> > >         flash->async.len = size - len;
> > >         flash->async.pos = offset + len;
> > > +       flash->async.retry_counter = 0;
> > > +       flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
> > > +       flash->bl->flags |= ASYNC_REQUIRED;
> > > +       ++flash->async.transaction_id;
> This is needed.
> > Use post-increment unless you absolutely have to use pre-increment.
> > There's only a few instances where pre-increment is used in skiboot
> > and I'd like to keep it that way.

Why? Is a transaction ID of zero significant in some way? If so, is
that documented?
Deb McLemore Jan. 2, 2020, 4:38 p.m. UTC | #7
On 12/16/19 7:53 PM, Oliver O'Halloran wrote:
> On Fri, 2019-12-13 at 06:32 -0600, Deb McLemore wrote:
>> Comments below:
>>
>> On 11/24/19 9:12 PM, Oliver O'Halloran wrote:
>>> On Thu, Nov 7, 2019 at 4:25 AM Deb McLemore <debmc@linux.ibm.com> wrote:
>>>> *snip*
>> From libflash/ipmi-hiomap.h
>>
>> /*
>>  * we basically check for a quick response
>>  * otherwise we catch the updated window in the next cycle
>>  */
>> #define IPMI_HIOMAP_TICKS 5
>>
>> Unless the window is immediately ready (so we have to stall
>> doing some logic which is not optimized out), this allows a short
>> period of time to catch a quick response; otherwise we always kick
>> back out to Linux, and that delay gives the window time to move.
>>
>>> wait_time is always going to be zero since the two mftb() calls are
>>> going to return pretty similar values and tb_to_msecs() divides the
>>> difference by 512000 (tb_hz / 1000).
>>>
>>>> +       timeout_counter = 0;
>>>> +
>>>> +
>>>> +       while (wait_time < flash_reserve_ticks) {
>>>> +               ++timeout_counter;
>>>> +               if (timeout_counter % 4 == 0) {
>>>> +                       now = mftb();
>>>> +                       wait_time = tb_to_msecs(now - start_time);
>>>> +               }
>>>> +               if (flash->busy == false) {
>>>> +                       break;
>>>> +               }
>>>> +       }
> Please follow convention and put your responses below the thing being
> responded to rather than above. Mixing responses above and below just
> makes the thread a pain to follow.
>
>> Again, due to compiler optimization we want to stall and actually
>> check the value "x" number of times to allow a miss or two
>> on the lock to happen; other logic was optimized away.
>>
>>> This doesn't make any sense to me. What are you doing here and why?
>>>
>>>> -       if (!try_lock(&flash_lock))
>>>> +       while (!try_lock(&flash_lock)) {
>>>> +               --lock_try_counter;
>>>> +               if (lock_try_counter == 0) {
>>>> +                       break;
>>>> +               }
>>>> +       }
>>> If you're going to do this sort of thing then use a for loop. That
>>> way the iteration count is set in the same place it's being used.
>>>
>>>> +       if (lock_try_counter == 0) {
>>>>                 return false;
>>>> +       }
>>> you should return from inside the loop instead of breaking out and
>>> re-checking the break condition
>>>
>>>> +
>>>> +       /* we have the lock if we got here */
>>>>
>>>>         if (!flash->busy) {
>>>>                 flash->busy = true;
>>>>                 rc = true;
>>>> +       } else {
>>>> +               /* probably beat a flash_release and grabbed the lock */
>>>> +               unlock(&flash_lock);
>>>> +               while (!try_lock(&flash_lock)) {
>>>> +                       --lock_try_counter;
>>>> +                       if (lock_try_counter == 0) {
>>>> +                               break;
>>>> +                       }
>>>> +               }
>>>> +               if (lock_try_counter == 0) {
>>>> +                       return false;
>>>> +               }
>>>> +               /* we have the lock if we are here */
>>>> +               if (!flash->busy) {
>>>> +                       flash->busy = true;
>>>> +                       rc = true;
>>>> +               }
>>> This doesn't make much sense to me either. Why is replicating all the
>>> previous logic necessary?
>>>
>>>>         }
>>>>         unlock(&flash_lock);
>>> I'm not sure that we even need to use try_lock() with patch 03/13
>>> applied. With that patch the flash's lock should only be held briefly
>>> so we should be able to use the normal (blocking) lock() since we
>>> don't need to deal with waiting for a 64MB flash read to complete.
> ^^^^^^^^^^^^^^^^^^
>
> This is what I really wanted a response to.
>
> All the games above with try_lock() and checking the busy flag outside
> the lock only make sense to me if we need to avoid being stuck
> waiting for a lock in skiboot. The changes in 3/13 result in the lock
> only being held briefly since the only thing that needs to be done
> under the lock is checking the busy flag. Why not do this instead:
>
> diff --git a/core/flash.c b/core/flash.c
> index d97d88edbdd5..8fcc9d46ca59 100644
> --- a/core/flash.c
> +++ b/core/flash.c
> @@ -64,8 +64,7 @@ static bool flash_reserve(struct flash *flash)
>  {
>  	bool rc = false;
>  
> -	if (!try_lock(&flash_lock))
> -		return false;
> +	lock(&flash_lock);
>  
>  	if (!flash->busy) {
>  		flash->busy = true;
>
> That change probably should have been in Cyril's patch. Oh well, people
> write bugs sometimes.

The logic is there to add resiliency.  Securing the flash_lock does not
mean flash->busy will always be clear, so we check the state of the data
and allow a few iterations to try to persist with the current operation.

>
>>>> @@ -279,12 +340,31 @@ static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unus
>>>>                 assert(0);
>>>>         }
>>>>
>>>> -       if (rc)
>>>> -               rc = OPAL_HARDWARE;
>>>> +       if (rc == 0) {
>>>> +               /*
>>>> +                * if we are good to proceed forward
>>>> +                * otherwise we may have to try again
>>>> +                */
>>>> +               flash->async.pos += len;
>>>> +               flash->async.buf += len;
>>>> +               flash->async.len -= len;
>>>> +               /* if we good clear */
>>>> +               flash->async.retry_counter = 0;
>>>> +               /*
>>>> +                * clear the IN_PROGRESS flags
>>>> +                * we only need IN_PROGRESS active on missed_cc
>>>> +                */
>>>> +               flash->bl->flags &= IN_PROGRESS_MASK;
>>>> +               /* reset time for scheduling gap */
>>>> +               flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
>> async flash layer is the one that is managing the work,
>> block layer is a worker which does not know how the
>> work is being managed.
>>
>>> The default delay should come from the blocklevel rather than being
>>> assumed by the user of libflash. The bl has a better idea about what
>>> sort of interval is required between ops and what a reasonable timeout
>>> is.
> What do you mean by "managing"? The general pattern for async IO in
> skiboot is:
>
> upper layer:
> 	1. Sets up a job
> 	2. Submit job it to the worker
> 	3. Periodically polls the worker until completed.
>
> worker:
> 	1. Do all the work you can until a wait is required.
> 	2. Return a wait time to the upper layer.
>
> All the details of how the job is handled are supposed to be owned by
> the worker. The upper layer does not, and should not, know any of the
> details of how the job is actually handled. All the upper layer really
> wants is a completion code from the worker. This does mean you need to
> keep a bit more of the transaction state in the blocklayer, but I think
> that's fine.
>
>> Problem cases are when the async has split the work up into
>> chunks.  If a chunk desired to be read straddles the window
>> (meaning that only a subset of the chunk is available in the
>> current window) we have to bump down the async chunk
>> desired to be read so that it will succeed.  Then we can
>> acquire a new window from which we can then do subsequent reads
>> (again at the async flash layer).
>>
>>> I can't parse this, but it seems like it's all blocklevel specific
>>> stuff that should be documented there. From the perspective of a
>>> libflash user we don't want to know or care about *why* we need to do
>>> a async delay, just that we have to.
> As mentioned above, I don't see why this can't be handled in the
> blocklayer. Splitting a single read because it crosses a window
> boundary is an implementation detail of the blocklayer driver and
> should be handled there.

The async design for this feature provides an alternative to the typical
skiboot polling, since during the window movement we cannot allow those
polling operations (pushing the transaction state down to the blocklayer
was not the design point of this feature; managing and communicating the
success and failure of each individual worker request seemed a bit much).

>
>>>> @@ -487,7 +596,7 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>>>>                 uint64_t buf, uint64_t size, uint64_t token)
>>>>  {
>>>>         struct flash *flash = NULL;
>>>> -       uint64_t len;
>>>> +       uint64_t len = 0;
>> Yes
>>> Is that needed?
> Doesn't appear to be. Adding a bit more context to the hunk below:
>
>         len = MIN(size, flash->block_size*10);
>         printf("Flash op %d len %llu\n", op, len);
>         flash->async.op = op;
>         flash->async.token = token;
>         flash->async.buf = buf + len; 
>         flash->async.len = size - len; 
>
> So it's already explicitly initialised.
        [CC]  core/flash.o
In file included from core/flash.c:11:0:
core/flash.c: In function ‘opal_flash_op’:
include/skiboot.h:81:31: error: ‘len’ may be used uninitialized in this
function [-Werror=maybe-uninitialized]
 #define prlog(l, f, ...) do { _prlog(l, pr_fmt(f), ##__VA_ARGS__); }
while(0)
                               ^~~~~~
core/flash.c:612:11: note: ‘len’ was declared here
  uint64_t len;
           ^~~
        [CC]  core/sensor.o
cc1: all warnings being treated as errors
make[2]: *** [core/flash.o] Error 1

>
>
>>>> @@ -516,6 +625,10 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>>>>         flash->async.buf = buf + len;
>>>>         flash->async.len = size - len;
>>>>         flash->async.pos = offset + len;
>>>> +       flash->async.retry_counter = 0;
>>>> +       flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
>>>> +       flash->bl->flags |= ASYNC_REQUIRED;
>>>> +       ++flash->async.transaction_id;
>> This is needed.
>>> Use post-increment unless you absolutely have to use pre-increment.
>>> There's only a few instances where pre-increment is used in skiboot
>>> and I'd like to keep it that way.
> Why? Is a transaction ID of zero significant in some way? If so, is
> that documented?

The entry of opal_flash_op is the transition point at which we know a new
transaction is being requested, so we bump the transaction_id on entry
(subsequent continuation paths enter through flash_poll).

>
>
>
Oliver O'Halloran Jan. 7, 2020, 3:43 a.m. UTC | #8
On Fri, Jan 3, 2020 at 3:38 AM Deb McLemore <debmc@linux.ibm.com> wrote:
>
> *snip*
>
> >>>>         unlock(&flash_lock);
> >>> I'm not sure that we even need to use try_lock() with patch 03/13
> >>> applied. With that patch the flash's lock should only be held briefly
> >>> so we should be able to use the normal (blocking) lock() since we
> >>> don't need to deal with waiting for a 64MB flash read to complete.
> > ^^^^^^^^^^^^^^^^^^
> >
> > This is what I really wanted a response to.
> >
> > All the games above with try_lock() and checking the busy flag outside
> > the lock only make sense to me if we're need to avoid being stuck
> > waiting for a lock in skiboot. The changes in 3/13 result in the lock
> > only being held briefly since the only thing that needs to be done
> > under the lock is checking the busy flag. Why not do this instead:
> >
> > diff --git a/core/flash.c b/core/flash.c
> > index d97d88edbdd5..8fcc9d46ca59 100644
> > --- a/core/flash.c
> > +++ b/core/flash.c
> > @@ -64,8 +64,7 @@ static bool flash_reserve(struct flash *flash)
> >  {
> >       bool rc = false;
> >
> > -     if (!try_lock(&flash_lock))
> > -             return false;
> > +     lock(&flash_lock);
> >
> >       if (!flash->busy) {
> >               flash->busy = true;
> >
> > That change probably should have been in Cyril's patch. Oh well, people
> > write bugs sometimes.
>
> The logic added is to add resiliency.

Resiliency against what? Performing multiple flash operations in
parallel is something that we've never really supported, and the
caller needs to handle the OPAL_BUSY case anyway. If the operation
can't be done now then we should return BUSY to the caller and let it
re-try the operation. What does this hack help with? What breaks if we
don't do this?

> If the flash_lock is secured, this
> does not mean the flash->busy will always be available so we check the
> status of the data and

I know.

> allow some iterations to maybe persist in the current operation.

Again, why is this worth doing?

> >>>> @@ -279,12 +340,31 @@ static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unus
> >>>>                 assert(0);
> >>>>         }
> >>>>
> >>>> -       if (rc)
> >>>> -               rc = OPAL_HARDWARE;
> >>>> +       if (rc == 0) {
> >>>> +               /*
> >>>> +                * if we are good to proceed forward
> >>>> +                * otherwise we may have to try again
> >>>> +                */
> >>>> +               flash->async.pos += len;
> >>>> +               flash->async.buf += len;
> >>>> +               flash->async.len -= len;
> >>>> +               /* if we good clear */
> >>>> +               flash->async.retry_counter = 0;
> >>>> +               /*
> >>>> +                * clear the IN_PROGRESS flags
> >>>> +                * we only need IN_PROGRESS active on missed_cc
> >>>> +                */
> >>>> +               flash->bl->flags &= IN_PROGRESS_MASK;
> >>>> +               /* reset time for scheduling gap */
> >>>> +               flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
> >> async flash layer is the one that is managing the work,
> >> block layer is a worker which does not know how the
> >> work is being managed.
> >>
> >>> The default delay should come from the blocklevel rather than being
> >>> assumed by the user of libflash. The bl has a better idea about what
> >>> sort of interval is required between ops and what a reasonable timeout
> >>> is.
> > What do you mean by "managing"? The general pattern for async IO in
> > skiboot is:
> >
> > upper layer:
> >       1. Sets up a job
> >       2. Submit job it to the worker
> >       3. Periodically polls the worker until completed.
> >
> > worker:
> >       1. Do all the work you can until a wait is required.
> >       2. Return a wait time to the upper layer.
> >
> > All the details of how the job is handled are supposed to be owned by
> > the worker. The upper layer does not, and should not, know any of the
> > details of how the job is actually handled. All the upper layer really
> > wants is a completion code from the worker. This does mean you need to
> > keep a bit more of the transaction state in the blocklayer, but I think
> > that's fine.
> >
> >> Problem cases are when the async has split the work up into
> >> chunks.  If a chunk desired to be read straddles the window
> >> (meaning that only a subset of the chunk is available in the
> >> current window) we have to bump down the async chunk
> >> desired to be read so that it will succeed.  Then we can
> >> acquire a new window which we can then do subsequent reads
> >> (again at the async flash layer).
> >>
> >>> I can't parse this, but it seems like it's all blocklevel specific
> >>> stuff that should be documented there. From the perspective of a
> >>> libflash user we don't want to know or care about *why* we need to do
> >>> a async delay, just that we have to.
> > As mentioned above, I don't see why this can't be handled in the
> > blocklayer. Splitting a single read because it crosses a window
> > boundary is an implementation detail of the blocklayer driver and
> > should be handled there.
>
> The async design for this feature provides an alternative to the
> typical skiboot polling; during the window movement we cannot
> allow those operations

Why not?

> (pushing the transaction state to the blocklayer was not the
> design point of this feature; managing and communicating success
> and failure of each individual worker request seemed a bit much).

What is "the design point?" and how (and why) did you decide that was the point?

My main gripe with this series (aside from the 1300 line patch...) is
that there's a lot of strange code which seems like it's there to hack
around issues caused by the weird split between the blocklayer and the
"transaction state". So, why did that design get chosen? What were the
alternatives and why is this better? I'm sure there's real problems
you're trying to address, but I'm having a hard time working out what
they are...

> >>>> @@ -487,7 +596,7 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> >>>>                 uint64_t buf, uint64_t size, uint64_t token)
> >>>>  {
> >>>>         struct flash *flash = NULL;
> >>>> -       uint64_t len;
> >>>> +       uint64_t len = 0;
> >> Yes
> >>> Is that needed?
> > Doesn't appear to be. Adding a bit more context to the hunk below:
> >
> >         len = MIN(size, flash->block_size*10);
> >         printf("Flash op %d len %llu\n", op, len);
> >         flash->async.op = op;
> >         flash->async.token = token;
> >         flash->async.buf = buf + len;
> >         flash->async.len = size - len;
> >
> > So it's already explicitly initialised.
>         [CC]  core/flash.o
> In file included from core/flash.c:11:0:
> core/flash.c: In function ‘opal_flash_op’:
> include/skiboot.h:81:31: error: ‘len’ may be used uninitialized in this
> function [-Werror=maybe-uninitialized]
>  #define prlog(l, f, ...) do { _prlog(l, pr_fmt(f), ##__VA_ARGS__); }
> while(0)
>                                ^~~~~~
> core/flash.c:612:11: note: ‘len’ was declared here
>   uint64_t len;
>            ^~~
>         [CC]  core/sensor.o
> cc1: all warnings being treated as errors
> make[2]: *** [core/flash.o] Error 1

Right, that's fine then. That error message is pretty terrible though
since it doesn't even mention the line that causes it (I think it's
the prlog()s after the out: label). Might be worth filing a GCC bug
about it.
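
A minimal sketch of the pattern that trips it (macro and names
invented); with gcc -O2 -Werror=maybe-uninitialized the diagnostic gets
attributed to the logging macro rather than the use after the label:

#include <stdio.h>

#define prlog(f, ...) do { printf(f, ##__VA_ARGS__); } while (0)

int op(int fail_early)
{
	unsigned long len;

	if (fail_early)
		goto out;		/* len never assigned on this path */

	len = 42;
	/* ... work ... */
out:
	prlog("len=%lu\n", len);	/* 'len' may be used uninitialized */
	return 0;
}

int main(int argc, char **argv)
{
	(void)argv;
	return op(argc > 1);
}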

> >>>> @@ -516,6 +625,10 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> >>>>         flash->async.buf = buf + len;
> >>>>         flash->async.len = size - len;
> >>>>         flash->async.pos = offset + len;
> >>>> +       flash->async.retry_counter = 0;
> >>>> +       flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
> >>>> +       flash->bl->flags |= ASYNC_REQUIRED;
> >>>> +       ++flash->async.transaction_id;
> >> This is needed.
> >>> Use post-increment unless you absolutely have to use pre-increment.
> >>> There's only a few instances where pre-increment is used in skiboot
> >>> and I'd like to keep it that way.
> > Why? Is a transaction ID of zero significant in some way? If so, is
> > that documented?
>
> The entry of opal_flash_op is the transition point of when we know a new
> transaction is being  requested, so we bump the transaction_id on entry
> (subsequent continuation paths enter through flash_poll).

What does this have to do with using a pre-increment rather than a
post-increment? I'm complaining about the coding style, not asking why
the transaction_id exists.
Deb McLemore Jan. 7, 2020, 12:50 p.m. UTC | #9
On 1/6/20 9:43 PM, Oliver O'Halloran wrote:
> On Fri, Jan 3, 2020 at 3:38 AM Deb McLemore <debmc@linux.ibm.com> wrote:
>> *snip*
>>
>>>>>>         unlock(&flash_lock);
>>>>> I'm not sure that we even need to use try_lock() with patch 03/13
>>>>> applied. With that patch the flash's lock should only be held briefly
>>>>> so we should be able to use the normal (blocking) lock() since we
>>>>> don't need to deal with waiting for a 64MB flash read to complete.
>>> ^^^^^^^^^^^^^^^^^^
>>>
>>> This is what I really wanted a response to.
>>>
>>> All the games above with try_lock() and checking the busy flag outside
>>> the lock only make sense to me if we need to avoid being stuck
>>> waiting for a lock in skiboot. The changes in 3/13 result in the lock
>>> only being held briefly since the only thing that needs to be done
>>> under the lock is checking the busy flag. Why not do this instead:
>>>
>>> diff --git a/core/flash.c b/core/flash.c
>>> index d97d88edbdd5..8fcc9d46ca59 100644
>>> --- a/core/flash.c
>>> +++ b/core/flash.c
>>> @@ -64,8 +64,7 @@ static bool flash_reserve(struct flash *flash)
>>>  {
>>>       bool rc = false;
>>>
>>> -     if (!try_lock(&flash_lock))
>>> -             return false;
>>> +     lock(&flash_lock);
>>>
>>>       if (!flash->busy) {
>>>               flash->busy = true;
>>>
>>> That change probably should have been in Cyril's patch. Oh well, people
>>> write bugs sometimes.
>> The logic added is to add resiliency.
> Resiliency against what? Performing multiple flash operations in
> parallel is something that we've never really supported  and the
> caller needs to handle the OPAL_BUSY case anyway. If the operation
> can't be done now then we should return BUSY to the caller and let it
> re-try the operation. What does this hack help with? What breaks if we
> don't do this?
>
>> If the flash_lock is secured, this
>> does not mean the flash->busy will always be available so we check the
>> status of the data and
> I know.
>
>> allow some iterations to maybe persist in the current operation.
> Again, why is this worth doing?

During testing with a single caller there are timing windows where the
lock can be acquired just as another thread has freed the lock and
updated the busy flag.  Without this code there are timing issues where
simple requests fail.
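
A sketch of the window in question (single-threaded stand-in; 'busy'
plays the role of flash->busy and the lock calls are elided):

#include <stdbool.h>
#include <stdio.h>

static bool busy = true;		/* another thread's op, finishing up */
static bool release_pending = true;

static void other_thread_releases(void)
{
	if (release_pending) {
		busy = false;		/* the release lands between our tries */
		release_pending = false;
	}
}

/* Bounded re-check: one immediate 'busy' result doesn't fail the
 * request; the releasing thread gets a chance to finish. */
static bool reserve(int tries)
{
	while (tries--) {
		/* lock(&flash_lock) would be taken here */
		if (!busy) {
			busy = true;
			return true;	/* reserved */
		}
		/* unlock, let the other thread run */
		other_thread_releases();
	}
	return false;			/* caller sees OPAL_BUSY, retries later */
}

int main(void)
{
	printf("%s\n", reserve(2) ? "reserved" : "busy");
	return 0;
}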

>
>>>>>> @@ -279,12 +340,31 @@ static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unus
>>>>>>                 assert(0);
>>>>>>         }
>>>>>>
>>>>>> -       if (rc)
>>>>>> -               rc = OPAL_HARDWARE;
>>>>>> +       if (rc == 0) {
>>>>>> +               /*
>>>>>> +                * if we are good to proceed forward
>>>>>> +                * otherwise we may have to try again
>>>>>> +                */
>>>>>> +               flash->async.pos += len;
>>>>>> +               flash->async.buf += len;
>>>>>> +               flash->async.len -= len;
>>>>>> +               /* if we good clear */
>>>>>> +               flash->async.retry_counter = 0;
>>>>>> +               /*
>>>>>> +                * clear the IN_PROGRESS flags
>>>>>> +                * we only need IN_PROGRESS active on missed_cc
>>>>>> +                */
>>>>>> +               flash->bl->flags &= IN_PROGRESS_MASK;
>>>>>> +               /* reset time for scheduling gap */
>>>>>> +               flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
>>>> async flash layer is the one that is managing the work,
>>>> block layer is a worker which does not know how the
>>>> work is being managed.
>>>>
>>>>> The default delay should come from the blocklevel rather than being
>>>>> assumed by the user of libflash. The bl has a better idea about what
>>>>> sort of interval is required between ops and what a reasonable timeout
>>>>> is.
>>> What do you mean by "managing"? The general pattern for async IO in
>>> skiboot is:
>>>
>>> upper layer:
>>>       1. Sets up a job
>>>       2. Submit job it to the worker
>>>       3. Periodically polls the worker until completed.
>>>
>>> worker:
>>>       1. Do all the work you can until a wait is required.
>>>       2. Return a wait time to the upper layer.
>>>
>>> All the details of how the job is handled are supposed to be owned by
>>> the worker. The upper layer does not, and should not, know any of the
>>> details of how the job is actually handled. All the upper layer really
>>> wants is a completion code from the worker. This does mean you need to
>>> keep a bit more of the transaction state in the blocklayer, but I think
>>> that's fine.
>>>
>>>> Problem cases are when the async has split the work up into
>>>> chunks.  If a chunk desired to be read straddles the window
>>>> (meaning that only a subset of the chunk is available in the
>>>> current window) we have to bump down the async chunk
>>>> desired to be read so that it will succeed.  Then we can
>>>> acquire a new window which we can then do subsequent reads
>>>> (again at the async flash layer).
>>>>
>>>>> I can't parse this, but it seems like it's all blocklevel specific
>>>>> stuff that should be documented there. From the perspective of a
>>>>> libflash user we don't want to know or care about *why* we need to do
>>>>> a async delay, just that we have to.
>>> As mentioned above, I don't see why this can't be handled in the
>>> blocklayer. Splitting a single read because it crosses a window
>>> boundary is an implementation detail of the blocklayer driver and
>>> should be handled there.
>> The async design for this feature provides an alternative to the
>> typical skiboot polling; during the window movement we cannot
>> allow those operations
> Why not?
>
>> (pushing the transaction state to the blocklayer was not the
>> design point of this feature; managing and communicating success
>> and failure of each individual worker request seemed a bit much).
> What is "the design point?" and how (and why) did you decide that was the point?
>
> My main gripe with this series (aside from the 1300 line patch...) is
> that there's a lot of strange code which seems like it's there to hack
> around issues caused by the weird split between the blocklayer and the
> "transaction state". So, why did that design get chosen? What were the
> alternatives and why is this better? I'm sure there's real problems
> you're trying to address, but I'm having a hard time working out what
> they are...

Recursive polling in skiboot is not allowed, so we need a design for
opal_flash_op that allows asynchronous operations (which includes the
synchronous ipmi window move).  The synchronous window move takes a
nondeterministic amount of time, so we need to return to Linux to
prevent stalls.  The success and failure of these move operations has
to be managed, as does the success and failure of each individual block
request, which is why this design was chosen to manage the async work
requirements.

The other suggestion raised for consideration,
posix_fadvise(POSIX_FADV_WILLNEED), was also reviewed.
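
A stripped-down sketch of that flow (invented names and counts; the
real code tracks more state):

#include <stdio.h>

#define MISSED_CC	1		/* "not done yet": requeue it */
#define RETRY_LIMIT	10

static int retries;
static int chunks_left = 3;

static int do_chunk(void)
{
	static int misses = 2;		/* pretend the BMC is slow twice */

	if (misses-- > 0)
		return MISSED_CC;
	chunks_left--;
	return 0;
}

/* flash_poll()-style continuation: runs off a timer, never spins in
 * OPAL, so Linux keeps scheduling between calls. */
static int poll_once(void)
{
	int rc = do_chunk();

	if (rc == MISSED_CC) {
		if (++retries >= RETRY_LIMIT)
			return -1;	/* true failure */
		return 1;		/* reschedule the timer, back to Linux */
	}
	retries = 0;			/* progress: reset the retry budget */
	return chunks_left ? 1 : 0;
}

int main(void)
{
	int rc;

	while ((rc = poll_once()) > 0)
		printf("requeued\n");
	printf("final rc=%d\n", rc);
	return 0;
}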

>
>>>>>> @@ -487,7 +596,7 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>>>>>>                 uint64_t buf, uint64_t size, uint64_t token)
>>>>>>  {
>>>>>>         struct flash *flash = NULL;
>>>>>> -       uint64_t len;
>>>>>> +       uint64_t len = 0;
>>>> Yes
>>>>> Is that needed?
>>> Doesn't appear to be. Adding a bit more context to the hunk below:
>>>
>>>         len = MIN(size, flash->block_size*10);
>>>         printf("Flash op %d len %llu\n", op, len);
>>>         flash->async.op = op;
>>>         flash->async.token = token;
>>>         flash->async.buf = buf + len;
>>>         flash->async.len = size - len;
>>>
>>> So it's already explicitly initialised.
>>         [CC]  core/flash.o
>> In file included from core/flash.c:11:0:
>> core/flash.c: In function ‘opal_flash_op’:
>> include/skiboot.h:81:31: error: ‘len’ may be used uninitialized in this
>> function [-Werror=maybe-uninitialized]
>>  #define prlog(l, f, ...) do { _prlog(l, pr_fmt(f), ##__VA_ARGS__); }
>> while(0)
>>                                ^~~~~~
>> core/flash.c:612:11: note: ‘len’ was declared here
>>   uint64_t len;
>>            ^~~
>>         [CC]  core/sensor.o
>> cc1: all warnings being treated as errors
>> make[2]: *** [core/flash.o] Error 1
> Right, that's fine then. That error message is pretty terrible though
> since it doesn't even mention the line that causes it (I think it's
> the prlog()s after the out: label). Might be worth filing a GCC bug
> about it.
>
>>>>>> @@ -516,6 +625,10 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>>>>>>         flash->async.buf = buf + len;
>>>>>>         flash->async.len = size - len;
>>>>>>         flash->async.pos = offset + len;
>>>>>> +       flash->async.retry_counter = 0;
>>>>>> +       flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
>>>>>> +       flash->bl->flags |= ASYNC_REQUIRED;
>>>>>> +       ++flash->async.transaction_id;
>>>> This is needed.
>>>>> Use post-increment unless you absolutely have to use pre-increment.
>>>>> There's only a few instances where pre-increment is used in skiboot
>>>>> and I'd like to keep it that way.
>>> Why? Is a transaction ID of zero significant in some way? If so, is
>>> that documented?
>> The entry of opal_flash_op is the transition point of when we know a new
>> transaction is being  requested, so we bump the transaction_id on entry
>> (subsequent continuation paths enter through flash_poll).
> What does this have to do with using a pre-increment rather than a
> post-increment? I'm complaining about the coding style, not asking why
> the transaction_id exists.

The pre-increment is used at the point where we know the operation is a
new transaction; this keeps the bump in a single location rather than in
multiple locations on different exit paths.
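
A sketch of that placement (illustrative types; as a standalone
statement the pre- and post-increment forms compile identically):

#include <stdio.h>

struct async_state {
	int transaction_id;
};

/* Entry point: the only place a new transaction is recognised, so
 * the only place the id is bumped. */
static void op_entry(struct async_state *s)
{
	++s->transaction_id;	/* 's->transaction_id++;' behaves the same here */
	printf("start transaction %d\n", s->transaction_id);
}

/* Continuation (flash_poll in the patch): same transaction, so the
 * id is untouched however many times we re-enter. */
static void op_continue(struct async_state *s)
{
	printf("continue transaction %d\n", s->transaction_id);
}

int main(void)
{
	struct async_state s = { 0 };

	op_entry(&s);
	op_continue(&s);
	op_continue(&s);
	return 0;
}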

Patch

diff --git a/core/flash.c b/core/flash.c
index e98c8e0..98614f7 100644
--- a/core/flash.c
+++ b/core/flash.c
@@ -28,6 +28,14 @@ 
 #include <timer.h>
 #include <timebase.h>
 
+/*
+ * need to keep this under the BT queue limit
+ * causes are when ipmi to bmc gets bogged down
+ * delay allows for congestion to clear
+ */
+#define FLASH_RETRY_LIMIT 10
+#define FLASH_SCHEDULE_DELAY 200
+
 enum flash_op {
 	FLASH_OP_READ,
 	FLASH_OP_WRITE,
@@ -41,6 +49,9 @@  struct flash_async_info {
 	uint64_t pos;
 	uint64_t len;
 	uint64_t buf;
+	int retry_counter;
+	int transaction_id;
+	int in_progress_schedule_delay;
 };
 
 struct flash {
@@ -80,13 +91,63 @@  static u32 nvram_offset, nvram_size;
 static bool flash_reserve(struct flash *flash)
 {
 	bool rc = false;
+	int lock_try_counter = 10;
+	uint64_t now;
+	uint64_t start_time;
+	uint64_t wait_time;
+	uint64_t flash_reserve_ticks = 10;
+	uint64_t timeout_counter;
+
+	start_time = mftb();
+	now = mftb();
+	wait_time = tb_to_msecs(now - start_time);
+	timeout_counter = 0;
+
+
+	while (wait_time < flash_reserve_ticks) {
+		++timeout_counter;
+		if (timeout_counter % 4 == 0) {
+			now = mftb();
+			wait_time = tb_to_msecs(now - start_time);
+		}
+		if (flash->busy == false) {
+			break;
+		}
+	}
 
-	if (!try_lock(&flash_lock))
+	while (!try_lock(&flash_lock)) {
+		--lock_try_counter;
+		if (lock_try_counter == 0) {
+			break;
+		}
+	}
+
+	if (lock_try_counter == 0) {
 		return false;
+	}
+
+	/* we have the lock if we got here */
 
 	if (!flash->busy) {
 		flash->busy = true;
 		rc = true;
+	} else {
+		/* probably beat a flash_release and grabbed the lock */
+		unlock(&flash_lock);
+		while (!try_lock(&flash_lock)) {
+			--lock_try_counter;
+			if (lock_try_counter == 0) {
+				break;
+			}
+		}
+		if (lock_try_counter == 0) {
+			return false;
+		}
+		/* we have the lock if we are here */
+		if (!flash->busy) {
+			flash->busy = true;
+			rc = true;
+		}
 	}
 	unlock(&flash_lock);
 
@@ -279,12 +340,31 @@  static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unus
 		assert(0);
 	}
 
-	if (rc)
-		rc = OPAL_HARDWARE;
+	if (rc == 0) {
+		/*
+		 * The chunk succeeded, so advance the transaction;
+		 * otherwise we may have to try again.
+		 */
+		flash->async.pos += len;
+		flash->async.buf += len;
+		flash->async.len -= len;
+		/* on success, clear the retry counter */
+		flash->async.retry_counter = 0;
+		/*
+		 * clear the IN_PROGRESS flag; we only need IN_PROGRESS
+		 * active after a missed completion code (missed_cc)
+		 */
+		flash->bl->flags &= IN_PROGRESS_MASK;
+		/* reset time for scheduling gap */
+		flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
+	}
+
+	/*
+	 * Corner case: if the window move misses and the requested
+	 * window is split (needing downward adjustment) we have a
+	 * problem; if the move_cb timing is good, all is well.
+	 */
 
-	flash->async.pos += len;
-	flash->async.buf += len;
-	flash->async.len -= len;
 	if (!rc && flash->async.len) {
 		/*
 		 * We want to get called pretty much straight away, just have
@@ -293,10 +373,38 @@  static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unus
 		 */
 		schedule_timer(&flash->async.poller, 0);
 		return;
+	} else {
+		if (rc == FLASH_ERR_MISSED_CC) {
+			++flash->async.retry_counter;
+			flash->async.in_progress_schedule_delay += FLASH_SCHEDULE_DELAY;
+			if (flash->async.retry_counter >= FLASH_RETRY_LIMIT) {
+				rc = OPAL_HARDWARE;
+				prlog(PR_TRACE, "flash_poll PROBLEM FLASH_RETRY_LIMIT of %i reached on transaction_id=%i\n",
+					FLASH_RETRY_LIMIT,
+					flash->async.transaction_id);
+			} else {
+				/*
+				 * Give the BT time to work and receive the response;
+				 * throttle back to let the congestion clear. Observed
+				 * cases: no IPMI response until very late, or right
+				 * after an unaligned window read/move (which is slow).
+				 */
+				flash->bl->flags |= IN_PROGRESS;
+				schedule_timer(&flash->async.poller, msecs_to_tb(flash->async.in_progress_schedule_delay));
+				return;
+			}
+		} else {
+			if (rc != 0) {
+				rc = OPAL_HARDWARE;
+			}
+		}
 	}
 
-	opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
+	flash->bl->flags &= IN_PROGRESS_MASK;
+	flash->bl->flags &= ASYNC_REQUIRED_MASK;
+	/* release the flash before we allow next opal entry */
 	flash_release(flash);
+	opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
 }
 
 static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
@@ -454,6 +562,7 @@  int flash_register(struct blocklevel_device *bl)
 	flash->size = size;
 	flash->block_size = block_size;
 	flash->id = num_flashes();
+	flash->async.transaction_id = 0;
 	init_timer(&flash->async.poller, flash_poll, flash);
 
 	rc = ffs_init(0, flash->size, bl, &ffs, 1);
@@ -487,7 +596,7 @@  static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
 		uint64_t buf, uint64_t size, uint64_t token)
 {
 	struct flash *flash = NULL;
-	uint64_t len;
+	uint64_t len = 0;
 	int rc;
 
 	list_for_each(&flashes, flash, list)
@@ -516,6 +625,10 @@  static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
 	flash->async.buf = buf + len;
 	flash->async.len = size - len;
 	flash->async.pos = offset + len;
+	flash->async.retry_counter = 0;
+	flash->async.in_progress_schedule_delay = FLASH_SCHEDULE_DELAY;
+	flash->bl->flags |= ASYNC_REQUIRED;
+	++flash->async.transaction_id;
 
 	/*
 	 * These ops intentionally have no smarts (ecc correction or erase
@@ -539,8 +652,27 @@  static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
 	}
 
 	if (rc) {
-		prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
-		rc = OPAL_HARDWARE;
+		if (rc == FLASH_ERR_MISSED_CC) {
+			++flash->async.retry_counter;
+			flash->async.buf = buf;
+			flash->async.len = size;
+			flash->async.pos = offset;
+			/* opal_flash_op is the first pass, so this only triggers if the retry limit is set to 1 */
+			if (flash->async.retry_counter >= FLASH_RETRY_LIMIT) {
+				rc = OPAL_HARDWARE;
+				prlog(PR_TRACE, "opal_flash_op PROBLEM FLASH_RETRY_LIMIT of %i reached on transaction_id=%i\n",
+					FLASH_RETRY_LIMIT,
+					flash->async.transaction_id);
+				goto out;
+			}
+			flash->bl->flags |= IN_PROGRESS;
+			schedule_timer(&flash->async.poller, msecs_to_tb(FLASH_SCHEDULE_DELAY));
+			/* Don't release the flash */
+			return OPAL_ASYNC_COMPLETION;
+		} else {
+			prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
+			rc = OPAL_HARDWARE;
+		}
 		goto out;
 	}
 
@@ -564,6 +696,8 @@  static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
 	rc = OPAL_ASYNC_COMPLETION;
 out:
 	flash_release(flash);
+	flash->bl->flags &= IN_PROGRESS_MASK;
+	flash->bl->flags &= ASYNC_REQUIRED_MASK;
 	return rc;
 }
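
To summarize the requeue policy that flash_poll() and opal_flash_op()
now share (a condensed sketch of the hunks above, error paths elided,
not additional code):

	/* On a missed completion code, back off and retry rather than
	 * fail, up to FLASH_RETRY_LIMIT attempts. */
	if (rc == FLASH_ERR_MISSED_CC) {
		if (++flash->async.retry_counter >= FLASH_RETRY_LIMIT) {
			rc = OPAL_HARDWARE;	/* give up, report failure */
		} else {
			flash->bl->flags |= IN_PROGRESS;	/* resume, don't restart */
			schedule_timer(&flash->async.poller,
				       msecs_to_tb(flash->async.in_progress_schedule_delay));
			return;	/* the poller re-enters flash_poll() later */
		}
	}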
 
diff --git a/libflash/blocklevel.h b/libflash/blocklevel.h
index 492918e..63d8690 100644
--- a/libflash/blocklevel.h
+++ b/libflash/blocklevel.h
@@ -18,8 +18,13 @@  struct blocklevel_range {
 	int total_prot;
 };
 
+#define ASYNC_REQUIRED_MASK 0xFFFB
+#define IN_PROGRESS_MASK 0xFFF7
+
 enum blocklevel_flags {
 	WRITE_NEED_ERASE = 1,
+	ASYNC_REQUIRED = 4,
+	IN_PROGRESS = 8,
 };
 
 /*
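
A note on the new mask constants: each is the 16-bit complement of the
corresponding flag bit, so ANDing with the mask clears that flag. A
self-contained check (illustrative only):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint16_t flags = 1 | 4 | 8;	/* WRITE_NEED_ERASE | ASYNC_REQUIRED | IN_PROGRESS */

		flags &= 0xFFFB;	/* ASYNC_REQUIRED_MASK == (uint16_t)~4 */
		flags &= 0xFFF7;	/* IN_PROGRESS_MASK == (uint16_t)~8 */

		assert(flags == 1);	/* only WRITE_NEED_ERASE remains */
		return 0;
	}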
diff --git a/libflash/errors.h b/libflash/errors.h
index c800ada..c24166d 100644
--- a/libflash/errors.h
+++ b/libflash/errors.h
@@ -21,6 +21,7 @@ 
 #define FLASH_ERR_BAD_READ		15
 #define FLASH_ERR_DEVICE_GONE	16
 #define FLASH_ERR_AGAIN	17
+#define FLASH_ERR_MISSED_CC	18
 
 #ifdef __SKIBOOT__
 #include <skiboot.h>
diff --git a/libflash/ipmi-hiomap.c b/libflash/ipmi-hiomap.c
index 7327b83..08af005 100644
--- a/libflash/ipmi-hiomap.c
+++ b/libflash/ipmi-hiomap.c
@@ -11,6 +11,10 @@ 
 #include <stdbool.h>
 #include <stdint.h>
 #include <string.h>
+#include <lock.h>
+#include <debug_descriptor.h>
+#include <timebase.h>
+#include <cpu.h>
 
 #include <ccan/container_of/container_of.h>
 
@@ -24,7 +28,7 @@  struct ipmi_hiomap_result {
 	int16_t cc;
 };
 
-#define RESULT_INIT(_name, _ctx) struct ipmi_hiomap_result _name = { _ctx, -1 }
+static struct hiomap_v2_create_window *window_parms;
 
 static inline uint32_t blocks_to_bytes(struct ipmi_hiomap *ctx, uint16_t blocks)
 {
@@ -62,9 +66,20 @@  static int hiomap_protocol_ready(struct ipmi_hiomap *ctx)
 	return 0;
 }
 
-static int hiomap_queue_msg_sync(struct ipmi_hiomap *ctx, struct ipmi_msg *msg)
+static int hiomap_queue_msg(struct ipmi_hiomap *ctx, struct ipmi_msg *msg)
 {
 	int rc;
+	int bl_flags;
+
+	lock(&ctx->lock);
+	bl_flags = ctx->bl.flags;
+	unlock(&ctx->lock);
+
+	/*
+	 * During boot, take care to stay within skiboot for the duration
+	 * (no exit/re-entry), since pollers conflict with synchronous window
+	 * moves. Async usage is intended for opal_flash_op and flash_poll.
+	 */
 
 	/*
 	 * There's an unavoidable TOCTOU race here with the BMC sending an
@@ -73,17 +88,23 @@  static int hiomap_queue_msg_sync(struct ipmi_hiomap *ctx, struct ipmi_msg *msg)
 	 * hiomap_queue_msg_sync() exists to capture the race in a single
 	 * location.
 	 */
-	lock(&ctx->lock);
-	rc = hiomap_protocol_ready(ctx);
-	unlock(&ctx->lock);
-	if (rc) {
-		ipmi_free_msg(msg);
-		return rc;
-	}
 
-	ipmi_queue_msg_sync(msg);
+	if ((opal_booting()) || (!(bl_flags & ASYNC_REQUIRED))) {
+		lock(&ctx->lock);
+		rc = hiomap_protocol_ready(ctx);
+		unlock(&ctx->lock);
+		if (rc) {
+			ipmi_free_msg(msg);
+			return rc;
+		}
+		prlog(PR_TRACE, "%s SENDING SYNC\n", __func__);
+		ipmi_queue_msg_sync(msg);
+	} else {
+		prlog(PR_TRACE, "%s SENDING ASYNC\n", __func__);
+		rc = ipmi_queue_msg(msg);
+	}
 
-	return 0;
+	return rc;
 }
 
 /* Call under ctx->lock */
@@ -100,12 +121,178 @@  static int hiomap_window_valid(struct ipmi_hiomap *ctx, uint64_t pos,
 		return FLASH_ERR_PARM_ERROR;
 	if (pos < ctx->current.cur_pos)
 		return FLASH_ERR_PARM_ERROR;
-	if ((pos + len) > (ctx->current.cur_pos + ctx->current.size))
-		return FLASH_ERR_PARM_ERROR;
+	if ((pos + len) > (ctx->current.cur_pos + ctx->current.size)) {
+		/* the caller will compensate with the proper values */
+		if ((pos + ctx->current.size) <= (ctx->current.cur_pos + ctx->current.size)) {
+			prlog(PR_TRACE, "%s OK pos=%llu "
+				"ctx->current.size=0x%x "
+				"ctx->current.cur_pos=0x%x\n",
+				__func__,
+				pos,
+				ctx->current.size,
+				ctx->current.cur_pos);
+		} else {
+			prlog(PR_TRACE, "%s CHECKING further pos=%llu "
+				"for len=%llu ctx->current.size=0x%x "
+				"ctx->current.cur_pos=0x%x\n",
+				__func__,
+				pos,
+				len,
+				ctx->current.size,
+				ctx->current.cur_pos);
+			if ((pos + ctx->current.adjusted_window_size) <= (ctx->current.cur_pos + ctx->current.size)) {
+				prlog(PR_TRACE, "%s OK use ADJUSTED pos=%llu "
+					"adjusted_len=%i for len=%llu "
+					"ctx->current.size=0x%x "
+					"ctx->current.cur_pos=0x%x\n",
+					__func__,
+					pos,
+					ctx->current.adjusted_window_size,
+					len,
+					ctx->current.size,
+					ctx->current.cur_pos);
+			} else {
+				prlog(PR_TRACE, "%s we need to MOVE the window\n", __func__);
+				return FLASH_ERR_PARM_ERROR;
+			}
+		}
+	}
 
+	prlog(PR_TRACE, "%s ALL GOOD, no move needed\n", __func__);
 	return 0;
 }
 
+static void move_cb(struct ipmi_msg *msg)
+{
+	/*
+	 * move_cb is kept separate from ipmi_hiomap_cmd_cb because,
+	 * depending on the command, the window may need special closing
+	 */
+
+	struct ipmi_hiomap_result *res = msg->user_data;
+	struct ipmi_hiomap *ctx = res->ctx;
+	/*
+	 * only a few iterations to try for the lock;
+	 * contention is probably hiomap_window_move trying to set up again
+	 */
+	int lock_try_counter = 10;
+
+	if ((msg->resp_size != 8) || (msg->cc != IPMI_CC_NO_ERROR) || (msg->data[1] != ctx->inflight_seq)) {
+		prlog(PR_TRACE, "Command %u (4=READ 6=WRITE): move_cb "
+			"Unexpected results to check: response size we "
+			"expect 8 but received %u, ipmi cc=%d "
+			"(should be zero), expected ipmi seq %i but got "
+			"wrong ipmi seq %i\n",
+			msg->data[0],
+			msg->resp_size,
+			msg->cc,
+			ctx->inflight_seq,
+			msg->data[1]);
+		lock(&ctx->lock);
+		ctx->cc = OPAL_HARDWARE;
+		ctx->window_state = closed_window;
+		goto out;
+	} else {
+		prlog(PR_TRACE, "Entered %s for %s window from "
+			"OLD block pos 0x%x for 0x%x bytes at "
+			"lpc_addr 0x%x ipmi seq=%i\n",
+			__func__,
+			(msg->data[0] == HIOMAP_C_CREATE_READ_WINDOW) ? "read" : "write",
+			ctx->current.cur_pos,
+			ctx->current.size,
+			ctx->current.lpc_addr,
+			ctx->inflight_seq);
+	}
+
+	window_parms = (struct hiomap_v2_create_window *)&msg->data[2];
+
+	while (!try_lock(&ctx->lock)) {
+		--lock_try_counter;
+		if (lock_try_counter == 0) {
+			break;
+		}
+	}
+	if (lock_try_counter == 0) {
+		/*
+		 * We cannot get the lock, but update the state anyway:
+		 * we have no way to communicate this completion, so
+		 * someone will need to retry. Contention is usually
+		 * with handle_events or window_move, and this code
+		 * path is the critical one that opens the window.
+		 */
+		ctx->window_state = closed_window;
+		ctx->cc = OPAL_PARAMETER;
+		goto out2;
+	}
+
+	/* If we are here we got the lock; cc is consumed higher up, so it must live in ctx */
+
+	ctx->cc = IPMI_CC_NO_ERROR;
+	ctx->current.lpc_addr =
+		blocks_to_bytes(ctx, le16_to_cpu(window_parms->lpc_addr));
+	ctx->current.size =
+		blocks_to_bytes(ctx, le16_to_cpu(window_parms->size));
+	ctx->current.cur_pos =
+		blocks_to_bytes(ctx, le16_to_cpu(window_parms->offset));
+	/* refresh to current */
+	ctx->current.adjusted_window_size = ctx->current.size;
+
+	/* now that we have moved, store the values */
+	*ctx->active_size = ctx->requested_len;
+
+	/*
+	 * Is the length past the end of the window?
+	 * If this condition occurs it can exhaust the async.retry_counter.
+	 */
+	if ((ctx->requested_pos + ctx->requested_len) > (ctx->current.cur_pos + ctx->current.size)) {
+		/*
+		 * Adjust the size to fit the current window.
+		 * active_size goes back to the caller, but the caller
+		 * may time out, so also store it for future use.
+		 */
+		*ctx->active_size = (ctx->current.cur_pos + ctx->current.size) - ctx->requested_pos;
+		ctx->current.adjusted_window_size = (ctx->current.cur_pos + ctx->current.size) - ctx->requested_pos;
+		prlog(PR_TRACE, "%s VALID MOVE ADJUSTMENT "
+			"*ctx->active_size=%llu "
+			"ctx->requested_pos=%llu "
+			"ctx->current.adjusted_window_size=%i\n",
+			__func__,
+			*ctx->active_size,
+			ctx->requested_pos,
+			ctx->current.adjusted_window_size);
+	}
+
+	if (ctx->requested_len != 0 && *ctx->active_size == 0) {
+		prlog(PR_NOTICE, "%s Invalid window properties: len: %llu, size: %llu\n",
+			__func__, ctx->requested_len, *ctx->active_size);
+		ctx->cc = OPAL_PARAMETER;
+		ctx->window_state = closed_window;
+		goto out;
+	}
+
+	if (msg->data[0] == HIOMAP_C_CREATE_READ_WINDOW)
+		ctx->window_state = read_window;
+	else
+		ctx->window_state = write_window;
+
+	prlog(PR_TRACE, "Opened %s window to NEW block pos 0x%x for 0x%x bytes "
+		"at lpc_addr 0x%x ipmi seq=%i active size=%llu "
+		"adjusted_window_size=%i\n",
+		(msg->data[0] == HIOMAP_C_CREATE_READ_WINDOW) ? "read" : "write",
+		ctx->current.cur_pos,
+		ctx->current.size,
+		ctx->current.lpc_addr,
+		ctx->inflight_seq,
+		*ctx->active_size,
+		ctx->current.adjusted_window_size);
+
+out:	prlog(PR_TRACE, "Exiting the move window callback "
+		"transaction ipmi seq=%i\n",
+		ctx->inflight_seq);
+	unlock(&ctx->lock);
+out2:	ipmi_free_msg(msg);
+}
+
 static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
 {
 	struct ipmi_hiomap_result *res = msg->user_data;
@@ -125,9 +312,9 @@  static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
 		return;
 	}
 
-	if (msg->data[1] != ctx->seq) {
+	if (msg->data[1] != ctx->inflight_seq) {
 		prerror("Unmatched sequence number: wanted %u got %u\n",
-			ctx->seq, msg->data[1]);
+			ctx->inflight_seq, msg->data[1]);
 		res->cc = IPMI_ERR_UNSPECIFIED;
 		ipmi_free_msg(msg);
 		return;
@@ -138,6 +325,7 @@  static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
 	{
 		struct hiomap_v2_info *parms;
 
+		ctx->cc = IPMI_CC_NO_ERROR;
 		if (msg->resp_size != 6) {
 			prerror("%u: Unexpected response size: %u\n", msg->data[0],
 				msg->resp_size);
@@ -162,6 +350,7 @@  static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
 	{
 		struct hiomap_v2_flash_info *parms;
 
+		ctx->cc = IPMI_CC_NO_ERROR;
 		if (msg->resp_size != 6) {
 			prerror("%u: Unexpected response size: %u\n", msg->data[0],
 				msg->resp_size);
@@ -176,36 +365,6 @@  static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
 			blocks_to_bytes(ctx, le16_to_cpu(parms->erase_granule));
 		break;
 	}
-	case HIOMAP_C_CREATE_READ_WINDOW:
-	case HIOMAP_C_CREATE_WRITE_WINDOW:
-	{
-		struct hiomap_v2_create_window *parms;
-
-		if (msg->resp_size != 8) {
-			prerror("%u: Unexpected response size: %u\n", msg->data[0],
-				msg->resp_size);
-			res->cc = IPMI_ERR_UNSPECIFIED;
-			break;
-		}
-
-		parms = (struct hiomap_v2_create_window *)&msg->data[2];
-
-		ctx->current.lpc_addr =
-			blocks_to_bytes(ctx, le16_to_cpu(parms->lpc_addr));
-		ctx->current.size =
-			blocks_to_bytes(ctx, le16_to_cpu(parms->size));
-		ctx->current.cur_pos =
-			blocks_to_bytes(ctx, le16_to_cpu(parms->offset));
-
-		lock(&ctx->lock);
-		if (msg->data[0] == HIOMAP_C_CREATE_READ_WINDOW)
-			ctx->window_state = read_window;
-		else
-			ctx->window_state = write_window;
-		unlock(&ctx->lock);
-
-		break;
-	}
 	case HIOMAP_C_MARK_DIRTY:
 	case HIOMAP_C_FLUSH:
 	case HIOMAP_C_ACK:
@@ -215,7 +374,15 @@  static void ipmi_hiomap_cmd_cb(struct ipmi_msg *msg)
 			prerror("%u: Unexpected response size: %u\n", msg->data[0],
 				msg->resp_size);
 			res->cc = IPMI_ERR_UNSPECIFIED;
+			ctx->cc = OPAL_HARDWARE;
 			break;
+		} else {
+			prlog(PR_TRACE, "%s Command=%u 1=RESET 7=DIRTY 8=FLUSH 9=ACK 10=ERASE ipmi seq=%u ctx->inflight_seq=%u\n",
+				__func__,
+				msg->data[0],
+				msg->data[1],
+				ctx->inflight_seq);
+			ctx->cc = IPMI_CC_NO_ERROR;
 		}
 		break;
 	default:
@@ -237,57 +404,179 @@  static void hiomap_init(struct ipmi_hiomap *ctx)
 	unlock(&ctx->lock);
 }
 
+static int hiomap_wait_for_cc(struct ipmi_hiomap *ctx, int *cc, uint8_t *seq, uint64_t ticks)
+{
+	uint64_t now;
+	uint64_t start_time;
+	uint64_t wait_time;
+	uint64_t ipmi_hiomap_ticks;
+	uint64_t timeout_counter;
+	int rc;
+
+	prlog(PR_TRACE, "Start wait for cc ipmi seq=%i *cc=%i ticks=%llu\n", *seq, *cc, ticks);
+	rc = 0;
+	if (this_cpu()->tb_invalid) {
+		/*
+		 * SYNC paths already have *cc set to success, and ASYNC
+		 * paths will re-queue and retry; we just need to skip
+		 * the timebase handling and close the window so the
+		 * logic tries the move again.
+		 */
+		if (*cc != IPMI_CC_NO_ERROR) {
+			lock(&ctx->lock);
+			ctx->window_state = closed_window;
+			++ctx->missed_cc_count;
+			prlog(PR_NOTICE, "%s tb_invalid, CLOSING WINDOW for cc "
+				"ipmi seq=%i ctx->missed_cc_count=%i\n",
+				__func__, *seq, ctx->missed_cc_count);
+			unlock(&ctx->lock);
+			rc = FLASH_ERR_MISSED_CC;
+		}
+		prlog(PR_NOTICE, "%s tb_invalid, hopefully this will "
+			"retry/recover rc=%i\n",
+			__func__, rc);
+		return rc;
+	}
+	start_time = mftb();
+	now = mftb();
+	wait_time = tb_to_msecs(now - start_time);
+	timeout_counter = 0;
+
+	if (ticks != 0) {
+		ipmi_hiomap_ticks = ticks;
+	} else {
+		ipmi_hiomap_ticks = IPMI_HIOMAP_TICKS;
+	}
+
+	prlog(PR_TRACE, "wait_time=%llu ipmi_hiomap_ticks=%llu ipmi seq=%i "
+			"ctx->missed_cc_count=%i\n",
+		wait_time, ipmi_hiomap_ticks, *seq, ctx->missed_cc_count);
+	while (wait_time < ipmi_hiomap_ticks) {
+		++timeout_counter;
+		if (timeout_counter % IPMI_SKIP_INC == 0) {
+			now = mftb();
+			wait_time = tb_to_msecs(now - start_time);
+		}
+		if (*cc == IPMI_CC_NO_ERROR) {
+			prlog(PR_TRACE, "Break cc ipmi seq=%i "
+				"ctx->missed_cc_count=%i\n",
+				*seq, ctx->missed_cc_count);
+			break;
+		}
+	}
+	prlog(PR_TRACE, "Status CHECK wait_for_cc wait_time=%llu *cc=%i "
+		"ipmi seq=%i ctx->missed_cc_count=%i\n",
+		wait_time, *cc, *seq, ctx->missed_cc_count);
+	if (*cc != IPMI_CC_NO_ERROR) {
+		lock(&ctx->lock);
+		ctx->window_state = closed_window;
+		++ctx->missed_cc_count;
+		prlog(PR_TRACE, "CLOSING WINDOW for cc ipmi seq=%i "
+			"ctx->missed_cc_count=%i\n",
+			*seq, ctx->missed_cc_count);
+		unlock(&ctx->lock);
+		rc = FLASH_ERR_MISSED_CC;
+	}
+
+	prlog(PR_TRACE, "Stop wait for *cc=%i ipmi seq=%i "
+		"ctx->missed_cc_count=%i\n",
+		*cc, *seq, ctx->missed_cc_count);
+	return rc;
+
+}
+
 static int hiomap_get_info(struct ipmi_hiomap *ctx)
 {
-	RESULT_INIT(res, ctx);
+	static struct ipmi_hiomap_result info_res;
 	unsigned char req[3];
 	struct ipmi_msg *msg;
+	uint8_t info_seq;
+	int orig_flags;
+	int tmp_sync_flags;
 	int rc;
 
+	info_res.ctx = ctx;
+	info_res.cc = -1;
+
+	lock(&ctx->lock);
+	orig_flags = ctx->bl.flags;
+	/* clear out async to always do sync */
+	tmp_sync_flags = ctx->bl.flags &= ASYNC_REQUIRED_MASK;
+	ctx->bl.flags = tmp_sync_flags;
+	ctx->cc = -1;
+	info_seq = ++ctx->seq;
+	ctx->inflight_seq = info_seq;
+	unlock(&ctx->lock);
+
 	/* Negotiate protocol version 2 */
 	req[0] = HIOMAP_C_GET_INFO;
-	req[1] = ++ctx->seq;
+	req[1] = info_seq;
 	req[2] = HIOMAP_V2;
 
 	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
 		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
-			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 6);
+			 ipmi_hiomap_cmd_cb, &info_res, req, sizeof(req), 6);
 
-	rc = hiomap_queue_msg_sync(ctx, msg);
+	rc = hiomap_queue_msg(ctx, msg);
+	lock(&ctx->lock);
+	ctx->bl.flags = orig_flags;
+	unlock(&ctx->lock);
 	if (rc)
 		return rc;
 
-	if (res.cc != IPMI_CC_NO_ERROR) {
-		prerror("%s failed: %d\n", __func__, res.cc);
-		return FLASH_ERR_PARM_ERROR; /* XXX: Find something better? */
+	rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_ACK_DEFAULT);
+
+	if (rc) {
+		prlog(PR_TRACE, "%s hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
+			__func__, rc, IPMI_ACK_DEFAULT);
 	}
 
-	return 0;
+	return rc;
 }
 
 static int hiomap_get_flash_info(struct ipmi_hiomap *ctx)
 {
-	RESULT_INIT(res, ctx);
+	static struct ipmi_hiomap_result flash_info_res;
 	unsigned char req[2];
 	struct ipmi_msg *msg;
+	uint8_t flash_info_seq;
+	int orig_flags;
+	int tmp_sync_flags;
 	int rc;
 
+	flash_info_res.ctx = ctx;
+	flash_info_res.cc = -1;
+
+	lock(&ctx->lock);
+	orig_flags = ctx->bl.flags;
+	/* clear out async to always do sync */
+	tmp_sync_flags = ctx->bl.flags &= ASYNC_REQUIRED_MASK;
+	ctx->bl.flags = tmp_sync_flags;
+	ctx->cc = -1;
+	flash_info_seq = ++ctx->seq;
+	ctx->inflight_seq = flash_info_seq;
+	unlock(&ctx->lock);
+
 	req[0] = HIOMAP_C_GET_FLASH_INFO;
-	req[1] = ++ctx->seq;
+	req[1] = flash_info_seq;
 	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
 		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
-			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2 + 2 + 2);
+			 ipmi_hiomap_cmd_cb, &flash_info_res, req, sizeof(req), 2 + 2 + 2);
 
-	rc = hiomap_queue_msg_sync(ctx, msg);
+	rc = hiomap_queue_msg(ctx, msg);
+	lock(&ctx->lock);
+	ctx->bl.flags = orig_flags;
+	unlock(&ctx->lock);
 	if (rc)
 		return rc;
 
-	if (res.cc != IPMI_CC_NO_ERROR) {
-		prerror("%s failed: %d\n", __func__, res.cc);
-		return FLASH_ERR_PARM_ERROR; /* XXX: Find something better? */
+	rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_ACK_DEFAULT);
+	if (rc) {
+		prlog(PR_TRACE, "%s hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
+			__func__, rc, IPMI_ACK_DEFAULT);
 	}
 
-	return 0;
+	return rc;
 }
 
 static int hiomap_window_move(struct ipmi_hiomap *ctx, uint8_t command,
@@ -295,32 +584,65 @@  static int hiomap_window_move(struct ipmi_hiomap *ctx, uint8_t command,
 {
 	enum lpc_window_state want_state;
 	struct hiomap_v2_range *range;
-	RESULT_INIT(res, ctx);
+	static struct ipmi_hiomap_result move_res;
 	unsigned char req[6];
 	struct ipmi_msg *msg;
+	uint8_t move_seq;
 	bool valid_state;
 	bool is_read;
 	int rc;
 
+	move_res.ctx = ctx;
+	move_res.cc = -1;
 	is_read = (command == HIOMAP_C_CREATE_READ_WINDOW);
 	want_state = is_read ? read_window : write_window;
 
+	/* there will be lock contention between hiomap_window_move and move_cb */
+
 	lock(&ctx->lock);
+	ctx->cc = -1;
+
+	if (ctx->bl.flags & IN_PROGRESS) {
+		pos = ctx->tracking_pos;
+		len = ctx->tracking_len;
+	} else {
+		ctx->tracking_pos = pos;
+		ctx->tracking_len = len;
+	}
 
 	valid_state = want_state == ctx->window_state;
 	rc = hiomap_window_valid(ctx, pos, len);
+
 	if (valid_state && !rc) {
+		/* if it's valid, store the proper (possibly adjusted) size */
+		if ((pos + len) > (ctx->current.cur_pos + ctx->current.size)) {
+			/* if we had bumped the adjusted_window_size down in move_cb */
+			if ((ctx->current.adjusted_window_size != ctx->current.size)) {
+				*size = ctx->current.adjusted_window_size;
+			} else {
+				*size = (ctx->current.cur_pos + ctx->current.size) - pos;
+			}
+		} else {
+			*size = len;
+		}
+		ctx->cc = IPMI_CC_NO_ERROR;
 		unlock(&ctx->lock);
-		*size = len;
 		return 0;
 	}
 
-	ctx->window_state = closed_window;
+	ctx->window_state = moving_window;
 
-	unlock(&ctx->lock);
+	ctx->active_size = size;
+	ctx->requested_pos = pos;
+	ctx->requested_len = len;
+
+	move_seq = ++ctx->seq;
+	ctx->inflight_seq = move_seq;
 
 	req[0] = command;
-	req[1] = ++ctx->seq;
+	req[1] = move_seq;
+
+	unlock(&ctx->lock);
 
 	range = (struct hiomap_v2_range *)&req[2];
 	range->offset = cpu_to_le16(bytes_to_blocks(ctx, pos));
@@ -328,38 +650,14 @@  static int hiomap_window_move(struct ipmi_hiomap *ctx, uint8_t command,
 
 	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
 		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
-			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req),
+			 move_cb, &move_res, req, sizeof(req),
 			 2 + 2 + 2 + 2);
 
-	rc = hiomap_queue_msg_sync(ctx, msg);
+	rc = hiomap_queue_msg(ctx, msg);
+
 	if (rc)
 		return rc;
 
-	if (res.cc != IPMI_CC_NO_ERROR) {
-		prlog(PR_INFO, "%s failed: %d\n", __func__, res.cc);
-		return FLASH_ERR_PARM_ERROR; /* XXX: Find something better? */
-	}
-
-	lock(&ctx->lock);
-	*size = len;
-	/* Is length past the end of the window? */
-	if ((pos + len) > (ctx->current.cur_pos + ctx->current.size))
-		/* Adjust size to meet current window */
-		*size = (ctx->current.cur_pos + ctx->current.size) - pos;
-
-	if (len != 0 && *size == 0) {
-		unlock(&ctx->lock);
-		prerror("Invalid window properties: len: %"PRIu64", size: %"PRIu64"\n",
-			len, *size);
-		return FLASH_ERR_PARM_ERROR;
-	}
-
-	prlog(PR_DEBUG, "Opened %s window from 0x%x for %u bytes at 0x%x\n",
-	      (command == HIOMAP_C_CREATE_READ_WINDOW) ? "read" : "write",
-	      ctx->current.cur_pos, ctx->current.size, ctx->current.lpc_addr);
-
-	unlock(&ctx->lock);
-
 	return 0;
 }
 
@@ -368,21 +666,27 @@  static int hiomap_mark_dirty(struct ipmi_hiomap *ctx, uint64_t offset,
 {
 	struct hiomap_v2_range *range;
 	enum lpc_window_state state;
-	RESULT_INIT(res, ctx);
+	static struct ipmi_hiomap_result dirty_res;
 	unsigned char req[6];
 	struct ipmi_msg *msg;
+	uint8_t dirty_seq;
 	uint32_t pos;
 	int rc;
 
+	dirty_res.ctx = ctx;
+	dirty_res.cc = -1;
 	lock(&ctx->lock);
 	state = ctx->window_state;
+	dirty_seq = ++ctx->seq;
+	ctx->inflight_seq = dirty_seq;
+	ctx->cc = -1;
 	unlock(&ctx->lock);
 
 	if (state != write_window)
 		return FLASH_ERR_PARM_ERROR;
 
 	req[0] = HIOMAP_C_MARK_DIRTY;
-	req[1] = ++ctx->seq;
+	req[1] = dirty_seq;
 
 	pos = offset - ctx->current.cur_pos;
 	range = (struct hiomap_v2_range *)&req[2];
@@ -391,19 +695,15 @@  static int hiomap_mark_dirty(struct ipmi_hiomap *ctx, uint64_t offset,
 
 	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
 		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
-			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
+			 ipmi_hiomap_cmd_cb, &dirty_res, req, sizeof(req), 2);
+
+	rc = hiomap_queue_msg(ctx, msg);
 
-	rc = hiomap_queue_msg_sync(ctx, msg);
 	if (rc)
 		return rc;
 
-	if (res.cc != IPMI_CC_NO_ERROR) {
-		prerror("%s failed: %d\n", __func__, res.cc);
-		return FLASH_ERR_PARM_ERROR;
-	}
-
 	prlog(PR_DEBUG, "Marked flash dirty at 0x%" PRIx64 " for %" PRIu64 "\n",
-	      offset, size);
+		offset, size);
 
 	return 0;
 }
@@ -411,34 +711,36 @@  static int hiomap_mark_dirty(struct ipmi_hiomap *ctx, uint64_t offset,
 static int hiomap_flush(struct ipmi_hiomap *ctx)
 {
 	enum lpc_window_state state;
-	RESULT_INIT(res, ctx);
+	static struct ipmi_hiomap_result flush_res;
 	unsigned char req[2];
 	struct ipmi_msg *msg;
+	uint8_t flush_seq;
 	int rc;
 
+	flush_res.ctx = ctx;
+	flush_res.cc = -1;
 	lock(&ctx->lock);
 	state = ctx->window_state;
+	flush_seq = ++ctx->seq;
+	ctx->inflight_seq = flush_seq;
+	ctx->cc = -1;
 	unlock(&ctx->lock);
 
 	if (state != write_window)
 		return FLASH_ERR_PARM_ERROR;
 
 	req[0] = HIOMAP_C_FLUSH;
-	req[1] = ++ctx->seq;
+	req[1] = flush_seq;
 
 	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
 		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
-			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
+			 ipmi_hiomap_cmd_cb, &flush_res, req, sizeof(req), 2);
+
+	rc = hiomap_queue_msg(ctx, msg);
 
-	rc = hiomap_queue_msg_sync(ctx, msg);
 	if (rc)
 		return rc;
 
-	if (res.cc != IPMI_CC_NO_ERROR) {
-		prerror("%s failed: %d\n", __func__, res.cc);
-		return FLASH_ERR_PARM_ERROR;
-	}
-
 	prlog(PR_DEBUG, "Flushed writes\n");
 
 	return 0;
@@ -446,26 +748,47 @@  static int hiomap_flush(struct ipmi_hiomap *ctx)
 
 static int hiomap_ack(struct ipmi_hiomap *ctx, uint8_t ack)
 {
-	RESULT_INIT(res, ctx);
+	static struct ipmi_hiomap_result ack_res;
 	unsigned char req[3];
 	struct ipmi_msg *msg;
+	uint8_t ack_seq;
+	int orig_flags;
+	int tmp_sync_flags;
 	int rc;
 
+	ack_res.ctx = ctx;
+	ack_res.cc = -1;
+
+	lock(&ctx->lock);
+	orig_flags = ctx->bl.flags;
+	/* clear out async to always do sync */
+	tmp_sync_flags = ctx->bl.flags &= ASYNC_REQUIRED_MASK;
+	ctx->bl.flags = tmp_sync_flags;
+	ctx->cc = -1;
+	ack_seq = ++ctx->seq;
+	ctx->inflight_seq = ack_seq;
+	unlock(&ctx->lock);
+
 	req[0] = HIOMAP_C_ACK;
-	req[1] = ++ctx->seq;
+	req[1] = ack_seq;
 	req[2] = ack;
 
 	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
 		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
-			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
+			 ipmi_hiomap_cmd_cb, &ack_res, req, sizeof(req), 2);
 
-	rc = hiomap_queue_msg_sync(ctx, msg);
+	rc = hiomap_queue_msg(ctx, msg);
+	lock(&ctx->lock);
+	ctx->bl.flags = orig_flags;
+	unlock(&ctx->lock);
 	if (rc)
 		return rc;
 
-	if (res.cc != IPMI_CC_NO_ERROR) {
-		prlog(PR_DEBUG, "%s failed: %d\n", __func__, res.cc);
-		return FLASH_ERR_PARM_ERROR;
+	rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_ACK_DEFAULT);
+	if (rc) {
+		prlog(PR_TRACE, "%s hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
+			__func__, rc, IPMI_ACK_DEFAULT);
+		return rc;
 	}
 
 	prlog(PR_DEBUG, "Acked events: 0x%x\n", ack);
@@ -478,21 +801,27 @@  static int hiomap_erase(struct ipmi_hiomap *ctx, uint64_t offset,
 {
 	struct hiomap_v2_range *range;
 	enum lpc_window_state state;
-	RESULT_INIT(res, ctx);
+	static struct ipmi_hiomap_result erase_res;
 	unsigned char req[6];
 	struct ipmi_msg *msg;
+	uint8_t erase_seq;
 	uint32_t pos;
 	int rc;
 
+	erase_res.ctx = ctx;
+	erase_res.cc = -1;
 	lock(&ctx->lock);
 	state = ctx->window_state;
+	erase_seq = ++ctx->seq;
+	ctx->inflight_seq = erase_seq;
+	ctx->cc = -1;
 	unlock(&ctx->lock);
 
 	if (state != write_window)
 		return FLASH_ERR_PARM_ERROR;
 
 	req[0] = HIOMAP_C_ERASE;
-	req[1] = ++ctx->seq;
+	req[1] = erase_seq;
 
 	pos = offset - ctx->current.cur_pos;
 	range = (struct hiomap_v2_range *)&req[2];
@@ -501,16 +830,13 @@  static int hiomap_erase(struct ipmi_hiomap *ctx, uint64_t offset,
 
 	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
 		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
-			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
-	rc = hiomap_queue_msg_sync(ctx, msg);
+			 ipmi_hiomap_cmd_cb, &erase_res, req, sizeof(req), 2);
+
+	rc = hiomap_queue_msg(ctx, msg);
+
 	if (rc)
 		return rc;
 
-	if (res.cc != IPMI_CC_NO_ERROR) {
-		prerror("%s failed: %d\n", __func__, res.cc);
-		return FLASH_ERR_PARM_ERROR;
-	}
-
 	prlog(PR_DEBUG, "Erased flash at 0x%" PRIx64 " for %" PRIu64 "\n",
 	      offset, size);
 
@@ -519,24 +845,53 @@  static int hiomap_erase(struct ipmi_hiomap *ctx, uint64_t offset,
 
 static bool hiomap_reset(struct ipmi_hiomap *ctx)
 {
-	RESULT_INIT(res, ctx);
+	static struct ipmi_hiomap_result reset_res;
 	unsigned char req[2];
 	struct ipmi_msg *msg;
+	uint8_t reset_seq;
+	int orig_flags;
+	int tmp_sync_flags;
+	int rc;
 
-	prlog(PR_NOTICE, "Reset\n");
+	prlog(PR_NOTICE, "%s Reset ENTRY\n", __func__);
+	reset_res.ctx = ctx;
+	reset_res.cc = -1;
+
+	lock(&ctx->lock);
+	orig_flags = ctx->bl.flags;
+	/* clear out async to always do sync */
+	tmp_sync_flags = ctx->bl.flags &= ASYNC_REQUIRED_MASK;
+	ctx->bl.flags = tmp_sync_flags;
+	reset_seq = ++ctx->seq;
+	ctx->cc = -1;
+	ctx->inflight_seq = reset_seq;
+	unlock(&ctx->lock);
 
 	req[0] = HIOMAP_C_RESET;
-	req[1] = ++ctx->seq;
+	req[1] = reset_seq;
 	msg = ipmi_mkmsg(IPMI_DEFAULT_INTERFACE,
 		         bmc_platform->sw->ipmi_oem_hiomap_cmd,
-			 ipmi_hiomap_cmd_cb, &res, req, sizeof(req), 2);
-	ipmi_queue_msg_sync(msg);
+			 ipmi_hiomap_cmd_cb, &reset_res, req, sizeof(req), 2);
+
+	rc = hiomap_queue_msg(ctx, msg);
+	lock(&ctx->lock);
+	ctx->bl.flags = orig_flags;
+	unlock(&ctx->lock);
+
+	if (rc) {
+		prlog(PR_NOTICE, "%s reset queue msg failed: rc=%d\n", __func__, rc);
+		return false;
+	}
 
-	if (res.cc != IPMI_CC_NO_ERROR) {
-		prlog(PR_ERR, "%s failed: %d\n", __func__, res.cc);
+	rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_ACK_DEFAULT);
+
+	if (rc) {
+		prlog(PR_NOTICE, "%s hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
+			__func__, rc, IPMI_ACK_DEFAULT);
 		return false;
 	}
 
+	prlog(PR_NOTICE, "%s Reset EXIT\n", __func__);
 	return true;
 }
 
@@ -666,6 +1021,7 @@  static int ipmi_hiomap_handle_events(struct ipmi_hiomap *ctx)
 	 * Therefore it is enough to mark the window as closed to consider it
 	 * recovered.
 	 */
+
 	if (status & (HIOMAP_E_PROTOCOL_RESET | HIOMAP_E_WINDOW_RESET))
 		ctx->window_state = closed_window;
 
@@ -737,8 +1093,9 @@  static int ipmi_hiomap_read(struct blocklevel_device *bl, uint64_t pos,
 			    void *buf, uint64_t len)
 {
 	struct ipmi_hiomap *ctx;
-	uint64_t size;
-	int rc = 0;
+	enum lpc_window_state state;
+	static uint64_t size;
+	int rc;
 
 	/* LPC is only 32bit */
 	if (pos > UINT_MAX || len > UINT_MAX)
@@ -746,88 +1103,208 @@  static int ipmi_hiomap_read(struct blocklevel_device *bl, uint64_t pos,
 
 	ctx = container_of(bl, struct ipmi_hiomap, bl);
 
+	lock(&ctx->transaction_lock);
+
 	rc = ipmi_hiomap_handle_events(ctx);
 	if (rc)
-		return rc;
+		goto out;
+
+	lock(&ctx->lock);
+	if (ctx->bl.flags & IN_PROGRESS) {
+		buf = ctx->tracking_buf;
+		pos = ctx->tracking_pos;
+		len = ctx->tracking_len;
+	} else {
+		ctx->tracking_buf = buf;
+		ctx->tracking_pos = 0;
+		ctx->tracking_len = 0;
+	}
+	unlock(&ctx->lock);
 
 	prlog(PR_TRACE, "Flash read at %#" PRIx64 " for %#" PRIx64 "\n", pos,
 	      len);
 	while (len > 0) {
-		/* Move window and get a new size to read */
-		rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_READ_WINDOW, pos,
-				        len, &size);
-		if (rc)
-			return rc;
-
-		/* Perform the read for this window */
-		rc = lpc_window_read(ctx, pos, buf, size);
-		if (rc)
-			return rc;
-
-		/* Check we can trust what we read */
 		lock(&ctx->lock);
-		rc = hiomap_window_valid(ctx, pos, size);
+		state = ctx->window_state;
 		unlock(&ctx->lock);
-		if (rc)
-			return rc;
+		if (state != moving_window) {
+			/* Move window and get a new size to read */
+			rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_READ_WINDOW, pos,
+				len, &size);
+			if (rc) {
+				prlog(PR_NOTICE, "%s hiomap_window_move failed: rc=%d\n",
+					__func__, rc);
+				goto out;
+			}
+		} else {
+			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_HIOMAP_TICKS_DEFAULT);
+			if (rc) {
+				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
+					__func__, rc, IPMI_HIOMAP_TICKS_DEFAULT);
+				goto out;
+			}
+		}
 
-		len -= size;
-		pos += size;
-		buf += size;
+		lock(&ctx->lock);
+		state = ctx->window_state;
+		unlock(&ctx->lock);
+		if (state == read_window) {
+			/*
+			 * Don't take the lock in case move_cb is in
+			 * progress; if we get here the state is good,
+			 * this is just double-checking.
+			 */
+			if (ctx->cc != IPMI_CC_NO_ERROR) {
+				prlog(PR_NOTICE, "%s failed: cc=%d\n", __func__, ctx->cc);
+				rc = OPAL_HARDWARE;
+				goto out;
+			}
+			/* Perform the read for this window */
+			rc = lpc_window_read(ctx, pos, buf, size);
+			if (rc) {
+				prlog(PR_NOTICE, "%s lpc_window_read failed: rc=%d\n", __func__, rc);
+				goto out;
+			}
+
+			/* Check we can trust what we read */
+			lock(&ctx->lock);
+			rc = hiomap_window_valid(ctx, pos, size);
+			unlock(&ctx->lock);
+			if (rc) {
+				prlog(PR_NOTICE, "%s hiomap_window_valid failed: rc=%d\n", __func__, rc);
+				goto out;
+			}
+
+			len -= size;
+			pos += size;
+			buf += size;
+			lock(&ctx->lock);
+			ctx->tracking_len = len;
+			ctx->tracking_pos = pos;
+			ctx->tracking_buf = buf;
+			unlock(&ctx->lock);
+		}
 	}
-	return rc;
 
+out:	unlock(&ctx->transaction_lock);
+	return rc;
 }
 
 static int ipmi_hiomap_write(struct blocklevel_device *bl, uint64_t pos,
 			     const void *buf, uint64_t len)
 {
 	struct ipmi_hiomap *ctx;
-	uint64_t size;
-	int rc = 0;
+	enum lpc_window_state state;
+	static uint64_t size;
+	int rc;
 
 	/* LPC is only 32bit */
 	if (pos > UINT_MAX || len > UINT_MAX)
 		return FLASH_ERR_PARM_ERROR;
 
 	ctx = container_of(bl, struct ipmi_hiomap, bl);
+	lock(&ctx->transaction_lock);
 
 	rc = ipmi_hiomap_handle_events(ctx);
 	if (rc)
-		return rc;
+		goto out;
+
+	lock(&ctx->lock);
+	if (ctx->bl.flags & IN_PROGRESS) {
+		buf = ctx->tracking_buf;
+		pos = ctx->tracking_pos;
+		len = ctx->tracking_len;
+	} else {
+		ctx->tracking_buf = (void *) buf;
+		ctx->tracking_pos = 0;
+		ctx->tracking_len = 0;
+	}
+	unlock(&ctx->lock);
 
 	prlog(PR_TRACE, "Flash write at %#" PRIx64 " for %#" PRIx64 "\n", pos,
 	      len);
 	while (len > 0) {
-		/* Move window and get a new size to read */
-		rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_WRITE_WINDOW, pos,
-				        len, &size);
-		if (rc)
-			return rc;
-
-		/* Perform the write for this window */
-		rc = lpc_window_write(ctx, pos, buf, size);
-		if (rc)
-			return rc;
-
-		rc = hiomap_mark_dirty(ctx, pos, size);
-		if (rc)
-			return rc;
+		lock(&ctx->lock);
+		state = ctx->window_state;
+		unlock(&ctx->lock);
+		if (state != moving_window) {
+			/* Move window and get a new size to read */
+			rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_WRITE_WINDOW, pos,
+					        len, &size);
+			if (rc) {
+				prlog(PR_NOTICE, "%s hiomap_window_move failed: rc=%d\n",
+					__func__, rc);
+				goto out;
+			}
+		} else {
+			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
+			if (rc) {
+				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
+					__func__, rc, IPMI_LONG_TICKS);
+				goto out;
+			}
+		}
 
-		/*
-		 * The BMC *should* flush if the window is implicitly closed,
-		 * but do an explicit flush here to be sure.
-		 *
-		 * XXX: Removing this could improve performance
-		 */
-		rc = hiomap_flush(ctx);
-		if (rc)
-			return rc;
+		lock(&ctx->lock);
+		state = ctx->window_state;
+		unlock(&ctx->lock);
 
-		len -= size;
-		pos += size;
-		buf += size;
+		if (state == write_window) {
+			if (ctx->cc != IPMI_CC_NO_ERROR) {
+				prlog(PR_NOTICE, "%s failed: cc=%d\n", __func__, ctx->cc);
+				rc = OPAL_HARDWARE;
+				goto out;
+			}
+
+			/* Perform the write for this window */
+			rc = lpc_window_write(ctx, pos, buf, size);
+			if (rc) {
+				prlog(PR_NOTICE, "%s lpc_window_write failed: rc=%d\n", __func__, rc);
+				goto out;
+			}
+
+			rc = hiomap_mark_dirty(ctx, pos, size);
+			if (rc) {
+				prlog(PR_NOTICE, "%s hiomap_mark_dirty failed: rc=%d\n", __func__, rc);
+				goto out;
+			}
+			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
+			if (rc) {
+				prlog(PR_TRACE, "%s dirty hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
+					__func__, rc, IPMI_LONG_TICKS);
+				goto out;
+			}
+
+			/*
+			 * The BMC *should* flush if the window is implicitly closed,
+			 * but do an explicit flush here to be sure.
+			 *
+			 * XXX: Removing this could improve performance
+			 */
+			rc = hiomap_flush(ctx);
+			if (rc) {
+				prlog(PR_NOTICE, "%s hiomap_flush failed: rc=%d\n", __func__, rc);
+				goto out;
+			}
+			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
+			if (rc) {
+				prlog(PR_TRACE, "%s flush hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
+					__func__, rc, IPMI_LONG_TICKS);
+				goto out;
+			}
+
+			len -= size;
+			pos += size;
+			buf += size;
+			lock(&ctx->lock);
+			ctx->tracking_len = len;
+			ctx->tracking_pos = pos;
+			ctx->tracking_buf = (void *) buf;
+			unlock(&ctx->lock);
+		}
 	}
+
+out:	unlock(&ctx->transaction_lock);
 	return rc;
 }
 
@@ -835,6 +1312,8 @@  static int ipmi_hiomap_erase(struct blocklevel_device *bl, uint64_t pos,
 			     uint64_t len)
 {
 	struct ipmi_hiomap *ctx;
+	enum lpc_window_state state;
+	static uint64_t size;
 	int rc;
 
 	/* LPC is only 32bit */
@@ -842,39 +1321,94 @@  static int ipmi_hiomap_erase(struct blocklevel_device *bl, uint64_t pos,
 		return FLASH_ERR_PARM_ERROR;
 
 	ctx = container_of(bl, struct ipmi_hiomap, bl);
+	lock(&ctx->transaction_lock);
 
 	rc = ipmi_hiomap_handle_events(ctx);
 	if (rc)
-		return rc;
+		goto out;
+
+	lock(&ctx->lock);
+	if (ctx->bl.flags & IN_PROGRESS) {
+		pos = ctx->tracking_pos;
+		len = ctx->tracking_len;
+	} else {
+		ctx->tracking_pos = 0;
+		ctx->tracking_len = 0;
+	}
+	unlock(&ctx->lock);
 
 	prlog(PR_TRACE, "Flash erase at 0x%08x for 0x%08x\n", (u32) pos,
 	      (u32) len);
 	while (len > 0) {
-		uint64_t size;
-
-		/* Move window and get a new size to erase */
-		rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_WRITE_WINDOW, pos,
-				        len, &size);
-		if (rc)
-			return rc;
-
-		rc = hiomap_erase(ctx, pos, size);
-		if (rc)
-			return rc;
-
-		/*
-		 * Flush directly, don't mark that region dirty otherwise it
-		 * isn't clear if a write happened there or not
-		 */
-		rc = hiomap_flush(ctx);
-		if (rc)
-			return rc;
+		lock(&ctx->lock);
+		state = ctx->window_state;
+		unlock(&ctx->lock);
+		if (state != moving_window) {
+			/* Move window and get a new size to erase */
+			rc = hiomap_window_move(ctx, HIOMAP_C_CREATE_WRITE_WINDOW, pos,
+					        len, &size);
+			if (rc) {
+				prlog(PR_NOTICE, "%s hiomap_window_move failed: rc=%d\n",
+					__func__, rc);
+				goto out;
+			}
+		} else {
+			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
+			if (rc) {
+				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
+					__func__, rc, IPMI_LONG_TICKS);
+				goto out;
+			}
+		}
 
-		len -= size;
-		pos += size;
+		lock(&ctx->lock);
+		state = ctx->window_state;
+		unlock(&ctx->lock);
+		if (state == write_window) {
+			if (ctx->cc != IPMI_CC_NO_ERROR) {
+				prlog(PR_NOTICE, "%s failed: cc=%d\n", __func__, ctx->cc);
+				rc = OPAL_HARDWARE;
+				goto out;
+			}
+			rc = hiomap_erase(ctx, pos, size);
+			if (rc) {
+				prlog(PR_NOTICE, "%s hiomap_erase failed: rc=%d\n", __func__, rc);
+				goto out;
+			}
+			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
+			if (rc) {
+				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
+					__func__, rc, IPMI_LONG_TICKS);
+				goto out;
+			}
+
+			/*
+			 * Flush directly, don't mark that region dirty otherwise it
+			 * isn't clear if a write happened there or not
+			 */
+			rc = hiomap_flush(ctx);
+			if (rc) {
+				prlog(PR_NOTICE, "%s hiomap_flush failed: rc=%d\n", __func__, rc);
+				goto out;
+			}
+			rc = hiomap_wait_for_cc(ctx, &ctx->cc, &ctx->inflight_seq, IPMI_LONG_TICKS);
+			if (rc) {
+				prlog(PR_TRACE, "%s move hiomap_wait_for_cc failed: rc=%d ticks=%i\n",
+					__func__, rc, IPMI_LONG_TICKS);
+				goto out;
+			}
+
+			len -= size;
+			pos += size;
+			lock(&ctx->lock);
+			ctx->tracking_len = len;
+			ctx->tracking_pos = pos;
+			unlock(&ctx->lock);
+		}
 	}
 
-	return 0;
+out:	unlock(&ctx->transaction_lock);
+	return rc;
 }
 
 static int ipmi_hiomap_get_flash_info(struct blocklevel_device *bl,
@@ -885,6 +1419,7 @@  static int ipmi_hiomap_get_flash_info(struct blocklevel_device *bl,
 	int rc;
 
 	ctx = container_of(bl, struct ipmi_hiomap, bl);
+	lock(&ctx->transaction_lock);
 
 	rc = ipmi_hiomap_handle_events(ctx);
 	if (rc)
@@ -903,6 +1438,7 @@  static int ipmi_hiomap_get_flash_info(struct blocklevel_device *bl,
 	if (erase_granule)
 		*erase_granule = ctx->erase_granule;
 
+	unlock(&ctx->transaction_lock);
 	return 0;
 }
 
@@ -925,6 +1461,7 @@  int ipmi_hiomap_init(struct blocklevel_device **bl)
 		return FLASH_ERR_MALLOC_FAILED;
 
 	init_lock(&ctx->lock);
+	init_lock(&ctx->transaction_lock);
 
 	ctx->bl.read = &ipmi_hiomap_read;
 	ctx->bl.write = &ipmi_hiomap_write;
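
The central dispatch decision added in this file (synchronous during
boot or when async is not required, asynchronous otherwise) reduces to
the following (condensed from the hiomap_queue_msg() hunk above; error
handling elided):

	if (opal_booting() || !(bl_flags & ASYNC_REQUIRED)) {
		ipmi_queue_msg_sync(msg);	/* block until the BMC responds */
		rc = 0;
	} else {
		rc = ipmi_queue_msg(msg);	/* queue and return; the callback
						 * completes the transaction later */
	}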
diff --git a/libflash/ipmi-hiomap.h b/libflash/ipmi-hiomap.h
index 489d55e..4870fc5 100644
--- a/libflash/ipmi-hiomap.h
+++ b/libflash/ipmi-hiomap.h
@@ -10,12 +10,36 @@ 
 
 #include "blocklevel.h"
 
-enum lpc_window_state { closed_window, read_window, write_window };
+/*
+ * We check for a quick response; otherwise we catch
+ * the updated window in the next cycle.
+ */
+#define IPMI_HIOMAP_TICKS 5
+#define IPMI_HIOMAP_TICKS_DEFAULT 0
+
+/* time to wait for write/erase/dirty ops */
+#define IPMI_LONG_TICKS 500
+
+/*
+ * Default for ack'ing is typically 1-10 wait_times.
+ * Allow a generous upper bound: if we cannot ack, we make
+ * no forward progress after a protocol reset.
+ * Async paths will retry; sync paths always complete with
+ * zero wait_time elapsed, and with ASYNC_REQUIRED masked
+ * out this value is not used.
+ */
+#define IPMI_ACK_DEFAULT 500
+
+/* sample the timebase only every IPMI_SKIP_INC iterations of the wait loop */
+#define IPMI_SKIP_INC 2
+
+enum lpc_window_state { closed_window, read_window, write_window, moving_window };
 
 struct lpc_window {
 	uint32_t lpc_addr; /* Offset into LPC space */
 	uint32_t cur_pos;  /* Current position of the window in the flash */
 	uint32_t size;     /* Size of the window into the flash */
+	uint32_t adjusted_window_size; /* store adjusted window size */
 };
 
 struct ipmi_hiomap {
@@ -35,6 +59,21 @@  struct ipmi_hiomap {
 	 * three variables are protected by lock to avoid conflict.
 	 */
 	struct lock lock;
+	struct lock transaction_lock;
+
+	/* callers transaction info */
+	uint64_t *active_size;
+	uint64_t requested_len;
+	uint64_t requested_pos;
+	uint64_t tracking_len;
+	uint64_t tracking_pos;
+	void *tracking_buf;
+
+	int missed_cc_count;
+	int cc;
+	/* inflight_seq is used to aid debugging */
+	/* alongside other OPAL ipmi msgs        */
+	uint8_t inflight_seq;
 	uint8_t bmc_state;
 	enum lpc_window_state window_state;
 };
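
For context on how the tick constants above are consumed, the wait loop
in hiomap_wait_for_cc() has roughly this shape (a simplified sketch;
mftb() and tb_to_msecs() are skiboot's timebase helpers, and the int
pointer stands in for ctx->cc):

	static int wait_for_cc_sketch(volatile int *cc, uint64_t ticks)
	{
		uint64_t start = mftb();
		uint64_t waited_ms = 0;
		uint64_t iter = 0;

		while (waited_ms < ticks) {
			/* sample the timebase only every IPMI_SKIP_INC iterations */
			if (++iter % IPMI_SKIP_INC == 0)
				waited_ms = tb_to_msecs(mftb() - start);
			if (*cc == IPMI_CC_NO_ERROR)
				return 0;
		}
		return FLASH_ERR_MISSED_CC;	/* caller closes the window and retries */
	}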