diff mbox

core/flash: Make opal_flash_op() actually asynchronous

Message ID 20170630053708.23917-1-cyril.bur@au1.ibm.com
State Superseded
Headers show

Commit Message

Cyril Bur June 30, 2017, 5:37 a.m. UTC
This patch provides a simple (although not particularly efficient)
asynchronous capability to the opal_flash interface. The advantage of
this approach is that it doesn't require any changing of blocklevel or
its backends to provide an asynchronous implementation. This is also the
disadvantage of this implementation as all it actually does is break the
work up in chunks that it can performed quickly, but still
synchronously. Only a backend could provide a more asynchronous
implementation.

This solves a problem we have right now with the opal_flash_erase call
where it can block in Skiboot for around three minutes. This causes a
variety of problems in Linux due to a processor being gone for a long
time.
For example:

[   98.610043] INFO: rcu_sched detected stalls on CPUs/tasks:
[   98.610050]         113-...: (1 GPs behind) idle=96f/140000000000000/0 softirq=527/528 fqs=1044
[   98.610051]         (detected by 112, t=2102 jiffies, g=223, c=222, q=123)
[   98.610060] Task dump for CPU 113:
[   98.610062] pflash          R  running task        0  3335   3333 0x00040004
[   98.610066] Call Trace:
[   98.610070] [c000001fdd847730] [0000000000000001] 0x1 (unreliable)
[   98.610076] [c000001fdd847900] [c000000000013854] __switch_to+0x1e8/0x1f4
[   98.610081] [c000001fdd847960] [c0000000006122c4] __schedule+0x32c/0x874
[   98.610083] [c000001fdd847a30] [c000001fdd847b40] 0xc000001fdd847b40

It is for this reason that breaking the work up in smaller chunks solves
this problem as Skiboot can return the CPU to Linux between chunks to
avoid Linux getting upset.

Reported-By: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: Cyril Bur <cyril.bur@au1.ibm.com>
---
Like this Alistair? Something something you're correct. Lets go with
this one.

 core/flash.c                            | 116 +++++++++++++++++++++++++++-----
 doc/opal-api/opal-flash-110-111-112.rst |   4 ++
 2 files changed, 103 insertions(+), 17 deletions(-)

Comments

Robert Lippert June 30, 2017, 7:21 p.m. UTC | #1
On Thu, Jun 29, 2017 at 10:37 PM, Cyril Bur <cyril.bur@au1.ibm.com> wrote:
>
> This patch provides a simple (although not particularly efficient)
> asynchronous capability to the opal_flash interface. The advantage of
> this approach is that it doesn't require any changing of blocklevel or
> its backends to provide an asynchronous implementation. This is also the
> disadvantage of this implementation as all it actually does is break the
> work up in chunks that it can performed quickly, but still
> synchronously. Only a backend could provide a more asynchronous
> implementation.
>
> This solves a problem we have right now with the opal_flash_erase call
> where it can block in Skiboot for around three minutes. This causes a
> variety of problems in Linux due to a processor being gone for a long
> time.
> For example:
>
> [   98.610043] INFO: rcu_sched detected stalls on CPUs/tasks:
> [   98.610050]         113-...: (1 GPs behind) idle=96f/140000000000000/0 softirq=527/528 fqs=1044
> [   98.610051]         (detected by 112, t=2102 jiffies, g=223, c=222, q=123)
> [   98.610060] Task dump for CPU 113:
> [   98.610062] pflash          R  running task        0  3335   3333 0x00040004
> [   98.610066] Call Trace:
> [   98.610070] [c000001fdd847730] [0000000000000001] 0x1 (unreliable)
> [   98.610076] [c000001fdd847900] [c000000000013854] __switch_to+0x1e8/0x1f4
> [   98.610081] [c000001fdd847960] [c0000000006122c4] __schedule+0x32c/0x874
> [   98.610083] [c000001fdd847a30] [c000001fdd847b40] 0xc000001fdd847b40
>
> It is for this reason that breaking the work up in smaller chunks solves
> this problem as Skiboot can return the CPU to Linux between chunks to
> avoid Linux getting upset.

Any timing measurements on how this affects flash read/write speed
from the host?

>
>
> Reported-By: Samuel Mendoza-Jonas <sam@mendozajonas.com>
> Signed-off-by: Cyril Bur <cyril.bur@au1.ibm.com>
> ---
> Like this Alistair? Something something you're correct. Lets go with
> this one.
>
>  core/flash.c                            | 116 +++++++++++++++++++++++++++-----
>  doc/opal-api/opal-flash-110-111-112.rst |   4 ++
>  2 files changed, 103 insertions(+), 17 deletions(-)
>
> diff --git a/core/flash.c b/core/flash.c
> index 177f7ae1..71cc3c87 100644
> --- a/core/flash.c
> +++ b/core/flash.c
> @@ -28,6 +28,23 @@
>  #include <libstb/stb.h>
>  #include <libstb/container.h>
>  #include <elf.h>
> +#include <timer.h>
> +#include <timebase.h>
> +
> +enum flash_op {
> +       FLASH_OP_READ,
> +       FLASH_OP_WRITE,
> +       FLASH_OP_ERASE,
> +};
> +
> +struct flash_async_info {
> +       enum flash_op op;
> +       struct timer poller;
> +       uint64_t token;
> +       uint64_t pos;
> +       uint64_t len;
> +       uint64_t buf;
> +};
>
>  struct flash {
>         struct list_node        list;
> @@ -36,6 +53,7 @@ struct flash {
>         uint64_t                size;
>         uint32_t                block_size;
>         int                     id;
> +       struct flash_async_info async;
>  };
>
>  static LIST_HEAD(flashes);
> @@ -200,6 +218,53 @@ static int flash_nvram_probe(struct flash *flash, struct ffs_handle *ffs)
>
>  /* core flash support */
>
> +/*
> + * Called with flash lock held, drop it on async completion
> + */
> +static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unused)
> +{
> +       struct flash *flash = data;
> +       uint64_t offset, buf, len;
> +       int rc;
> +
> +       offset = flash->async.pos;
> +       buf = flash->async.buf;
> +       len = MIN(flash->async.len, flash->block_size);
> +
> +       switch (flash->async.op) {
> +       case FLASH_OP_READ:
> +               rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, len);
> +               break;
> +       case FLASH_OP_WRITE:
> +               rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, len);
> +               break;
> +       case FLASH_OP_ERASE:
> +               rc = blocklevel_erase(flash->bl, offset, len);
> +               break;
> +       default:
> +               assert(0);
> +       }
> +
> +       if (rc)
> +               rc = OPAL_HARDWARE;
> +
> +       flash->async.pos += len;
> +       flash->async.buf += len;
> +       flash->async.len -= len;
> +       if (!rc && flash->async.len) {
> +               /*
> +                * We want to get called pretty much straight away, just have
> +                * to be sure that we jump back out to Linux so that if this
> +                * very long we don't cause RCU or the scheduler to freak
> +                */
> +               schedule_timer(&flash->async.poller, msecs_to_tb(1));
> +               return;
> +       }
> +
> +       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
> +       flash_release(flash);
> +}
> +
>  static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
>  {
>         struct dt_node *flash_node;
> @@ -295,6 +360,7 @@ int flash_register(struct blocklevel_device *bl)
>         flash->size = size;
>         flash->block_size = block_size;
>         flash->id = num_flashes();
> +       init_timer(&flash->async.poller, flash_poll, flash);
>
>         list_add(&flashes, &flash->list);
>
> @@ -323,16 +389,11 @@ int flash_register(struct blocklevel_device *bl)
>         return OPAL_SUCCESS;
>  }
>
> -enum flash_op {
> -       FLASH_OP_READ,
> -       FLASH_OP_WRITE,
> -       FLASH_OP_ERASE,
> -};
> -
>  static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>                 uint64_t buf, uint64_t size, uint64_t token)
>  {
>         struct flash *flash = NULL;
> +       uint64_t len;
>         int rc;
>
>         list_for_each(&flashes, flash, list)
> @@ -351,9 +412,16 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>                 prlog(PR_DEBUG, "Requested flash op %d beyond flash size %" PRIu64 "\n",
>                                 op, flash->size);
>                 rc = OPAL_PARAMETER;
> -               goto err;
> +               goto out;
>         }
>
> +       len = MIN(size, flash->block_size);
> +       flash->async.op = op;
> +       flash->async.token = token;
> +       flash->async.buf = buf + len;
> +       flash->async.len = size - len;
> +       flash->async.pos = offset + len;
> +
>         /*
>          * These ops intentionally have no smarts (ecc correction or erase
>          * before write) to them.
> @@ -363,29 +431,43 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>          */
>         switch (op) {
>         case FLASH_OP_READ:
> -               rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, size);
> +               rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, len);
>                 break;
>         case FLASH_OP_WRITE:
> -               rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, size);
> +               rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, len);
>                 break;
>         case FLASH_OP_ERASE:
> -               rc = blocklevel_erase(flash->bl, offset, size);
> +               rc = blocklevel_erase(flash->bl, offset, len);
>                 break;
>         default:
>                 assert(0);
>         }
>
>         if (rc) {
> +               prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
>                 rc = OPAL_HARDWARE;
> -               goto err;
> +               goto out;
>         }

Some code duplication here now with the new flash_poll() function.
Might be worth a refactor.

>
> -       flash_release(flash);
> -
> -       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, token, rc);
> -       return OPAL_ASYNC_COMPLETION;
> -
> -err:
> +       if (size - len) {
> +               /* Work remains */
> +               schedule_timer(&flash->async.poller, msecs_to_tb(1));
> +               /* Don't release the flash */
> +               return OPAL_ASYNC_COMPLETION;
> +       } else {
> +               /*
> +                * As tempting as it might be here to return OPAL_SUCCESS
> +                * here, don't! As of 1/07/2017 the powernv_flash driver in
> +                * Linux will handle OPAL_SUCCESS as an error, the only thing
> +                * that makes it handle things as though they're working is
> +                * receiving OPAL_ASYNC_COMPLETION.
> +                *
> +                * XXX TODO: Revisit this in a few years *sigh*
> +                */
> +               opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
> +       }
> +       rc = OPAL_ASYNC_COMPLETION;
> +out:
>         flash_release(flash);
>         return rc;
>  }
> diff --git a/doc/opal-api/opal-flash-110-111-112.rst b/doc/opal-api/opal-flash-110-111-112.rst
> index 71ba866d..086c4095 100644
> --- a/doc/opal-api/opal-flash-110-111-112.rst
> +++ b/doc/opal-api/opal-flash-110-111-112.rst
> @@ -20,6 +20,10 @@ success, the calls will return ``OPAL_ASYNC_COMPLETION``, and an
>  opal_async_completion message will be sent (with the appropriate token
>  argument) when the operation completes.
>
> +Due to an error in the powernv_flash driver in Linux these three OPAL
> +calls should never return ``OPAL_SUCCESS`` as the driver is likely to
> +treat this return value as an error.
> +
>  All calls share the same return values:
>
>  ``OPAL_ASYNC_COMPLETION``
> --
> 2.13.2
>
Cyril Bur July 3, 2017, 1:16 a.m. UTC | #2
On Fri, 2017-06-30 at 12:21 -0700, Rob Lippert wrote:
> On Thu, Jun 29, 2017 at 10:37 PM, Cyril Bur <cyril.bur@au1.ibm.com> wrote:
> > 
> > This patch provides a simple (although not particularly efficient)
> > asynchronous capability to the opal_flash interface. The advantage of
> > this approach is that it doesn't require any changing of blocklevel or
> > its backends to provide an asynchronous implementation. This is also the
> > disadvantage of this implementation as all it actually does is break the
> > work up in chunks that it can performed quickly, but still
> > synchronously. Only a backend could provide a more asynchronous
> > implementation.
> > 
> > This solves a problem we have right now with the opal_flash_erase call
> > where it can block in Skiboot for around three minutes. This causes a
> > variety of problems in Linux due to a processor being gone for a long
> > time.
> > For example:
> > 
> > [   98.610043] INFO: rcu_sched detected stalls on CPUs/tasks:
> > [   98.610050]         113-...: (1 GPs behind) idle=96f/140000000000000/0 softirq=527/528 fqs=1044
> > [   98.610051]         (detected by 112, t=2102 jiffies, g=223, c=222, q=123)
> > [   98.610060] Task dump for CPU 113:
> > [   98.610062] pflash          R  running task        0  3335   3333 0x00040004
> > [   98.610066] Call Trace:
> > [   98.610070] [c000001fdd847730] [0000000000000001] 0x1 (unreliable)
> > [   98.610076] [c000001fdd847900] [c000000000013854] __switch_to+0x1e8/0x1f4
> > [   98.610081] [c000001fdd847960] [c0000000006122c4] __schedule+0x32c/0x874
> > [   98.610083] [c000001fdd847a30] [c000001fdd847b40] 0xc000001fdd847b40
> > 
> > It is for this reason that breaking the work up in smaller chunks solves
> > this problem as Skiboot can return the CPU to Linux between chunks to
> > avoid Linux getting upset.
> 
> Any timing measurements on how this affects flash read/write speed
> from the host?
> 

Didn't take any measurements, purely interested in removing the RCU
stalls. I suppose in theory it is slower, unfortunately there isn't
much we can do.

The only thing I can think of is actually batching up more than one
erase block per flash_poll(). Unfortunately I don't really have the
cycles for a full performance analysis of how many would be best.

Cyril

> > 
> > 
> > Reported-By: Samuel Mendoza-Jonas <sam@mendozajonas.com>
> > Signed-off-by: Cyril Bur <cyril.bur@au1.ibm.com>
> > ---
> > Like this Alistair? Something something you're correct. Lets go with
> > this one.
> > 
> >  core/flash.c                            | 116 +++++++++++++++++++++++++++-----
> >  doc/opal-api/opal-flash-110-111-112.rst |   4 ++
> >  2 files changed, 103 insertions(+), 17 deletions(-)
> > 
> > diff --git a/core/flash.c b/core/flash.c
> > index 177f7ae1..71cc3c87 100644
> > --- a/core/flash.c
> > +++ b/core/flash.c
> > @@ -28,6 +28,23 @@
> >  #include <libstb/stb.h>
> >  #include <libstb/container.h>
> >  #include <elf.h>
> > +#include <timer.h>
> > +#include <timebase.h>
> > +
> > +enum flash_op {
> > +       FLASH_OP_READ,
> > +       FLASH_OP_WRITE,
> > +       FLASH_OP_ERASE,
> > +};
> > +
> > +struct flash_async_info {
> > +       enum flash_op op;
> > +       struct timer poller;
> > +       uint64_t token;
> > +       uint64_t pos;
> > +       uint64_t len;
> > +       uint64_t buf;
> > +};
> > 
> >  struct flash {
> >         struct list_node        list;
> > @@ -36,6 +53,7 @@ struct flash {
> >         uint64_t                size;
> >         uint32_t                block_size;
> >         int                     id;
> > +       struct flash_async_info async;
> >  };
> > 
> >  static LIST_HEAD(flashes);
> > @@ -200,6 +218,53 @@ static int flash_nvram_probe(struct flash *flash, struct ffs_handle *ffs)
> > 
> >  /* core flash support */
> > 
> > +/*
> > + * Called with flash lock held, drop it on async completion
> > + */
> > +static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unused)
> > +{
> > +       struct flash *flash = data;
> > +       uint64_t offset, buf, len;
> > +       int rc;
> > +
> > +       offset = flash->async.pos;
> > +       buf = flash->async.buf;
> > +       len = MIN(flash->async.len, flash->block_size);
> > +
> > +       switch (flash->async.op) {
> > +       case FLASH_OP_READ:
> > +               rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, len);
> > +               break;
> > +       case FLASH_OP_WRITE:
> > +               rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, len);
> > +               break;
> > +       case FLASH_OP_ERASE:
> > +               rc = blocklevel_erase(flash->bl, offset, len);
> > +               break;
> > +       default:
> > +               assert(0);
> > +       }
> > +
> > +       if (rc)
> > +               rc = OPAL_HARDWARE;
> > +
> > +       flash->async.pos += len;
> > +       flash->async.buf += len;
> > +       flash->async.len -= len;
> > +       if (!rc && flash->async.len) {
> > +               /*
> > +                * We want to get called pretty much straight away, just have
> > +                * to be sure that we jump back out to Linux so that if this
> > +                * very long we don't cause RCU or the scheduler to freak
> > +                */
> > +               schedule_timer(&flash->async.poller, msecs_to_tb(1));
> > +               return;
> > +       }
> > +
> > +       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
> > +       flash_release(flash);
> > +}
> > +
> >  static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
> >  {
> >         struct dt_node *flash_node;
> > @@ -295,6 +360,7 @@ int flash_register(struct blocklevel_device *bl)
> >         flash->size = size;
> >         flash->block_size = block_size;
> >         flash->id = num_flashes();
> > +       init_timer(&flash->async.poller, flash_poll, flash);
> > 
> >         list_add(&flashes, &flash->list);
> > 
> > @@ -323,16 +389,11 @@ int flash_register(struct blocklevel_device *bl)
> >         return OPAL_SUCCESS;
> >  }
> > 
> > -enum flash_op {
> > -       FLASH_OP_READ,
> > -       FLASH_OP_WRITE,
> > -       FLASH_OP_ERASE,
> > -};
> > -
> >  static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> >                 uint64_t buf, uint64_t size, uint64_t token)
> >  {
> >         struct flash *flash = NULL;
> > +       uint64_t len;
> >         int rc;
> > 
> >         list_for_each(&flashes, flash, list)
> > @@ -351,9 +412,16 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> >                 prlog(PR_DEBUG, "Requested flash op %d beyond flash size %" PRIu64 "\n",
> >                                 op, flash->size);
> >                 rc = OPAL_PARAMETER;
> > -               goto err;
> > +               goto out;
> >         }
> > 
> > +       len = MIN(size, flash->block_size);
> > +       flash->async.op = op;
> > +       flash->async.token = token;
> > +       flash->async.buf = buf + len;
> > +       flash->async.len = size - len;
> > +       flash->async.pos = offset + len;
> > +
> >         /*
> >          * These ops intentionally have no smarts (ecc correction or erase
> >          * before write) to them.
> > @@ -363,29 +431,43 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> >          */
> >         switch (op) {
> >         case FLASH_OP_READ:
> > -               rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, size);
> > +               rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, len);
> >                 break;
> >         case FLASH_OP_WRITE:
> > -               rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, size);
> > +               rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, len);
> >                 break;
> >         case FLASH_OP_ERASE:
> > -               rc = blocklevel_erase(flash->bl, offset, size);
> > +               rc = blocklevel_erase(flash->bl, offset, len);
> >                 break;
> >         default:
> >                 assert(0);
> >         }
> > 
> >         if (rc) {
> > +               prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
> >                 rc = OPAL_HARDWARE;
> > -               goto err;
> > +               goto out;
> >         }
> 
> Some code duplication here now with the new flash_poll() function.
> Might be worth a refactor.

Could potentially pull out the switch statement. Not that much though I
suppose.

> 
> > 
> > -       flash_release(flash);
> > -
> > -       opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, token, rc);
> > -       return OPAL_ASYNC_COMPLETION;
> > -
> > -err:
> > +       if (size - len) {
> > +               /* Work remains */
> > +               schedule_timer(&flash->async.poller, msecs_to_tb(1));
> > +               /* Don't release the flash */
> > +               return OPAL_ASYNC_COMPLETION;
> > +       } else {
> > +               /*
> > +                * As tempting as it might be here to return OPAL_SUCCESS
> > +                * here, don't! As of 1/07/2017 the powernv_flash driver in
> > +                * Linux will handle OPAL_SUCCESS as an error, the only thing
> > +                * that makes it handle things as though they're working is
> > +                * receiving OPAL_ASYNC_COMPLETION.
> > +                *
> > +                * XXX TODO: Revisit this in a few years *sigh*
> > +                */
> > +               opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
> > +       }
> > +       rc = OPAL_ASYNC_COMPLETION;
> > +out:
> >         flash_release(flash);
> >         return rc;
> >  }
> > diff --git a/doc/opal-api/opal-flash-110-111-112.rst b/doc/opal-api/opal-flash-110-111-112.rst
> > index 71ba866d..086c4095 100644
> > --- a/doc/opal-api/opal-flash-110-111-112.rst
> > +++ b/doc/opal-api/opal-flash-110-111-112.rst
> > @@ -20,6 +20,10 @@ success, the calls will return ``OPAL_ASYNC_COMPLETION``, and an
> >  opal_async_completion message will be sent (with the appropriate token
> >  argument) when the operation completes.
> > 
> > +Due to an error in the powernv_flash driver in Linux these three OPAL
> > +calls should never return ``OPAL_SUCCESS`` as the driver is likely to
> > +treat this return value as an error.
> > +
> >  All calls share the same return values:
> > 
> >  ``OPAL_ASYNC_COMPLETION``
> > --
> > 2.13.2
> > 
> 
>
diff mbox

Patch

diff --git a/core/flash.c b/core/flash.c
index 177f7ae1..71cc3c87 100644
--- a/core/flash.c
+++ b/core/flash.c
@@ -28,6 +28,23 @@ 
 #include <libstb/stb.h>
 #include <libstb/container.h>
 #include <elf.h>
+#include <timer.h>
+#include <timebase.h>
+
+enum flash_op {
+	FLASH_OP_READ,
+	FLASH_OP_WRITE,
+	FLASH_OP_ERASE,
+};
+
+struct flash_async_info {
+	enum flash_op op;
+	struct timer poller;
+	uint64_t token;
+	uint64_t pos;
+	uint64_t len;
+	uint64_t buf;
+};
 
 struct flash {
 	struct list_node	list;
@@ -36,6 +53,7 @@  struct flash {
 	uint64_t		size;
 	uint32_t		block_size;
 	int			id;
+	struct flash_async_info async;
 };
 
 static LIST_HEAD(flashes);
@@ -200,6 +218,53 @@  static int flash_nvram_probe(struct flash *flash, struct ffs_handle *ffs)
 
 /* core flash support */
 
+/*
+ * Called with flash lock held, drop it on async completion
+ */
+static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unused)
+{
+	struct flash *flash = data;
+	uint64_t offset, buf, len;
+	int rc;
+
+	offset = flash->async.pos;
+	buf = flash->async.buf;
+	len = MIN(flash->async.len, flash->block_size);
+
+	switch (flash->async.op) {
+	case FLASH_OP_READ:
+		rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, len);
+		break;
+	case FLASH_OP_WRITE:
+		rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, len);
+		break;
+	case FLASH_OP_ERASE:
+		rc = blocklevel_erase(flash->bl, offset, len);
+		break;
+	default:
+		assert(0);
+	}
+
+	if (rc)
+		rc = OPAL_HARDWARE;
+
+	flash->async.pos += len;
+	flash->async.buf += len;
+	flash->async.len -= len;
+	if (!rc && flash->async.len) {
+		/*
+		 * We want to get called pretty much straight away, just have
+		 * to be sure that we jump back out to Linux so that if this
+		 * very long we don't cause RCU or the scheduler to freak
+		 */
+		schedule_timer(&flash->async.poller, msecs_to_tb(1));
+		return;
+	}
+
+	opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
+	flash_release(flash);
+}
+
 static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
 {
 	struct dt_node *flash_node;
@@ -295,6 +360,7 @@  int flash_register(struct blocklevel_device *bl)
 	flash->size = size;
 	flash->block_size = block_size;
 	flash->id = num_flashes();
+	init_timer(&flash->async.poller, flash_poll, flash);
 
 	list_add(&flashes, &flash->list);
 
@@ -323,16 +389,11 @@  int flash_register(struct blocklevel_device *bl)
 	return OPAL_SUCCESS;
 }
 
-enum flash_op {
-	FLASH_OP_READ,
-	FLASH_OP_WRITE,
-	FLASH_OP_ERASE,
-};
-
 static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
 		uint64_t buf, uint64_t size, uint64_t token)
 {
 	struct flash *flash = NULL;
+	uint64_t len;
 	int rc;
 
 	list_for_each(&flashes, flash, list)
@@ -351,9 +412,16 @@  static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
 		prlog(PR_DEBUG, "Requested flash op %d beyond flash size %" PRIu64 "\n",
 				op, flash->size);
 		rc = OPAL_PARAMETER;
-		goto err;
+		goto out;
 	}
 
+	len = MIN(size, flash->block_size);
+	flash->async.op = op;
+	flash->async.token = token;
+	flash->async.buf = buf + len;
+	flash->async.len = size - len;
+	flash->async.pos = offset + len;
+
 	/*
 	 * These ops intentionally have no smarts (ecc correction or erase
 	 * before write) to them.
@@ -363,29 +431,43 @@  static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
 	 */
 	switch (op) {
 	case FLASH_OP_READ:
-		rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, size);
+		rc = blocklevel_raw_read(flash->bl, offset, (void *)buf, len);
 		break;
 	case FLASH_OP_WRITE:
-		rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, size);
+		rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, len);
 		break;
 	case FLASH_OP_ERASE:
-		rc = blocklevel_erase(flash->bl, offset, size);
+		rc = blocklevel_erase(flash->bl, offset, len);
 		break;
 	default:
 		assert(0);
 	}
 
 	if (rc) {
+		prlog(PR_ERR, "%s: Op %d failed with rc %d\n", __func__, op, rc);
 		rc = OPAL_HARDWARE;
-		goto err;
+		goto out;
 	}
 
-	flash_release(flash);
-
-	opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, token, rc);
-	return OPAL_ASYNC_COMPLETION;
-
-err:
+	if (size - len) {
+		/* Work remains */
+		schedule_timer(&flash->async.poller, msecs_to_tb(1));
+		/* Don't release the flash */
+		return OPAL_ASYNC_COMPLETION;
+	} else {
+		/*
+		 * As tempting as it might be here to return OPAL_SUCCESS
+		 * here, don't! As of 1/07/2017 the powernv_flash driver in
+		 * Linux will handle OPAL_SUCCESS as an error, the only thing
+		 * that makes it handle things as though they're working is
+		 * receiving OPAL_ASYNC_COMPLETION.
+		 *
+		 * XXX TODO: Revisit this in a few years *sigh*
+		 */
+		opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async.token, rc);
+	}
+	rc = OPAL_ASYNC_COMPLETION;
+out:
 	flash_release(flash);
 	return rc;
 }
diff --git a/doc/opal-api/opal-flash-110-111-112.rst b/doc/opal-api/opal-flash-110-111-112.rst
index 71ba866d..086c4095 100644
--- a/doc/opal-api/opal-flash-110-111-112.rst
+++ b/doc/opal-api/opal-flash-110-111-112.rst
@@ -20,6 +20,10 @@  success, the calls will return ``OPAL_ASYNC_COMPLETION``, and an
 opal_async_completion message will be sent (with the appropriate token
 argument) when the operation completes.
 
+Due to an error in the powernv_flash driver in Linux these three OPAL
+calls should never return ``OPAL_SUCCESS`` as the driver is likely to
+treat this return value as an error.
+
 All calls share the same return values:
 
 ``OPAL_ASYNC_COMPLETION``