[4/4] libflash/blocklevel: Backend agnostic blocklevel_erase_async()

Message ID 20170628233804.18412-4-cyril.bur@au1.ibm.com
State New
Headers show

Commit Message

Cyril Bur June 28, 2017, 11:38 p.m.
This patch provides a simple (although not particularly efficient)
asynchronous erase function. The advantage of this approach is that it
doesn't require any of the blocklevel backends to provide an
asynchronous implementation. This is also the disadvantage of this
implementation as all it actually does is break the work up in chunks
that it can performed quickly, but still synchronously. Only a backend
could provide a more asynchronous implementation.

The motivation behind this patch is two fold:
Firstly this starts a blocklevel async interface - which could easily be
extended to pass the async calls down to backends which provide async
implementation.
Secondly this solves a problem we have right now with the
opal_flash_erase call where it can block in Skiboot for around three
minutes. This causes a variety of problems in Linux due to a processor
being gone for a long time. For example:

[   98.610043] INFO: rcu_sched detected stalls on CPUs/tasks:
[   98.610050]         113-...: (1 GPs behind) idle=96f/140000000000000/0 softirq=527/528 fqs=1044
[   98.610051]         (detected by 112, t=2102 jiffies, g=223, c=222, q=123)
[   98.610060] Task dump for CPU 113:
[   98.610062] pflash          R  running task        0  3335   3333 0x00040004
[   98.610066] Call Trace:
[   98.610070] [c000001fdd847730] [0000000000000001] 0x1 (unreliable)
[   98.610076] [c000001fdd847900] [c000000000013854] __switch_to+0x1e8/0x1f4
[   98.610081] [c000001fdd847960] [c0000000006122c4] __schedule+0x32c/0x874
[   98.610083] [c000001fdd847a30] [c000001fdd847b40] 0xc000001fdd847b40

It is for this reason that breaking the work up in smaller chunks solves
this problem as Skiboot can return the CPU to Linux between chunks to
avoid Linux getting upset.

Reported-By: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: Cyril Bur <cyril.bur@au1.ibm.com>
---
 core/flash.c          | 46 +++++++++++++++++++++++++--
 libflash/blocklevel.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++----
 libflash/blocklevel.h | 19 +++++++++++
 libflash/errors.h     | 14 ++++++++
 libflash/libflash.h   | 13 --------
 5 files changed, 159 insertions(+), 21 deletions(-)

Comments

Alistair Popple June 29, 2017, 3:19 a.m. | #1
On Thu, 29 Jun 2017 09:38:04 AM Cyril Bur wrote:
> This patch provides a simple (although not particularly efficient)
> asynchronous erase function. The advantage of this approach is that it
> doesn't require any of the blocklevel backends to provide an
> asynchronous implementation. This is also the disadvantage of this
> implementation as all it actually does is break the work up in chunks
> that it can performed quickly, but still synchronously. Only a backend
> could provide a more asynchronous implementation.
> 
> The motivation behind this patch is two fold:
> Firstly this starts a blocklevel async interface - which could easily be
> extended to pass the async calls down to backends which provide async
> implementation.

To be honest having seen the code this seems to add a reasonable amount of
complexity to the blocklevel layer for no real gain as we don't actually
implement async and only skiboot uses it.

Given this I am wondering if it would be simpler to confine the hack to skiboot
and wait until we have other users of libflash that want async functions, or for
when we actually implement async at the HW level?

> Secondly this solves a problem we have right now with the
> opal_flash_erase call where it can block in Skiboot for around three
> minutes. This causes a variety of problems in Linux due to a processor
> being gone for a long time. For example:
> 
> [   98.610043] INFO: rcu_sched detected stalls on CPUs/tasks:
> [   98.610050]         113-...: (1 GPs behind) idle=96f/140000000000000/0 softirq=527/528 fqs=1044
> [   98.610051]         (detected by 112, t=2102 jiffies, g=223, c=222, q=123)
> [   98.610060] Task dump for CPU 113:
> [   98.610062] pflash          R  running task        0  3335   3333 0x00040004
> [   98.610066] Call Trace:
> [   98.610070] [c000001fdd847730] [0000000000000001] 0x1 (unreliable)
> [   98.610076] [c000001fdd847900] [c000000000013854] __switch_to+0x1e8/0x1f4
> [   98.610081] [c000001fdd847960] [c0000000006122c4] __schedule+0x32c/0x874
> [   98.610083] [c000001fdd847a30] [c000001fdd847b40] 0xc000001fdd847b40
> 
> It is for this reason that breaking the work up in smaller chunks solves
> this problem as Skiboot can return the CPU to Linux between chunks to
> avoid Linux getting upset.
> 
> Reported-By: Samuel Mendoza-Jonas <sam@mendozajonas.com>
> Signed-off-by: Cyril Bur <cyril.bur@au1.ibm.com>
> ---
>  core/flash.c          | 46 +++++++++++++++++++++++++--
>  libflash/blocklevel.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++----
>  libflash/blocklevel.h | 19 +++++++++++
>  libflash/errors.h     | 14 ++++++++
>  libflash/libflash.h   | 13 --------
>  5 files changed, 159 insertions(+), 21 deletions(-)
> 
> diff --git a/core/flash.c b/core/flash.c
> index 177f7ae1..e8b9fc3c 100644
> --- a/core/flash.c
> +++ b/core/flash.c
> @@ -28,6 +28,13 @@
>  #include <libstb/stb.h>
>  #include <libstb/container.h>
>  #include <elf.h>
> +#include <timer.h>
> +#include <timebase.h>
> +
> +struct flash_async_info {
> +	struct timer poller;
> +	uint64_t token;
> +};
>  
>  struct flash {
>  	struct list_node	list;
> @@ -36,6 +43,7 @@ struct flash {
>  	uint64_t		size;
>  	uint32_t		block_size;
>  	int			id;
> +	struct flash_async_info async_info;
>  };
>  
>  static LIST_HEAD(flashes);
> @@ -200,6 +208,29 @@ static int flash_nvram_probe(struct flash *flash, struct ffs_handle *ffs)
>  
>  /* core flash support */
>  
> +/*
> + * Called with flash lock held, drop it on async completion
> + */
> +static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unused)
> +{
> +	struct flash *flash = data;
> +	int rc;
> +
> +	rc = blocklevel_async_poll(flash->bl);
> +	if (rc == FLASH_ASYNC_POLL) {
> +		/*
> +		 * We want to get called pretty much straight away, just have
> +		 * to be sure that we jump back out to Linux so that if this
> +		 * very long we don't cause RCU or the scheduler to freak
> +		 */
> +		schedule_timer(&flash->async_info.poller, msecs_to_tb(1));
> +		return;
> +	}
> +
> +	flash_release(flash);
> +	opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async_info.token, rc);
> +}
> +
>  static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
>  {
>  	struct dt_node *flash_node;
> @@ -295,6 +326,7 @@ int flash_register(struct blocklevel_device *bl)
>  	flash->size = size;
>  	flash->block_size = block_size;
>  	flash->id = num_flashes();
> +	init_timer(&flash->async_info.poller, flash_poll, flash);
>  
>  	list_add(&flashes, &flash->list);
>  
> @@ -369,8 +401,18 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
>  		rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, size);
>  		break;
>  	case FLASH_OP_ERASE:
> -		rc = blocklevel_erase(flash->bl, offset, size);
> -		break;
> +		rc = blocklevel_erase_async(flash->bl, offset, size);
> +		if (rc != FLASH_ASYNC_POLL)
> +			break;
> +		schedule_timer(&flash->async_info.poller, msecs_to_tb(1));
> +		/*
> +		 * FLASH_OP_ERASE can now happen asynchronously, don't go out
> +		 * the regular path.
> +		 *
> +		 * Ensure we hold the lock to flash for the entirety of the
> +		 * async process
> +		 */
> +		return OPAL_ASYNC_COMPLETION;
>  	default:
>  		assert(0);
>  	}
> diff --git a/libflash/blocklevel.c b/libflash/blocklevel.c
> index 9802ad0f..acda2066 100644
> --- a/libflash/blocklevel.c
> +++ b/libflash/blocklevel.c
> @@ -29,6 +29,10 @@
>  
>  #define PROT_REALLOC_NUM 25
>  
> +#ifndef MIN
> +#define MIN(a, b)	((a) < (b) ? (a) : (b))
> +#endif
> +
>  /* This function returns tristate values.
>   * 1  - The region is ECC protected
>   * 0  - The region is not ECC protected
> @@ -78,6 +82,46 @@ static int release(struct blocklevel_device *bl)
>  	return rc;
>  }
>  
> +int blocklevel_async_poll(struct blocklevel_device *bl)
> +{
> +	uint64_t len;
> +	int rc;
> +
> +	if (!bl->async.active)
> +		return FLASH_ERR_PARM_ERROR;
> +
> +	/* Just chunk things up by erase granule */
> +	len = MIN(bl->erase_mask + 1, bl->async.len);
> +
> +	switch (bl->async.op) {
> +		case READ:
> +		case WRITE:
> +			/*
> +			 * Very large caveat - do not remove this without auditing
> +			 * this entire file. smart_write() and smart_erase() will
> +			 * not 'just work' async. They will need work.
> +			 */
> +			bl->async.buf += len;

How would async.op == READ/WRITE? I'd be inclinded to complain loudly and return
here as it should never happen.

> +			return FLASH_ERR_PARM_ERROR;
> +		case ERASE:
> +			rc = bl->erase(bl, bl->async.pos, len);
> +			break;
> +		default:
> +			return FLASH_ERR_PARM_ERROR;
> +	}
> +
> +	if (rc)
> +		return rc;
> +
> +	bl->async.pos += len;
> +	bl->async.len -= len;

Wouldn't storing the end position be a little clearer (ie. bl->async.end)?

> +	if (bl->async.len == 0)
> +		bl->async.active = false;
> +
> +	return bl->async.active ? FLASH_ASYNC_POLL : 0;
> +}
> +
>  int blocklevel_raw_read(struct blocklevel_device *bl, uint64_t pos, void *buf, uint64_t len)
>  {
>  	int rc;
> @@ -189,9 +233,9 @@ out:
>  	return rc;
>  }
>  
> -int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
> +static int check_erase_params(struct blocklevel_device *bl, const char *caller,
> +		uint64_t pos, uint64_t len)
>  {
> -	int rc;
>  	if (!bl || !bl->erase) {
>  		errno = EINVAL;
>  		return FLASH_ERR_PARM_ERROR;
> @@ -199,16 +243,48 @@ int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
>  
>  	/* Programmer may be making a horrible mistake without knowing it */
>  	if (pos & bl->erase_mask) {
> -		fprintf(stderr, "blocklevel_erase: pos (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
> -				pos, bl->erase_mask + 1);
> +		FL_ERR("%s: pos (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
> +				caller, pos, bl->erase_mask + 1);
> +		return FLASH_ERR_ERASE_BOUNDARY;
>  	}
>  
>  	if (len & bl->erase_mask) {
> -		fprintf(stderr, "blocklevel_erase: len (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
> -				len, bl->erase_mask + 1);
> +		FL_ERR("%s: len (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
> +				caller, len, bl->erase_mask + 1);
>  		return FLASH_ERR_ERASE_BOUNDARY;
>  	}
>  
> +	return 0;
> +}
> +
> +int blocklevel_erase_async(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
> +{
> +	int rc;
> +
> +	rc = check_erase_params(bl, __func__, pos, len);
> +	if (rc)
> +		return rc;
> +
> +	if (bl->async.active)
> +		return FLASH_ERR_PARM_ERROR;
> +
> +	bl->async.op = ERASE;
> +	bl->async.pos = pos;
> +	bl->async.len = len;
> +	bl->async.active = true;
> +
> +	/* Call the poll once to get everything going */
> +	return blocklevel_async_poll(bl);
> +}
> +
> +int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
> +{
> +	int rc;
> +
> +	rc = check_erase_params(bl, __func__, pos, len);
> +	if (rc)
> +		return rc;
> +
>  	rc = reacquire(bl);
>  	if (rc)
>  		return rc;
> diff --git a/libflash/blocklevel.h b/libflash/blocklevel.h
> index ba42c83d..7079e2fc 100644
> --- a/libflash/blocklevel.h
> +++ b/libflash/blocklevel.h
> @@ -34,6 +34,20 @@ enum blocklevel_flags {
>  	WRITE_NEED_ERASE = 1,
>  };
>  
> +enum blocklevel_async_op {
> +	READ = 1,
> +	WRITE,
> +	ERASE
> +};
> +
> +struct blocklevel_async {
> +	bool active;
> +	enum blocklevel_async_op op;
> +	uint64_t pos;
> +	uint64_t len;
> +	void *buf;
> +};
> +
>  /*
>   * libffs may be used with different backends, all should provide these for
>   * libflash to get the information it needs
> @@ -56,6 +70,8 @@ struct blocklevel_device {
>  	enum blocklevel_flags flags;
>  
>  	struct blocklevel_range ecc_prot;
> +
> +	struct blocklevel_async async;
>  };
>  int blocklevel_raw_read(struct blocklevel_device *bl, uint64_t pos, void *buf, uint64_t len);
>  int blocklevel_read(struct blocklevel_device *bl, uint64_t pos, void *buf, uint64_t len);
> @@ -65,6 +81,9 @@ int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len);
>  int blocklevel_get_info(struct blocklevel_device *bl, const char **name, uint64_t *total_size,
>  		uint32_t *erase_granule);
>  
> +int blocklevel_async_poll(struct blocklevel_device *bl);
> +int blocklevel_erase_async(struct blocklevel_device *bl, uint64_t pos, uint64_t len);
> +
>  /*
>   * blocklevel_smart_write() performs reads on the data to see if it
>   * can skip erase or write calls. This is likely more convenient for
> diff --git a/libflash/errors.h b/libflash/errors.h
> index 2f567211..deda7b87 100644
> --- a/libflash/errors.h
> +++ b/libflash/errors.h

Why are we moving this in this patch? Just as clean-up? If so it probably should
be a seperate patch.

> @@ -33,5 +33,19 @@
>  #define FLASH_ERR_BAD_READ		15
>  #define FLASH_ERR_DEVICE_GONE	16
>  #define FLASH_ERR_AGAIN	17
> +#define FLASH_ASYNC_POLL 18
> +
> +#ifdef __SKIBOOT__
> +#include <skiboot.h>
> +#define FL_INF(fmt...) do { prlog(PR_INFO, fmt);  } while(0)
> +#define FL_DBG(fmt...) do { prlog(PR_DEBUG, fmt); } while(0)
> +#define FL_ERR(fmt...) do { prlog(PR_ERR, fmt);   } while(0)
> +#else
> +#include <stdio.h>
> +extern bool libflash_debug;
> +#define FL_DBG(fmt...) do { if (libflash_debug) printf(fmt); } while(0)
> +#define FL_INF(fmt...) do { printf(fmt); } while(0)
> +#define FL_ERR(fmt...) do { fprintf(stderr, fmt); } while(0)
> +#endif
>  
>  #endif /* __LIBFLASH_ERRORS_H */
> diff --git a/libflash/libflash.h b/libflash/libflash.h
> index 4fecfe75..ff3a9823 100644
> --- a/libflash/libflash.h
> +++ b/libflash/libflash.h
> @@ -28,19 +28,6 @@
>   */
>  #include <libflash/errors.h>
>  
> -#ifdef __SKIBOOT__
> -#include <skiboot.h>
> -#define FL_INF(fmt...) do { prlog(PR_INFO, fmt);  } while(0)
> -#define FL_DBG(fmt...) do { prlog(PR_DEBUG, fmt); } while(0)
> -#define FL_ERR(fmt...) do { prlog(PR_ERR, fmt);   } while(0)
> -#else
> -#include <stdio.h>
> -extern bool libflash_debug;
> -#define FL_DBG(fmt...) do { if (libflash_debug) printf(fmt); } while(0)
> -#define FL_INF(fmt...) do { printf(fmt); } while(0)
> -#define FL_ERR(fmt...) do { printf(fmt); } while(0)
> -#endif
> -
>  /* Flash chip, opaque */
>  struct flash_chip;
>  struct spi_flash_ctrl;
>
Cyril Bur June 29, 2017, 3:49 a.m. | #2
On Thu, 2017-06-29 at 13:19 +1000, Alistair Popple wrote:
> On Thu, 29 Jun 2017 09:38:04 AM Cyril Bur wrote:
> > This patch provides a simple (although not particularly efficient)
> > asynchronous erase function. The advantage of this approach is that it
> > doesn't require any of the blocklevel backends to provide an
> > asynchronous implementation. This is also the disadvantage of this
> > implementation as all it actually does is break the work up in chunks
> > that it can performed quickly, but still synchronously. Only a backend
> > could provide a more asynchronous implementation.
> > 
> > The motivation behind this patch is two fold:
> > Firstly this starts a blocklevel async interface - which could easily be
> > extended to pass the async calls down to backends which provide async
> > implementation.
> 
> To be honest having seen the code this seems to add a reasonable amount of
> complexity to the blocklevel layer for no real gain as we don't actually
> implement async and only skiboot uses it.
> 
> Given this I am wondering if it would be simpler to confine the hack to skiboot
> and wait until we have other users of libflash that want async functions, or for
> when we actually implement async at the HW level?
> 

*sigh* maybe, thoughts anyone?

> > Secondly this solves a problem we have right now with the
> > opal_flash_erase call where it can block in Skiboot for around three
> > minutes. This causes a variety of problems in Linux due to a processor
> > being gone for a long time. For example:
> > 
> > [   98.610043] INFO: rcu_sched detected stalls on CPUs/tasks:
> > [   98.610050]         113-...: (1 GPs behind) idle=96f/140000000000000/0 softirq=527/528 fqs=1044
> > [   98.610051]         (detected by 112, t=2102 jiffies, g=223, c=222, q=123)
> > [   98.610060] Task dump for CPU 113:
> > [   98.610062] pflash          R  running task        0  3335   3333 0x00040004
> > [   98.610066] Call Trace:
> > [   98.610070] [c000001fdd847730] [0000000000000001] 0x1 (unreliable)
> > [   98.610076] [c000001fdd847900] [c000000000013854] __switch_to+0x1e8/0x1f4
> > [   98.610081] [c000001fdd847960] [c0000000006122c4] __schedule+0x32c/0x874
> > [   98.610083] [c000001fdd847a30] [c000001fdd847b40] 0xc000001fdd847b40
> > 
> > It is for this reason that breaking the work up in smaller chunks solves
> > this problem as Skiboot can return the CPU to Linux between chunks to
> > avoid Linux getting upset.
> > 
> > Reported-By: Samuel Mendoza-Jonas <sam@mendozajonas.com>
> > Signed-off-by: Cyril Bur <cyril.bur@au1.ibm.com>
> > ---
> >  core/flash.c          | 46 +++++++++++++++++++++++++--
> >  libflash/blocklevel.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++----
> >  libflash/blocklevel.h | 19 +++++++++++
> >  libflash/errors.h     | 14 ++++++++
> >  libflash/libflash.h   | 13 --------
> >  5 files changed, 159 insertions(+), 21 deletions(-)
> > 
> > diff --git a/core/flash.c b/core/flash.c
> > index 177f7ae1..e8b9fc3c 100644
> > --- a/core/flash.c
> > +++ b/core/flash.c
> > @@ -28,6 +28,13 @@
> >  #include <libstb/stb.h>
> >  #include <libstb/container.h>
> >  #include <elf.h>
> > +#include <timer.h>
> > +#include <timebase.h>
> > +
> > +struct flash_async_info {
> > +	struct timer poller;
> > +	uint64_t token;
> > +};
> >  
> >  struct flash {
> >  	struct list_node	list;
> > @@ -36,6 +43,7 @@ struct flash {
> >  	uint64_t		size;
> >  	uint32_t		block_size;
> >  	int			id;
> > +	struct flash_async_info async_info;
> >  };
> >  
> >  static LIST_HEAD(flashes);
> > @@ -200,6 +208,29 @@ static int flash_nvram_probe(struct flash *flash, struct ffs_handle *ffs)
> >  
> >  /* core flash support */
> >  
> > +/*
> > + * Called with flash lock held, drop it on async completion
> > + */
> > +static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unused)
> > +{
> > +	struct flash *flash = data;
> > +	int rc;
> > +
> > +	rc = blocklevel_async_poll(flash->bl);
> > +	if (rc == FLASH_ASYNC_POLL) {
> > +		/*
> > +		 * We want to get called pretty much straight away, just have
> > +		 * to be sure that we jump back out to Linux so that if this
> > +		 * very long we don't cause RCU or the scheduler to freak
> > +		 */
> > +		schedule_timer(&flash->async_info.poller, msecs_to_tb(1));
> > +		return;
> > +	}
> > +
> > +	flash_release(flash);
> > +	opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async_info.token, rc);
> > +}
> > +
> >  static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
> >  {
> >  	struct dt_node *flash_node;
> > @@ -295,6 +326,7 @@ int flash_register(struct blocklevel_device *bl)
> >  	flash->size = size;
> >  	flash->block_size = block_size;
> >  	flash->id = num_flashes();
> > +	init_timer(&flash->async_info.poller, flash_poll, flash);
> >  
> >  	list_add(&flashes, &flash->list);
> >  
> > @@ -369,8 +401,18 @@ static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
> >  		rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, size);
> >  		break;
> >  	case FLASH_OP_ERASE:
> > -		rc = blocklevel_erase(flash->bl, offset, size);
> > -		break;
> > +		rc = blocklevel_erase_async(flash->bl, offset, size);
> > +		if (rc != FLASH_ASYNC_POLL)
> > +			break;
> > +		schedule_timer(&flash->async_info.poller, msecs_to_tb(1));
> > +		/*
> > +		 * FLASH_OP_ERASE can now happen asynchronously, don't go out
> > +		 * the regular path.
> > +		 *
> > +		 * Ensure we hold the lock to flash for the entirety of the
> > +		 * async process
> > +		 */
> > +		return OPAL_ASYNC_COMPLETION;
> >  	default:
> >  		assert(0);
> >  	}
> > diff --git a/libflash/blocklevel.c b/libflash/blocklevel.c
> > index 9802ad0f..acda2066 100644
> > --- a/libflash/blocklevel.c
> > +++ b/libflash/blocklevel.c
> > @@ -29,6 +29,10 @@
> >  
> >  #define PROT_REALLOC_NUM 25
> >  
> > +#ifndef MIN
> > +#define MIN(a, b)	((a) < (b) ? (a) : (b))
> > +#endif
> > +
> >  /* This function returns tristate values.
> >   * 1  - The region is ECC protected
> >   * 0  - The region is not ECC protected
> > @@ -78,6 +82,46 @@ static int release(struct blocklevel_device *bl)
> >  	return rc;
> >  }
> >  
> > +int blocklevel_async_poll(struct blocklevel_device *bl)
> > +{
> > +	uint64_t len;
> > +	int rc;
> > +
> > +	if (!bl->async.active)
> > +		return FLASH_ERR_PARM_ERROR;
> > +
> > +	/* Just chunk things up by erase granule */
> > +	len = MIN(bl->erase_mask + 1, bl->async.len);
> > +
> > +	switch (bl->async.op) {
> > +		case READ:
> > +		case WRITE:
> > +			/*
> > +			 * Very large caveat - do not remove this without auditing
> > +			 * this entire file. smart_write() and smart_erase() will
> > +			 * not 'just work' async. They will need work.
> > +			 */
> > +			bl->async.buf += len;
> 
> How would async.op == READ/WRITE? I'd be inclinded to complain loudly and return
> here as it should never happen.

Perhaps, I wasn't sure so I did nothing... Yes yes inaction is a form
of action... but its less action?

> 
> > +			return FLASH_ERR_PARM_ERROR;
> > +		case ERASE:
> > +			rc = bl->erase(bl, bl->async.pos, len);
> > +			break;
> > +		default:
> > +			return FLASH_ERR_PARM_ERROR;
> > +	}
> > +
> > +	if (rc)
> > +		return rc;
> > +
> > +	bl->async.pos += len;
> > +	bl->async.len -= len;
> 
> Wouldn't storing the end position be a little clearer (ie. bl->async.end)?

Would it? Everything is always specified as a pos and len, doesn't it
make sense to keep it in the same layout as the functions accept their
args? Otherwise there will be a lot of bl->async.end - bl->async.pos.
And I'm not sure how nice if (bl->async.pos == bl->async.end) is over
if (bl->async.len == 0)

> 
> > +	if (bl->async.len == 0)
> > +		bl->async.active = false;
> > +
> > +	return bl->async.active ? FLASH_ASYNC_POLL : 0;
> > +}
> > +
> >  int blocklevel_raw_read(struct blocklevel_device *bl, uint64_t pos, void *buf, uint64_t len)
> >  {
> >  	int rc;
> > @@ -189,9 +233,9 @@ out:
> >  	return rc;
> >  }
> >  
> > -int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
> > +static int check_erase_params(struct blocklevel_device *bl, const char *caller,
> > +		uint64_t pos, uint64_t len)
> >  {
> > -	int rc;
> >  	if (!bl || !bl->erase) {
> >  		errno = EINVAL;
> >  		return FLASH_ERR_PARM_ERROR;
> > @@ -199,16 +243,48 @@ int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
> >  
> >  	/* Programmer may be making a horrible mistake without knowing it */
> >  	if (pos & bl->erase_mask) {
> > -		fprintf(stderr, "blocklevel_erase: pos (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
> > -				pos, bl->erase_mask + 1);
> > +		FL_ERR("%s: pos (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
> > +				caller, pos, bl->erase_mask + 1);
> > +		return FLASH_ERR_ERASE_BOUNDARY;
> >  	}
> >  
> >  	if (len & bl->erase_mask) {
> > -		fprintf(stderr, "blocklevel_erase: len (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
> > -				len, bl->erase_mask + 1);
> > +		FL_ERR("%s: len (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
> > +				caller, len, bl->erase_mask + 1);
> >  		return FLASH_ERR_ERASE_BOUNDARY;
> >  	}
> >  
> > +	return 0;
> > +}
> > +
> > +int blocklevel_erase_async(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
> > +{
> > +	int rc;
> > +
> > +	rc = check_erase_params(bl, __func__, pos, len);
> > +	if (rc)
> > +		return rc;
> > +
> > +	if (bl->async.active)
> > +		return FLASH_ERR_PARM_ERROR;
> > +
> > +	bl->async.op = ERASE;
> > +	bl->async.pos = pos;
> > +	bl->async.len = len;
> > +	bl->async.active = true;
> > +
> > +	/* Call the poll once to get everything going */
> > +	return blocklevel_async_poll(bl);
> > +}
> > +
> > +int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
> > +{
> > +	int rc;
> > +
> > +	rc = check_erase_params(bl, __func__, pos, len);
> > +	if (rc)
> > +		return rc;
> > +
> >  	rc = reacquire(bl);
> >  	if (rc)
> >  		return rc;
> > diff --git a/libflash/blocklevel.h b/libflash/blocklevel.h
> > index ba42c83d..7079e2fc 100644
> > --- a/libflash/blocklevel.h
> > +++ b/libflash/blocklevel.h
> > @@ -34,6 +34,20 @@ enum blocklevel_flags {
> >  	WRITE_NEED_ERASE = 1,
> >  };
> >  
> > +enum blocklevel_async_op {
> > +	READ = 1,
> > +	WRITE,
> > +	ERASE
> > +};
> > +
> > +struct blocklevel_async {
> > +	bool active;
> > +	enum blocklevel_async_op op;
> > +	uint64_t pos;
> > +	uint64_t len;
> > +	void *buf;
> > +};
> > +
> >  /*
> >   * libffs may be used with different backends, all should provide these for
> >   * libflash to get the information it needs
> > @@ -56,6 +70,8 @@ struct blocklevel_device {
> >  	enum blocklevel_flags flags;
> >  
> >  	struct blocklevel_range ecc_prot;
> > +
> > +	struct blocklevel_async async;
> >  };
> >  int blocklevel_raw_read(struct blocklevel_device *bl, uint64_t pos, void *buf, uint64_t len);
> >  int blocklevel_read(struct blocklevel_device *bl, uint64_t pos, void *buf, uint64_t len);
> > @@ -65,6 +81,9 @@ int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len);
> >  int blocklevel_get_info(struct blocklevel_device *bl, const char **name, uint64_t *total_size,
> >  		uint32_t *erase_granule);
> >  
> > +int blocklevel_async_poll(struct blocklevel_device *bl);
> > +int blocklevel_erase_async(struct blocklevel_device *bl, uint64_t pos, uint64_t len);
> > +
> >  /*
> >   * blocklevel_smart_write() performs reads on the data to see if it
> >   * can skip erase or write calls. This is likely more convenient for
> > diff --git a/libflash/errors.h b/libflash/errors.h
> > index 2f567211..deda7b87 100644
> > --- a/libflash/errors.h
> > +++ b/libflash/errors.h
> 
> Why are we moving this in this patch? Just as clean-up? If so it probably should
> be a seperate patch.

Damn, busted trying to sneak in a cleanup.

Yeah subtly hidden in there is that blocklevel does from using
fprintf(stderr, ...) to FL_ERR() which, for userspace use boils down to
the same but makes it look nicer in the skiboot log.

I'll split it up on resend.

> 
> > @@ -33,5 +33,19 @@
> >  #define FLASH_ERR_BAD_READ		15
> >  #define FLASH_ERR_DEVICE_GONE	16
> >  #define FLASH_ERR_AGAIN	17
> > +#define FLASH_ASYNC_POLL 18
> > +
> > +#ifdef __SKIBOOT__
> > +#include <skiboot.h>
> > +#define FL_INF(fmt...) do { prlog(PR_INFO, fmt);  } while(0)
> > +#define FL_DBG(fmt...) do { prlog(PR_DEBUG, fmt); } while(0)
> > +#define FL_ERR(fmt...) do { prlog(PR_ERR, fmt);   } while(0)
> > +#else
> > +#include <stdio.h>
> > +extern bool libflash_debug;
> > +#define FL_DBG(fmt...) do { if (libflash_debug) printf(fmt); } while(0)
> > +#define FL_INF(fmt...) do { printf(fmt); } while(0)
> > +#define FL_ERR(fmt...) do { fprintf(stderr, fmt); } while(0)
> > +#endif
> >  
> >  #endif /* __LIBFLASH_ERRORS_H */
> > diff --git a/libflash/libflash.h b/libflash/libflash.h
> > index 4fecfe75..ff3a9823 100644
> > --- a/libflash/libflash.h
> > +++ b/libflash/libflash.h
> > @@ -28,19 +28,6 @@
> >   */
> >  #include <libflash/errors.h>
> >  
> > -#ifdef __SKIBOOT__
> > -#include <skiboot.h>
> > -#define FL_INF(fmt...) do { prlog(PR_INFO, fmt);  } while(0)
> > -#define FL_DBG(fmt...) do { prlog(PR_DEBUG, fmt); } while(0)
> > -#define FL_ERR(fmt...) do { prlog(PR_ERR, fmt);   } while(0)
> > -#else
> > -#include <stdio.h>
> > -extern bool libflash_debug;
> > -#define FL_DBG(fmt...) do { if (libflash_debug) printf(fmt); } while(0)
> > -#define FL_INF(fmt...) do { printf(fmt); } while(0)
> > -#define FL_ERR(fmt...) do { printf(fmt); } while(0)
> > -#endif
> > -
> >  /* Flash chip, opaque */
> >  struct flash_chip;
> >  struct spi_flash_ctrl;
> > 
> 
>

Patch

diff --git a/core/flash.c b/core/flash.c
index 177f7ae1..e8b9fc3c 100644
--- a/core/flash.c
+++ b/core/flash.c
@@ -28,6 +28,13 @@ 
 #include <libstb/stb.h>
 #include <libstb/container.h>
 #include <elf.h>
+#include <timer.h>
+#include <timebase.h>
+
+struct flash_async_info {
+	struct timer poller;
+	uint64_t token;
+};
 
 struct flash {
 	struct list_node	list;
@@ -36,6 +43,7 @@  struct flash {
 	uint64_t		size;
 	uint32_t		block_size;
 	int			id;
+	struct flash_async_info async_info;
 };
 
 static LIST_HEAD(flashes);
@@ -200,6 +208,29 @@  static int flash_nvram_probe(struct flash *flash, struct ffs_handle *ffs)
 
 /* core flash support */
 
+/*
+ * Called with flash lock held, drop it on async completion
+ */
+static void flash_poll(struct timer *t __unused, void *data, uint64_t now __unused)
+{
+	struct flash *flash = data;
+	int rc;
+
+	rc = blocklevel_async_poll(flash->bl);
+	if (rc == FLASH_ASYNC_POLL) {
+		/*
+		 * We want to get called pretty much straight away, just have
+		 * to be sure that we jump back out to Linux so that if this
+		 * very long we don't cause RCU or the scheduler to freak
+		 */
+		schedule_timer(&flash->async_info.poller, msecs_to_tb(1));
+		return;
+	}
+
+	flash_release(flash);
+	opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL, flash->async_info.token, rc);
+}
+
 static struct dt_node *flash_add_dt_node(struct flash *flash, int id)
 {
 	struct dt_node *flash_node;
@@ -295,6 +326,7 @@  int flash_register(struct blocklevel_device *bl)
 	flash->size = size;
 	flash->block_size = block_size;
 	flash->id = num_flashes();
+	init_timer(&flash->async_info.poller, flash_poll, flash);
 
 	list_add(&flashes, &flash->list);
 
@@ -369,8 +401,18 @@  static int64_t opal_flash_op(enum flash_op op, uint64_t id, uint64_t offset,
 		rc = blocklevel_raw_write(flash->bl, offset, (void *)buf, size);
 		break;
 	case FLASH_OP_ERASE:
-		rc = blocklevel_erase(flash->bl, offset, size);
-		break;
+		rc = blocklevel_erase_async(flash->bl, offset, size);
+		if (rc != FLASH_ASYNC_POLL)
+			break;
+		schedule_timer(&flash->async_info.poller, msecs_to_tb(1));
+		/*
+		 * FLASH_OP_ERASE can now happen asynchronously, don't go out
+		 * the regular path.
+		 *
+		 * Ensure we hold the lock to flash for the entirety of the
+		 * async process
+		 */
+		return OPAL_ASYNC_COMPLETION;
 	default:
 		assert(0);
 	}
diff --git a/libflash/blocklevel.c b/libflash/blocklevel.c
index 9802ad0f..acda2066 100644
--- a/libflash/blocklevel.c
+++ b/libflash/blocklevel.c
@@ -29,6 +29,10 @@ 
 
 #define PROT_REALLOC_NUM 25
 
+#ifndef MIN
+#define MIN(a, b)	((a) < (b) ? (a) : (b))
+#endif
+
 /* This function returns tristate values.
  * 1  - The region is ECC protected
  * 0  - The region is not ECC protected
@@ -78,6 +82,46 @@  static int release(struct blocklevel_device *bl)
 	return rc;
 }
 
+int blocklevel_async_poll(struct blocklevel_device *bl)
+{
+	uint64_t len;
+	int rc;
+
+	if (!bl->async.active)
+		return FLASH_ERR_PARM_ERROR;
+
+	/* Just chunk things up by erase granule */
+	len = MIN(bl->erase_mask + 1, bl->async.len);
+
+	switch (bl->async.op) {
+		case READ:
+		case WRITE:
+			/*
+			 * Very large caveat - do not remove this without auditing
+			 * this entire file. smart_write() and smart_erase() will
+			 * not 'just work' async. They will need work.
+			 */
+			bl->async.buf += len;
+			return FLASH_ERR_PARM_ERROR;
+		case ERASE:
+			rc = bl->erase(bl, bl->async.pos, len);
+			break;
+		default:
+			return FLASH_ERR_PARM_ERROR;
+	}
+
+	if (rc)
+		return rc;
+
+	bl->async.pos += len;
+	bl->async.len -= len;
+
+	if (bl->async.len == 0)
+		bl->async.active = false;
+
+	return bl->async.active ? FLASH_ASYNC_POLL : 0;
+}
+
 int blocklevel_raw_read(struct blocklevel_device *bl, uint64_t pos, void *buf, uint64_t len)
 {
 	int rc;
@@ -189,9 +233,9 @@  out:
 	return rc;
 }
 
-int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
+static int check_erase_params(struct blocklevel_device *bl, const char *caller,
+		uint64_t pos, uint64_t len)
 {
-	int rc;
 	if (!bl || !bl->erase) {
 		errno = EINVAL;
 		return FLASH_ERR_PARM_ERROR;
@@ -199,16 +243,48 @@  int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
 
 	/* Programmer may be making a horrible mistake without knowing it */
 	if (pos & bl->erase_mask) {
-		fprintf(stderr, "blocklevel_erase: pos (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
-				pos, bl->erase_mask + 1);
+		FL_ERR("%s: pos (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
+				caller, pos, bl->erase_mask + 1);
+		return FLASH_ERR_ERASE_BOUNDARY;
 	}
 
 	if (len & bl->erase_mask) {
-		fprintf(stderr, "blocklevel_erase: len (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
-				len, bl->erase_mask + 1);
+		FL_ERR("%s: len (0x%"PRIx64") is not erase block (0x%08x) aligned\n",
+				caller, len, bl->erase_mask + 1);
 		return FLASH_ERR_ERASE_BOUNDARY;
 	}
 
+	return 0;
+}
+
+int blocklevel_erase_async(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
+{
+	int rc;
+
+	rc = check_erase_params(bl, __func__, pos, len);
+	if (rc)
+		return rc;
+
+	if (bl->async.active)
+		return FLASH_ERR_PARM_ERROR;
+
+	bl->async.op = ERASE;
+	bl->async.pos = pos;
+	bl->async.len = len;
+	bl->async.active = true;
+
+	/* Call the poll once to get everything going */
+	return blocklevel_async_poll(bl);
+}
+
+int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len)
+{
+	int rc;
+
+	rc = check_erase_params(bl, __func__, pos, len);
+	if (rc)
+		return rc;
+
 	rc = reacquire(bl);
 	if (rc)
 		return rc;
diff --git a/libflash/blocklevel.h b/libflash/blocklevel.h
index ba42c83d..7079e2fc 100644
--- a/libflash/blocklevel.h
+++ b/libflash/blocklevel.h
@@ -34,6 +34,20 @@  enum blocklevel_flags {
 	WRITE_NEED_ERASE = 1,
 };
 
+enum blocklevel_async_op {
+	READ = 1,
+	WRITE,
+	ERASE
+};
+
+struct blocklevel_async {
+	bool active;
+	enum blocklevel_async_op op;
+	uint64_t pos;
+	uint64_t len;
+	void *buf;
+};
+
 /*
  * libffs may be used with different backends, all should provide these for
  * libflash to get the information it needs
@@ -56,6 +70,8 @@  struct blocklevel_device {
 	enum blocklevel_flags flags;
 
 	struct blocklevel_range ecc_prot;
+
+	struct blocklevel_async async;
 };
 int blocklevel_raw_read(struct blocklevel_device *bl, uint64_t pos, void *buf, uint64_t len);
 int blocklevel_read(struct blocklevel_device *bl, uint64_t pos, void *buf, uint64_t len);
@@ -65,6 +81,9 @@  int blocklevel_erase(struct blocklevel_device *bl, uint64_t pos, uint64_t len);
 int blocklevel_get_info(struct blocklevel_device *bl, const char **name, uint64_t *total_size,
 		uint32_t *erase_granule);
 
+int blocklevel_async_poll(struct blocklevel_device *bl);
+int blocklevel_erase_async(struct blocklevel_device *bl, uint64_t pos, uint64_t len);
+
 /*
  * blocklevel_smart_write() performs reads on the data to see if it
  * can skip erase or write calls. This is likely more convenient for
diff --git a/libflash/errors.h b/libflash/errors.h
index 2f567211..deda7b87 100644
--- a/libflash/errors.h
+++ b/libflash/errors.h
@@ -33,5 +33,19 @@ 
 #define FLASH_ERR_BAD_READ		15
 #define FLASH_ERR_DEVICE_GONE	16
 #define FLASH_ERR_AGAIN	17
+#define FLASH_ASYNC_POLL 18
+
+#ifdef __SKIBOOT__
+#include <skiboot.h>
+#define FL_INF(fmt...) do { prlog(PR_INFO, fmt);  } while(0)
+#define FL_DBG(fmt...) do { prlog(PR_DEBUG, fmt); } while(0)
+#define FL_ERR(fmt...) do { prlog(PR_ERR, fmt);   } while(0)
+#else
+#include <stdio.h>
+extern bool libflash_debug;
+#define FL_DBG(fmt...) do { if (libflash_debug) printf(fmt); } while(0)
+#define FL_INF(fmt...) do { printf(fmt); } while(0)
+#define FL_ERR(fmt...) do { fprintf(stderr, fmt); } while(0)
+#endif
 
 #endif /* __LIBFLASH_ERRORS_H */
diff --git a/libflash/libflash.h b/libflash/libflash.h
index 4fecfe75..ff3a9823 100644
--- a/libflash/libflash.h
+++ b/libflash/libflash.h
@@ -28,19 +28,6 @@ 
  */
 #include <libflash/errors.h>
 
-#ifdef __SKIBOOT__
-#include <skiboot.h>
-#define FL_INF(fmt...) do { prlog(PR_INFO, fmt);  } while(0)
-#define FL_DBG(fmt...) do { prlog(PR_DEBUG, fmt); } while(0)
-#define FL_ERR(fmt...) do { prlog(PR_ERR, fmt);   } while(0)
-#else
-#include <stdio.h>
-extern bool libflash_debug;
-#define FL_DBG(fmt...) do { if (libflash_debug) printf(fmt); } while(0)
-#define FL_INF(fmt...) do { printf(fmt); } while(0)
-#define FL_ERR(fmt...) do { printf(fmt); } while(0)
-#endif
-
 /* Flash chip, opaque */
 struct flash_chip;
 struct spi_flash_ctrl;