diff mbox

Block: don't do copy-on-read in before_write_notifier

Message ID 55D3EFF4.7030004@cn.fujitsu.com
State New
Headers show

Commit Message

Wen Congyang Aug. 19, 2015, 2:54 a.m. UTC
The backup job copies data from within before_write_notifier.
That read is a nested I/O request issued while the write is still in flight, so it must not do copy-on-read.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 block/backup.c        | 19 +++++++++++++------
 block/io.c            | 11 ++++++++++-
 include/block/block.h |  3 +++
 trace-events          |  1 +
 4 files changed, 27 insertions(+), 7 deletions(-)

Comments

Paolo Bonzini Aug. 19, 2015, 5:41 a.m. UTC | #1
On 18/08/2015 19:54, Wen Congyang wrote:
> We will copy data in before_write_notifier to do backup.
> It is a nested I/O request, so we cannot do copy-on-read.

Can you explain why?  What is the bug that this is fixing?

Paolo

> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Wen Congyang Aug. 19, 2015, 5:43 a.m. UTC | #2
On 08/19/2015 01:41 PM, Paolo Bonzini wrote:
> On 18/08/2015 19:54, Wen Congyang wrote:
>> We will copy data in before_write_notifier to do backup.
>> It is a nested I/O request, so we cannot do copy-on-read.
> 
> Can you explain why?  What is the bug that this is fixing?

(gdb) bt
#0  0x00007fd53a6cdb55 in raise () from /lib64/libc.so.6
#1  0x00007fd53a6cf131 in abort () from /lib64/libc.so.6
#2  0x00007fd53a6c6a10 in __assert_fail () from /lib64/libc.so.6
#3  0x00007fd53dffe5ad in wait_serialising_requests (self=0x7fd50cdb6ae0) at block/io.c:452
#4  0x00007fd53dfff351 in bdrv_aligned_preadv (bs=0x7fd53ea33130, req=0x7fd50cdb6ae0, offset=26347307008, bytes=65536, align=512, qiov=0x7fd50cdb6c90, flags=
    1) at block/io.c:847
#5  0x00007fd53dfff897 in bdrv_co_do_preadv (bs=0x7fd53ea33130, offset=26347307008, bytes=65536, qiov=0x7fd50cdb6c90, flags=BDRV_REQ_COPY_ON_READ)
    at block/io.c:970
#6  0x00007fd53dfff962 in bdrv_co_do_readv (bs=0x7fd53ea33130, sector_num=51459584, nb_sectors=128, qiov=0x7fd50cdb6c90, flags=0) at block/io.c:992
#7  0x00007fd53dfff9cf in bdrv_co_readv (bs=0x7fd53ea33130, sector_num=51459584, nb_sectors=128, qiov=0x7fd50cdb6c90) at block/io.c:1001
#8  0x00007fd53ddb077a in backup_do_cow (bs=0x7fd53ea33130, sector_num=51459648, nb_sectors=16, error_is_read=0x0) at block/backup.c:132
#9  0x00007fd53ddb0f07 in backup_before_write_notify (notifier=0x7fd5118c9f30, opaque=0x7fd50cdb6e40) at block/backup.c:193
#10 0x00007fd53e063193 in notifier_with_return_list_notify (list=0x7fd53ea361b8, data=0x7fd50cdb6e40) at util/notify.c:65
#11 0x00007fd53e000079 in bdrv_aligned_pwritev (bs=0x7fd53ea33130, req=0x7fd50cdb6e40, offset=26347339776, bytes=8192, qiov=0x7fd54001c848, flags=0)
    at block/io.c:1116
#12 0x00007fd53e000b4f in bdrv_co_do_pwritev (bs=0x7fd53ea33130, offset=26347339776, bytes=8192, qiov=0x7fd54001c848, flags=0) at block/io.c:1354
#13 0x00007fd53e000c18 in bdrv_co_do_writev (bs=0x7fd53ea33130, sector_num=51459648, nb_sectors=16, qiov=0x7fd54001c848, flags=0) at block/io.c:1378
#14 0x00007fd53e002dba in bdrv_co_do_rw (opaque=0x7fd53fb76830) at block/io.c:2113
#15 0x00007fd53dfafde9 in coroutine_trampoline (i0=1073594560, i1=32725) at coroutine-ucontext.c:80
#16 0x00007fd53a6debe0 in __correctly_grouped_prefixwc () from /lib64/libc.so.6
#17 0x0000000000000000 in ?? ()


> 
> Paolo
> 
>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
>
Jeff Cody Aug. 19, 2015, 5:02 p.m. UTC | #3
On Wed, Aug 19, 2015 at 01:43:41PM +0800, Wen Congyang wrote:
> On 08/19/2015 01:41 PM, Paolo Bonzini wrote:
> > On 18/08/2015 19:54, Wen Congyang wrote:
> >> We will copy data in before_write_notifier to do backup.
> >> It is a nested I/O request, so we cannot do copy-on-read.
> > 
> > Can you explain why?  What is the bug that this is fixing?
> 
> (gdb) bt
> #0  0x00007fd53a6cdb55 in raise () from /lib64/libc.so.6
> #1  0x00007fd53a6cf131 in abort () from /lib64/libc.so.6
> #2  0x00007fd53a6c6a10 in __assert_fail () from /lib64/libc.so.6
> #3  0x00007fd53dffe5ad in wait_serialising_requests (self=0x7fd50cdb6ae0) at block/io.c:452
> #4  0x00007fd53dfff351 in bdrv_aligned_preadv (bs=0x7fd53ea33130, req=0x7fd50cdb6ae0, offset=26347307008, bytes=65536, align=512, qiov=0x7fd50cdb6c90, flags=
>     1) at block/io.c:847
> #5  0x00007fd53dfff897 in bdrv_co_do_preadv (bs=0x7fd53ea33130, offset=26347307008, bytes=65536, qiov=0x7fd50cdb6c90, flags=BDRV_REQ_COPY_ON_READ)
>     at block/io.c:970
> #6  0x00007fd53dfff962 in bdrv_co_do_readv (bs=0x7fd53ea33130, sector_num=51459584, nb_sectors=128, qiov=0x7fd50cdb6c90, flags=0) at block/io.c:992
> #7  0x00007fd53dfff9cf in bdrv_co_readv (bs=0x7fd53ea33130, sector_num=51459584, nb_sectors=128, qiov=0x7fd50cdb6c90) at block/io.c:1001
> #8  0x00007fd53ddb077a in backup_do_cow (bs=0x7fd53ea33130, sector_num=51459648, nb_sectors=16, error_is_read=0x0) at block/backup.c:132
> #9  0x00007fd53ddb0f07 in backup_before_write_notify (notifier=0x7fd5118c9f30, opaque=0x7fd50cdb6e40) at block/backup.c:193
> #10 0x00007fd53e063193 in notifier_with_return_list_notify (list=0x7fd53ea361b8, data=0x7fd50cdb6e40) at util/notify.c:65
> #11 0x00007fd53e000079 in bdrv_aligned_pwritev (bs=0x7fd53ea33130, req=0x7fd50cdb6e40, offset=26347339776, bytes=8192, qiov=0x7fd54001c848, flags=0)
>     at block/io.c:1116
> #12 0x00007fd53e000b4f in bdrv_co_do_pwritev (bs=0x7fd53ea33130, offset=26347339776, bytes=8192, qiov=0x7fd54001c848, flags=0) at block/io.c:1354
> #13 0x00007fd53e000c18 in bdrv_co_do_writev (bs=0x7fd53ea33130, sector_num=51459648, nb_sectors=16, qiov=0x7fd54001c848, flags=0) at block/io.c:1378
> #14 0x00007fd53e002dba in bdrv_co_do_rw (opaque=0x7fd53fb76830) at block/io.c:2113
> #15 0x00007fd53dfafde9 in coroutine_trampoline (i0=1073594560, i1=32725) at coroutine-ucontext.c:80
> #16 0x00007fd53a6debe0 in __correctly_grouped_prefixwc () from /lib64/libc.so.6
> #17 0x0000000000000000 in ?? ()
> 

Can you give the steps used to reproduce this?  I ask because I am
wondering if it would be worth adding an iotest for this or similar
scenarios.

Thanks,
Jeff
Wen Congyang Aug. 20, 2015, 12:46 a.m. UTC | #4
On 08/20/2015 01:02 AM, Jeff Cody wrote:
> On Wed, Aug 19, 2015 at 01:43:41PM +0800, Wen Congyang wrote:
>> On 08/19/2015 01:41 PM, Paolo Bonzini wrote:
>>> On 18/08/2015 19:54, Wen Congyang wrote:
>>>> We will copy data in before_write_notifier to do backup.
>>>> It is a nested I/O request, so we cannot do copy-on-read.
>>>
>>> Can you explain why?  What is the bug that this is fixing?
>>
>> (gdb) bt
>> #0  0x00007fd53a6cdb55 in raise () from /lib64/libc.so.6
>> #1  0x00007fd53a6cf131 in abort () from /lib64/libc.so.6
>> #2  0x00007fd53a6c6a10 in __assert_fail () from /lib64/libc.so.6
>> #3  0x00007fd53dffe5ad in wait_serialising_requests (self=0x7fd50cdb6ae0) at block/io.c:452
>> #4  0x00007fd53dfff351 in bdrv_aligned_preadv (bs=0x7fd53ea33130, req=0x7fd50cdb6ae0, offset=26347307008, bytes=65536, align=512, qiov=0x7fd50cdb6c90, flags=
>>     1) at block/io.c:847
>> #5  0x00007fd53dfff897 in bdrv_co_do_preadv (bs=0x7fd53ea33130, offset=26347307008, bytes=65536, qiov=0x7fd50cdb6c90, flags=BDRV_REQ_COPY_ON_READ)
>>     at block/io.c:970
>> #6  0x00007fd53dfff962 in bdrv_co_do_readv (bs=0x7fd53ea33130, sector_num=51459584, nb_sectors=128, qiov=0x7fd50cdb6c90, flags=0) at block/io.c:992
>> #7  0x00007fd53dfff9cf in bdrv_co_readv (bs=0x7fd53ea33130, sector_num=51459584, nb_sectors=128, qiov=0x7fd50cdb6c90) at block/io.c:1001
>> #8  0x00007fd53ddb077a in backup_do_cow (bs=0x7fd53ea33130, sector_num=51459648, nb_sectors=16, error_is_read=0x0) at block/backup.c:132
>> #9  0x00007fd53ddb0f07 in backup_before_write_notify (notifier=0x7fd5118c9f30, opaque=0x7fd50cdb6e40) at block/backup.c:193
>> #10 0x00007fd53e063193 in notifier_with_return_list_notify (list=0x7fd53ea361b8, data=0x7fd50cdb6e40) at util/notify.c:65
>> #11 0x00007fd53e000079 in bdrv_aligned_pwritev (bs=0x7fd53ea33130, req=0x7fd50cdb6e40, offset=26347339776, bytes=8192, qiov=0x7fd54001c848, flags=0)
>>     at block/io.c:1116
>> #12 0x00007fd53e000b4f in bdrv_co_do_pwritev (bs=0x7fd53ea33130, offset=26347339776, bytes=8192, qiov=0x7fd54001c848, flags=0) at block/io.c:1354
>> #13 0x00007fd53e000c18 in bdrv_co_do_writev (bs=0x7fd53ea33130, sector_num=51459648, nb_sectors=16, qiov=0x7fd54001c848, flags=0) at block/io.c:1378
>> #14 0x00007fd53e002dba in bdrv_co_do_rw (opaque=0x7fd53fb76830) at block/io.c:2113
>> #15 0x00007fd53dfafde9 in coroutine_trampoline (i0=1073594560, i1=32725) at coroutine-ucontext.c:80
>> #16 0x00007fd53a6debe0 in __correctly_grouped_prefixwc () from /lib64/libc.so.6
>> #17 0x0000000000000000 in ?? ()
>>
> 
> Can you give the steps used to reproduce this?  I ask because I am
> wondering if it would be worth adding an iotest for this or similar
> scenarios.

It is very easy to reproduce it:
1. -drive copy-on-read=on,...  // qemu option
2. drive_backup -f disk0 /path_to_backup.img // monitor command

Thanks
Wen Congyang

> 
> Thanks,
> Jeff
> .
>
Stefan Hajnoczi Sept. 2, 2015, 2:18 p.m. UTC | #5
On Wed, Aug 19, 2015 at 10:54:44AM +0800, Wen Congyang wrote:
> We will copy data in before_write_notifier to do backup.
> It is a nested I/O request, so we cannot do copy-on-read.
> 
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> ---
>  block/backup.c        | 19 +++++++++++++------
>  block/io.c            | 11 ++++++++++-
>  include/block/block.h |  3 +++
>  trace-events          |  1 +
>  4 files changed, 27 insertions(+), 7 deletions(-)
> 
> diff --git a/block/backup.c b/block/backup.c
> index 965654d..b729c4b 100644
> --- a/block/backup.c
> +++ b/block/backup.c
> @@ -89,7 +89,8 @@ static void cow_request_end(CowRequest *req)
>  
>  static int coroutine_fn backup_do_cow(BlockDriverState *bs,
>                                        int64_t sector_num, int nb_sectors,
> -                                      bool *error_is_read)
> +                                      bool *error_is_read,
> +                                      bool is_write_notifier)
>  {
>      BackupBlockJob *job = (BackupBlockJob *)bs->job;
>      CowRequest cow_request;
> @@ -129,8 +130,13 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
>          iov.iov_len = n * BDRV_SECTOR_SIZE;
>          qemu_iovec_init_external(&bounce_qiov, &iov, 1);
>  
> -        ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
> -                            &bounce_qiov);
> +        if (is_write_notifier) {
> +            ret = bdrv_co_nested_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER,
> +                                       n, &bounce_qiov);
> +        } else {
> +            ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
> +                                &bounce_qiov);
> +        }
>          if (ret < 0) {
>              trace_backup_do_cow_read_fail(job, start, ret);
>              if (error_is_read) {
> @@ -190,7 +196,7 @@ static int coroutine_fn backup_before_write_notify(
>      assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
>      assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
>  
> -    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL);
> +    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
>  }
>  
>  static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
> @@ -303,7 +309,8 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
>                      return ret;
>                  }
>                  ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
> -                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
> +                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read,
> +                                    false);
>                  if ((ret < 0) &&
>                      backup_error_action(job, error_is_read, -ret) ==
>                      BLOCK_ERROR_ACTION_REPORT) {
> @@ -408,7 +415,7 @@ static void coroutine_fn backup_run(void *opaque)
>              }
>              /* FULL sync mode we copy the whole drive. */
>              ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
> -                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
> +                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read, false);
>              if (ret < 0) {
>                  /* Depending on error action, fail now or retry cluster */
>                  BlockErrorAction action =
> diff --git a/block/io.c b/block/io.c
> index d4bc83b..04325f9 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -932,7 +932,8 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
>          return ret;
>      }
>  
> -    if (bs->copy_on_read) {
> +    /* Don't do copy-on-read if we read data before write operation */
> +    if (bs->copy_on_read && !(flags & BDRV_REQ_NESTED)) {
>          flags |= BDRV_REQ_COPY_ON_READ;
>      }
>  
> @@ -1001,6 +1002,14 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
>      return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
>  }
>  
> +int coroutine_fn bdrv_co_nested_readv(BlockDriverState *bs,
> +    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
> +{
> +    trace_bdrv_co_nested_readv(bs, sector_num, nb_sectors);
> +
> +    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, BDRV_REQ_NESTED);
> +}
> +
>  int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
>      int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
>  {
> diff --git a/include/block/block.h b/include/block/block.h
> index 608cd4e..f5578b2 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -60,6 +60,7 @@ typedef enum {
>       * opened with BDRV_O_UNMAP.
>       */
>      BDRV_REQ_MAY_UNMAP    = 0x4,
> +    BDRV_REQ_NESTED       = 0x8,
>  } BdrvRequestFlags;
>  
>  typedef struct BlockSizes {
> @@ -253,6 +254,8 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
>      int nb_sectors, QEMUIOVector *qiov);
>  int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
>      int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
> +int coroutine_fn bdrv_co_nested_readv(BlockDriverState *bs,
> +    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
>  int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
>      int nb_sectors, QEMUIOVector *qiov);
>  /*
> diff --git a/trace-events b/trace-events
> index 8f9614a..e29e1cf 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -69,6 +69,7 @@ bdrv_aio_write_zeroes(void *bs, int64_t sector_num, int nb_sectors, int flags, v
>  bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
>  bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
>  bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
> +bdrv_co_nested_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
>  bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
>  bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector, int flags) "bs %p sector_num %"PRId64" nb_sectors %d flags %#x"
>  bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"

This solution looks good to me.

I think the BDRV_REQ_NESTED and bdrv_co_nested_readv() names are too
vague, especially since there is no documentation about what "nested"
means here.  I'm afraid the flag will be (ab)used for other stuff in the
future and we'll end up with confusing/broken semantics.

Please call it BDRV_REQ_NO_COPY_ON_READ so it's clear what this flag
does.
Jeff Cody Sept. 2, 2015, 2:23 p.m. UTC | #6
On Wed, Sep 02, 2015 at 03:18:54PM +0100, Stefan Hajnoczi wrote:
> On Wed, Aug 19, 2015 at 10:54:44AM +0800, Wen Congyang wrote:
> > We will copy data in before_write_notifier to do backup.
> > It is a nested I/O request, so we cannot do copy-on-read.
> > 
> > Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> > ---
> >  block/backup.c        | 19 +++++++++++++------
> >  block/io.c            | 11 ++++++++++-
> >  include/block/block.h |  3 +++
> >  trace-events          |  1 +
> >  4 files changed, 27 insertions(+), 7 deletions(-)
> > 
> > diff --git a/block/backup.c b/block/backup.c
> > index 965654d..b729c4b 100644
> > --- a/block/backup.c
> > +++ b/block/backup.c
> > @@ -89,7 +89,8 @@ static void cow_request_end(CowRequest *req)
> >  
> >  static int coroutine_fn backup_do_cow(BlockDriverState *bs,
> >                                        int64_t sector_num, int nb_sectors,
> > -                                      bool *error_is_read)
> > +                                      bool *error_is_read,
> > +                                      bool is_write_notifier)
> >  {
> >      BackupBlockJob *job = (BackupBlockJob *)bs->job;
> >      CowRequest cow_request;
> > @@ -129,8 +130,13 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
> >          iov.iov_len = n * BDRV_SECTOR_SIZE;
> >          qemu_iovec_init_external(&bounce_qiov, &iov, 1);
> >  
> > -        ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
> > -                            &bounce_qiov);
> > +        if (is_write_notifier) {
> > +            ret = bdrv_co_nested_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER,
> > +                                       n, &bounce_qiov);
> > +        } else {
> > +            ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
> > +                                &bounce_qiov);
> > +        }
> >          if (ret < 0) {
> >              trace_backup_do_cow_read_fail(job, start, ret);
> >              if (error_is_read) {
> > @@ -190,7 +196,7 @@ static int coroutine_fn backup_before_write_notify(
> >      assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
> >      assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
> >  
> > -    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL);
> > +    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
> >  }
> >  
> >  static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
> > @@ -303,7 +309,8 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
> >                      return ret;
> >                  }
> >                  ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
> > -                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
> > +                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read,
> > +                                    false);
> >                  if ((ret < 0) &&
> >                      backup_error_action(job, error_is_read, -ret) ==
> >                      BLOCK_ERROR_ACTION_REPORT) {
> > @@ -408,7 +415,7 @@ static void coroutine_fn backup_run(void *opaque)
> >              }
> >              /* FULL sync mode we copy the whole drive. */
> >              ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
> > -                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
> > +                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read, false);
> >              if (ret < 0) {
> >                  /* Depending on error action, fail now or retry cluster */
> >                  BlockErrorAction action =
> > diff --git a/block/io.c b/block/io.c
> > index d4bc83b..04325f9 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -932,7 +932,8 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
> >          return ret;
> >      }
> >  
> > -    if (bs->copy_on_read) {
> > +    /* Don't do copy-on-read if we read data before write operation */
> > +    if (bs->copy_on_read && !(flags & BDRV_REQ_NESTED)) {
> >          flags |= BDRV_REQ_COPY_ON_READ;
> >      }
> >  
> > @@ -1001,6 +1002,14 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
> >      return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
> >  }
> >  
> > +int coroutine_fn bdrv_co_nested_readv(BlockDriverState *bs,
> > +    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
> > +{
> > +    trace_bdrv_co_nested_readv(bs, sector_num, nb_sectors);
> > +
> > +    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, BDRV_REQ_NESTED);
> > +}
> > +
> >  int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
> >      int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
> >  {
> > diff --git a/include/block/block.h b/include/block/block.h
> > index 608cd4e..f5578b2 100644
> > --- a/include/block/block.h
> > +++ b/include/block/block.h
> > @@ -60,6 +60,7 @@ typedef enum {
> >       * opened with BDRV_O_UNMAP.
> >       */
> >      BDRV_REQ_MAY_UNMAP    = 0x4,
> > +    BDRV_REQ_NESTED       = 0x8,
> >  } BdrvRequestFlags;
> >  
> >  typedef struct BlockSizes {
> > @@ -253,6 +254,8 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
> >      int nb_sectors, QEMUIOVector *qiov);
> >  int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
> >      int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
> > +int coroutine_fn bdrv_co_nested_readv(BlockDriverState *bs,
> > +    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
> >  int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
> >      int nb_sectors, QEMUIOVector *qiov);
> >  /*
> > diff --git a/trace-events b/trace-events
> > index 8f9614a..e29e1cf 100644
> > --- a/trace-events
> > +++ b/trace-events
> > @@ -69,6 +69,7 @@ bdrv_aio_write_zeroes(void *bs, int64_t sector_num, int nb_sectors, int flags, v
> >  bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
> >  bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
> >  bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
> > +bdrv_co_nested_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
> >  bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
> >  bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector, int flags) "bs %p sector_num %"PRId64" nb_sectors %d flags %#x"
> >  bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
> 
> This solution looks good to me.
> 
> I think the BDRV_REQ_NESTED and bdrv_co_nested_readv() name is too
> vague, especially since there is no documentation about what "nested"
> means here.  I'm afraid the flag will be (ab)used for other stuff in the
> future and we'll end up with confusing/broken semantics.
> 
> Please call it BDRV_REQ_NO_COPY_ON_READ so it's clear what this flag
> does.
>

I was able to test his solution and can verify that it fixes the
problem, so I can give:

Tested-by: Jeff Cody <jcody@redhat.com>

If you are re-spinning this patch, could you also split it in two:
one patch for the core block I/O changes, and a second patch for
using the new nested readv function in backup.c?

Thanks,

Jeff
Kevin Wolf Sept. 3, 2015, 3:44 p.m. UTC | #7
Am 02.09.2015 um 16:18 hat Stefan Hajnoczi geschrieben:
> On Wed, Aug 19, 2015 at 10:54:44AM +0800, Wen Congyang wrote:
> > We will copy data in before_write_notifier to do backup.
> > It is a nested I/O request, so we cannot do copy-on-read.
> > 
> > Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> > ---
> >  block/backup.c        | 19 +++++++++++++------
> >  block/io.c            | 11 ++++++++++-
> >  include/block/block.h |  3 +++
> >  trace-events          |  1 +
> >  4 files changed, 27 insertions(+), 7 deletions(-)
> > 
> > diff --git a/block/backup.c b/block/backup.c
> > index 965654d..b729c4b 100644
> > --- a/block/backup.c
> > +++ b/block/backup.c
> > @@ -89,7 +89,8 @@ static void cow_request_end(CowRequest *req)
> >  
> >  static int coroutine_fn backup_do_cow(BlockDriverState *bs,
> >                                        int64_t sector_num, int nb_sectors,
> > -                                      bool *error_is_read)
> > +                                      bool *error_is_read,
> > +                                      bool is_write_notifier)
> >  {
> >      BackupBlockJob *job = (BackupBlockJob *)bs->job;
> >      CowRequest cow_request;
> > @@ -129,8 +130,13 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
> >          iov.iov_len = n * BDRV_SECTOR_SIZE;
> >          qemu_iovec_init_external(&bounce_qiov, &iov, 1);
> >  
> > -        ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
> > -                            &bounce_qiov);
> > +        if (is_write_notifier) {
> > +            ret = bdrv_co_nested_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER,
> > +                                       n, &bounce_qiov);
> > +        } else {
> > +            ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
> > +                                &bounce_qiov);
> > +        }
> >          if (ret < 0) {
> >              trace_backup_do_cow_read_fail(job, start, ret);
> >              if (error_is_read) {
> > @@ -190,7 +196,7 @@ static int coroutine_fn backup_before_write_notify(
> >      assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
> >      assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
> >  
> > -    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL);
> > +    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
> >  }
> >  
> >  static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
> > @@ -303,7 +309,8 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
> >                      return ret;
> >                  }
> >                  ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
> > -                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
> > +                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read,
> > +                                    false);
> >                  if ((ret < 0) &&
> >                      backup_error_action(job, error_is_read, -ret) ==
> >                      BLOCK_ERROR_ACTION_REPORT) {
> > @@ -408,7 +415,7 @@ static void coroutine_fn backup_run(void *opaque)
> >              }
> >              /* FULL sync mode we copy the whole drive. */
> >              ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
> > -                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
> > +                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read, false);
> >              if (ret < 0) {
> >                  /* Depending on error action, fail now or retry cluster */
> >                  BlockErrorAction action =
> > diff --git a/block/io.c b/block/io.c
> > index d4bc83b..04325f9 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -932,7 +932,8 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
> >          return ret;
> >      }
> >  
> > -    if (bs->copy_on_read) {
> > +    /* Don't do copy-on-read if we read data before write operation */
> > +    if (bs->copy_on_read && !(flags & BDRV_REQ_NESTED)) {
> >          flags |= BDRV_REQ_COPY_ON_READ;
> >      }
> >  
> > @@ -1001,6 +1002,14 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
> >      return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
> >  }
> >  
> > +int coroutine_fn bdrv_co_nested_readv(BlockDriverState *bs,
> > +    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
> > +{
> > +    trace_bdrv_co_nested_readv(bs, sector_num, nb_sectors);
> > +
> > +    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, BDRV_REQ_NESTED);
> > +}
> > +
> >  int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
> >      int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
> >  {
> > diff --git a/include/block/block.h b/include/block/block.h
> > index 608cd4e..f5578b2 100644
> > --- a/include/block/block.h
> > +++ b/include/block/block.h
> > @@ -60,6 +60,7 @@ typedef enum {
> >       * opened with BDRV_O_UNMAP.
> >       */
> >      BDRV_REQ_MAY_UNMAP    = 0x4,
> > +    BDRV_REQ_NESTED       = 0x8,
> >  } BdrvRequestFlags;
> >  
> >  typedef struct BlockSizes {
> > @@ -253,6 +254,8 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
> >      int nb_sectors, QEMUIOVector *qiov);
> >  int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
> >      int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
> > +int coroutine_fn bdrv_co_nested_readv(BlockDriverState *bs,
> > +    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
> >  int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
> >      int nb_sectors, QEMUIOVector *qiov);
> >  /*
> > diff --git a/trace-events b/trace-events
> > index 8f9614a..e29e1cf 100644
> > --- a/trace-events
> > +++ b/trace-events
> > @@ -69,6 +69,7 @@ bdrv_aio_write_zeroes(void *bs, int64_t sector_num, int nb_sectors, int flags, v
> >  bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
> >  bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
> >  bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
> > +bdrv_co_nested_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
> >  bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
> >  bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector, int flags) "bs %p sector_num %"PRId64" nb_sectors %d flags %#x"
> >  bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
> 
> This solution looks good to me.
> 
> I think the BDRV_REQ_NESTED and bdrv_co_nested_readv() name is too
> vague, especially since there is no documentation about what "nested"
> means here.  I'm afraid the flag will be (ab)used for other stuff in the
> future and we'll end up with confusing/broken semantics.
> 
> Please call it BDRV_REQ_NO_COPY_ON_READ so it's clear what this flag
> does.

It also makes clear that it's an ugly hack. :-)

The scenario that is fixed here is one of the reasons why I disliked
these notifiers from the beginning. I still hope that we can move to
filter BDSes at some point and get rid of the hacks that were required
in the generic block layer to make things work without filters.

Kevin
diff mbox

Patch

diff --git a/block/backup.c b/block/backup.c
index 965654d..b729c4b 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -89,7 +89,8 @@  static void cow_request_end(CowRequest *req)
 
 static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                                       int64_t sector_num, int nb_sectors,
-                                      bool *error_is_read)
+                                      bool *error_is_read,
+                                      bool is_write_notifier)
 {
     BackupBlockJob *job = (BackupBlockJob *)bs->job;
     CowRequest cow_request;
@@ -129,8 +130,13 @@  static int coroutine_fn backup_do_cow(BlockDriverState *bs,
         iov.iov_len = n * BDRV_SECTOR_SIZE;
         qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 
-        ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
-                            &bounce_qiov);
+        if (is_write_notifier) {
+            ret = bdrv_co_nested_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER,
+                                       n, &bounce_qiov);
+        } else {
+            ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
+                                &bounce_qiov);
+        }
         if (ret < 0) {
             trace_backup_do_cow_read_fail(job, start, ret);
             if (error_is_read) {
@@ -190,7 +196,7 @@  static int coroutine_fn backup_before_write_notify(
     assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
-    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL);
+    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
 }
 
 static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -303,7 +309,8 @@  static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
                     return ret;
                 }
                 ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
-                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
+                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read,
+                                    false);
                 if ((ret < 0) &&
                     backup_error_action(job, error_is_read, -ret) ==
                     BLOCK_ERROR_ACTION_REPORT) {
@@ -408,7 +415,7 @@  static void coroutine_fn backup_run(void *opaque)
             }
             /* FULL sync mode we copy the whole drive. */
             ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
-                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
+                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read, false);
             if (ret < 0) {
                 /* Depending on error action, fail now or retry cluster */
                 BlockErrorAction action =
diff --git a/block/io.c b/block/io.c
index d4bc83b..04325f9 100644
--- a/block/io.c
+++ b/block/io.c
@@ -932,7 +932,8 @@  static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
         return ret;
     }
 
-    if (bs->copy_on_read) {
+    /* Don't do copy-on-read if we read data before write operation */
+    if (bs->copy_on_read && !(flags & BDRV_REQ_NESTED)) {
         flags |= BDRV_REQ_COPY_ON_READ;
     }
 
@@ -1001,6 +1002,14 @@  int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
 }
 
+int coroutine_fn bdrv_co_nested_readv(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+    trace_bdrv_co_nested_readv(bs, sector_num, nb_sectors);
+
+    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, BDRV_REQ_NESTED);
+}
+
 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
diff --git a/include/block/block.h b/include/block/block.h
index 608cd4e..f5578b2 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -60,6 +60,7 @@  typedef enum {
      * opened with BDRV_O_UNMAP.
      */
     BDRV_REQ_MAY_UNMAP    = 0x4,
+    BDRV_REQ_NESTED       = 0x8,
 } BdrvRequestFlags;
 
 typedef struct BlockSizes {
@@ -253,6 +254,8 @@  int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+int coroutine_fn bdrv_co_nested_readv(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
 /*
diff --git a/trace-events b/trace-events
index 8f9614a..e29e1cf 100644
--- a/trace-events
+++ b/trace-events
@@ -69,6 +69,7 @@  bdrv_aio_write_zeroes(void *bs, int64_t sector_num, int nb_sectors, int flags, v
 bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
 bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
+bdrv_co_nested_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector, int flags) "bs %p sector_num %"PRId64" nb_sectors %d flags %#x"
 bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"