diff mbox

[v7,3/8] mirror: Do zero write on target if sectors not allocated

Message ID 1433742974-20128-4-git-send-email-famz@redhat.com
State New
Headers show

Commit Message

Fam Zheng June 8, 2015, 5:56 a.m. UTC
If the guest discards a source cluster, mirroring with bdrv_aio_readv is overkill.
Some protocols do zero upon discard, in which case it's best to use
bdrv_aio_write_zeroes; otherwise, bdrv_aio_discard will be enough.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 block/mirror.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

Comments

Kevin Wolf Nov. 4, 2015, 6:35 p.m. UTC | #1
Am 08.06.2015 um 07:56 hat Fam Zheng geschrieben:
> If guest discards a source cluster, mirroring with bdrv_aio_readv is overkill.
> Some protocols do zero upon discard, where it's best to use
> bdrv_aio_write_zeroes, otherwise, bdrv_aio_discard will be enough.
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---
>  block/mirror.c | 20 ++++++++++++++++++--
>  1 file changed, 18 insertions(+), 2 deletions(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index d2515c7..3c38695 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -164,6 +164,8 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>      int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
>      uint64_t delay_ns = 0;
>      MirrorOp *op;
> +    int pnum;
> +    int64_t ret;
>  
>      s->sector_num = hbitmap_iter_next(&s->hbi);
>      if (s->sector_num < 0) {
> @@ -290,8 +292,22 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
>      s->in_flight++;
>      s->sectors_in_flight += nb_sectors;
>      trace_mirror_one_iteration(s, sector_num, nb_sectors);
> -    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> -                   mirror_read_complete, op);
> +
> +    ret = bdrv_get_block_status_above(source, NULL, sector_num,
> +                                      nb_sectors, &pnum);
> +    if (ret < 0 || pnum < nb_sectors ||

Earlier today I told Richard Jones that qemu-img commit should really
be using zero cluster support in the backing file since 2.4 because I
remembered this commit. Turns out it doesn't actually use it but writes
explicit zeros instead.

The reason is the condition 'pnum < nb_sectors' here, which makes mirror
fall back to explicit writes if bdrv_get_block_status_above() doesn't
return enough sectors (enough being relatively large here, I think in
qemu-img commit it's always the full 10 MB buffer).

In other words, we are ignoring any zero areas smaller than 10 MB!

(What made this worse is that qcow2 had a bug that reported only a single
zero cluster at a time, so it would never report more than 10 MB, even
if the image was completely zeroed. I've sent a fix for that one.)

In order to fix this, we'll probably need to move the call to
bdrv_get_block_status_above() before actually allocating memory and
all that for the full nb_chunks. We should detect zeros on the usual
block job granularity (64k by default, I think).

> +            (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
> +        bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> +                       mirror_read_complete, op);
> +    } else if (ret & BDRV_BLOCK_ZERO) {
> +        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
> +                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
> +                              mirror_write_complete, op);
> +    } else {
> +        assert(!(ret & BDRV_BLOCK_DATA));
> +        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
> +                         mirror_write_complete, op);
> +    }
>      return delay_ns;
>  }

Paolo also noticed that there's no reason at all to allocate buffers
and a qiov for the write_zeroes and discard cases.

Kevin
Fam Zheng Nov. 5, 2015, 5:42 a.m. UTC | #2
On Wed, 11/04 19:35, Kevin Wolf wrote:
> Am 08.06.2015 um 07:56 hat Fam Zheng geschrieben:
> > If guest discards a source cluster, mirroring with bdrv_aio_readv is overkill.
> > Some protocols do zero upon discard, where it's best to use
> > bdrv_aio_write_zeroes, otherwise, bdrv_aio_discard will be enough.
> > 
> > Signed-off-by: Fam Zheng <famz@redhat.com>
> > ---
> >  block/mirror.c | 20 ++++++++++++++++++--
> >  1 file changed, 18 insertions(+), 2 deletions(-)
> > 
> > diff --git a/block/mirror.c b/block/mirror.c
> > index d2515c7..3c38695 100644
> > --- a/block/mirror.c
> > +++ b/block/mirror.c
> > @@ -164,6 +164,8 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
> >      int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
> >      uint64_t delay_ns = 0;
> >      MirrorOp *op;
> > +    int pnum;
> > +    int64_t ret;
> >  
> >      s->sector_num = hbitmap_iter_next(&s->hbi);
> >      if (s->sector_num < 0) {
> > @@ -290,8 +292,22 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
> >      s->in_flight++;
> >      s->sectors_in_flight += nb_sectors;
> >      trace_mirror_one_iteration(s, sector_num, nb_sectors);
> > -    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> > -                   mirror_read_complete, op);
> > +
> > +    ret = bdrv_get_block_status_above(source, NULL, sector_num,
> > +                                      nb_sectors, &pnum);
> > +    if (ret < 0 || pnum < nb_sectors ||
> 
> Earlier today I told Richard Jones that qemu-img commit should really
> be using zero cluster support in the backing file since 2.4 because I
> remembered this commit. Turns out it doesn't actually use it but writes
> explicit zeros instead.
> 
> The reason is the condition 'pnum < nb_sectors' here, which makes mirror
> fall back to explicit writes if bdrv_get_block_status_above() doesn't
> return enough sectors (enough being relatively large here, I think in
> qemu-img commit it's always the full 10 MB buffer).
> 
> In other words, we are ignoring any zero areas smaller than 10 MB!
> 
> (What made this worse is that qcow2 had a bug that reports only a single
> zero cluster at a time, so it would never report more than 10 MB, even
> if the image was completely zeroed. I've sent a fix for that one.)
> 
> In order to fix this, we'll probably need to move the call to
> bdrv_get_block_status_above() before actually allocating memory and
> all that for the full nb_chunks. We should detect zeros on the usual
> block job granularity (64k by default, I think).
> 
> > +            (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
> > +        bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> > +                       mirror_read_complete, op);
> > +    } else if (ret & BDRV_BLOCK_ZERO) {
> > +        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
> > +                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
> > +                              mirror_write_complete, op);
> > +    } else {
> > +        assert(!(ret & BDRV_BLOCK_DATA));
> > +        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
> > +                         mirror_write_complete, op);
> > +    }
> >      return delay_ns;
> >  }
> 
> Paolo also noticed that there's no reason at all to allocate buffers
> and a qiov for the write_zeroes and discard cases.

I'll write a patch to address these. Thanks!

Fam
Kevin Wolf Nov. 5, 2015, 9:55 a.m. UTC | #3
Am 05.11.2015 um 06:42 hat Fam Zheng geschrieben:
> On Wed, 11/04 19:35, Kevin Wolf wrote:
> > Am 08.06.2015 um 07:56 hat Fam Zheng geschrieben:
> > > If guest discards a source cluster, mirroring with bdrv_aio_readv is overkill.
> > > Some protocols do zero upon discard, where it's best to use
> > > bdrv_aio_write_zeroes, otherwise, bdrv_aio_discard will be enough.
> > > 
> > > Signed-off-by: Fam Zheng <famz@redhat.com>
> > > ---
> > >  block/mirror.c | 20 ++++++++++++++++++--
> > >  1 file changed, 18 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/block/mirror.c b/block/mirror.c
> > > index d2515c7..3c38695 100644
> > > --- a/block/mirror.c
> > > +++ b/block/mirror.c
> > > @@ -164,6 +164,8 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
> > >      int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
> > >      uint64_t delay_ns = 0;
> > >      MirrorOp *op;
> > > +    int pnum;
> > > +    int64_t ret;
> > >  
> > >      s->sector_num = hbitmap_iter_next(&s->hbi);
> > >      if (s->sector_num < 0) {
> > > @@ -290,8 +292,22 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
> > >      s->in_flight++;
> > >      s->sectors_in_flight += nb_sectors;
> > >      trace_mirror_one_iteration(s, sector_num, nb_sectors);
> > > -    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> > > -                   mirror_read_complete, op);
> > > +
> > > +    ret = bdrv_get_block_status_above(source, NULL, sector_num,
> > > +                                      nb_sectors, &pnum);
> > > +    if (ret < 0 || pnum < nb_sectors ||
> > 
> > Earlier today I told Richard Jones that qemu-img commit should really
> > be using zero cluster support in the backing file since 2.4 because I
> > remembered this commit. Turns out it doesn't actually use it but writes
> > explicit zeros instead.
> > 
> > The reason is the condition 'pnum < nb_sectors' here, which makes mirror
> > fall back to explicit writes if bdrv_get_block_status_above() doesn't
> > return enough sectors (enough being relatively large here, I think in
> > qemu-img commit it's always the full 10 MB buffer).
> > 
> > In other words, we are ignoring any zero areas smaller than 10 MB!
> > 
> > (What made this worse is that qcow2 had a bug that reports only a single
> > zero cluster at a time, so it would never report more than 10 MB, even
> > if the image was completely zeroed. I've sent a fix for that one.)
> > 
> > In order to fix this, we'll probably need to move the call to
> > bdrv_get_block_status_above() before actually allocating memory and
> > all that for the full nb_chunks. We should detect zeros on the usual
> > block job granularity (64k by default, I think).
> > 
> > > +            (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
> > > +        bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
> > > +                       mirror_read_complete, op);
> > > +    } else if (ret & BDRV_BLOCK_ZERO) {
> > > +        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
> > > +                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
> > > +                              mirror_write_complete, op);
> > > +    } else {
> > > +        assert(!(ret & BDRV_BLOCK_DATA));
> > > +        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
> > > +                         mirror_write_complete, op);
> > > +    }
> > >      return delay_ns;
> > >  }
> > 
> > Paolo also noticed that there's no reason at all to allocate buffers
> > and a qiov for the write_zeroes and discard cases.
> 
> I'll write a patch to address these. Thanks!

Thanks, Fam!

Kevin
diff mbox

Patch

diff --git a/block/mirror.c b/block/mirror.c
index d2515c7..3c38695 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -164,6 +164,8 @@  static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
     uint64_t delay_ns = 0;
     MirrorOp *op;
+    int pnum;
+    int64_t ret;
 
     s->sector_num = hbitmap_iter_next(&s->hbi);
     if (s->sector_num < 0) {
@@ -290,8 +292,22 @@  static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     s->in_flight++;
     s->sectors_in_flight += nb_sectors;
     trace_mirror_one_iteration(s, sector_num, nb_sectors);
-    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
-                   mirror_read_complete, op);
+
+    ret = bdrv_get_block_status_above(source, NULL, sector_num,
+                                      nb_sectors, &pnum);
+    if (ret < 0 || pnum < nb_sectors ||
+            (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
+        bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
+                       mirror_read_complete, op);
+    } else if (ret & BDRV_BLOCK_ZERO) {
+        bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors,
+                              s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
+                              mirror_write_complete, op);
+    } else {
+        assert(!(ret & BDRV_BLOCK_DATA));
+        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
+                         mirror_write_complete, op);
+    }
     return delay_ns;
 }