diff mbox

qcow2: Support BDRV_REQ_MAY_UNMAP

Message ID 1475046261-15679-1-git-send-email-famz@redhat.com
State New
Headers show

Commit Message

Fam Zheng Sept. 28, 2016, 7:04 a.m. UTC
Handling this is similar to what is done to the L2 entry in the case of
compressed clusters.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 block/qcow2-cluster.c | 9 +++++----
 block/qcow2.c         | 3 ++-
 block/qcow2.h         | 3 ++-
 3 files changed, 9 insertions(+), 6 deletions(-)

Comments

Max Reitz Sept. 28, 2016, 4:11 p.m. UTC | #1
On 28.09.2016 09:04, Fam Zheng wrote:
> Handling this is similar to what is done to the L2 entry in the case of
> compressed clusters.
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---
>  block/qcow2-cluster.c | 9 +++++----
>  block/qcow2.c         | 3 ++-
>  block/qcow2.h         | 3 ++-
>  3 files changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 61d1ffd..928c1e2 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -1558,7 +1558,7 @@ fail:
>   * clusters.
>   */
>  static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
> -                          uint64_t nb_clusters)
> +                          uint64_t nb_clusters, int flags)
>  {
>      BDRVQcow2State *s = bs->opaque;
>      uint64_t *l2_table;
> @@ -1582,7 +1582,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
>  
>          /* Update L2 entries */
>          qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
> -        if (old_offset & QCOW_OFLAG_COMPRESSED) {
> +        if (old_offset & QCOW_OFLAG_COMPRESSED || flags & BDRV_REQ_MAY_UNMAP) {
>              l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
>              qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);

I don't quite understand the reasoning behind this. How is this more
efficient than just using the existing path where we don't discard anything?

Note that BDRV_REQ_MAY_UNMAP does not mean "Yes, please discard" but
just "You may discard if it's easier for you". But it's actually not
easier for us, so I don't see why we're doing it.

As far as I can guess you actually want some way to tell a block driver
to actually make an effort to discard clusters as long they then read
back as zero (which is why you cannot simply use bdrv_pdiscard()).
However, I think this would require a new flag called
BDRV_REQ_SHOULD_UNMAP (which should imply BDRV_REQ_MAY_UNMAP).

Note that there is actually a case where qcow2 should support
BDRV_REQ_MAY_UNMAP, which is for v2 images. Currently, we just return
-ENOTSUP for them, but if we don't have a backing file and
BDRV_REQ_MAY_UNMAP is set, we could go on and make qcow2_zero_clusters()
work for them.

Max

>          } else {
> @@ -1595,7 +1595,8 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
>      return nb_clusters;
>  }
>  
> -int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
> +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
> +                        int flags)
>  {
>      BDRVQcow2State *s = bs->opaque;
>      uint64_t nb_clusters;
> @@ -1612,7 +1613,7 @@ int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
>      s->cache_discards = true;
>  
>      while (nb_clusters > 0) {
> -        ret = zero_single_l2(bs, offset, nb_clusters);
> +        ret = zero_single_l2(bs, offset, nb_clusters, flags);
>          if (ret < 0) {
>              goto fail;
>          }
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 0e53a4d..474f244 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -1154,6 +1154,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
>  
>      /* Initialise locks */
>      qemu_co_mutex_init(&s->lock);
> +    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
>  
>      /* Repair image if dirty */
>      if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only &&
> @@ -2476,7 +2477,7 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
>      trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count);
>  
>      /* Whatever is left can use real zero clusters */
> -    ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS);
> +    ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS, flags);
>      qemu_co_mutex_unlock(&s->lock);
>  
>      return ret;
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 9ce5a37..92203a8 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -547,7 +547,8 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
>  int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
>  int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
>      int nb_sectors, enum qcow2_discard_type type, bool full_discard);
> -int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);
> +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
> +                        int flags);
>  
>  int qcow2_expand_zero_clusters(BlockDriverState *bs,
>                                 BlockDriverAmendStatusCB *status_cb,
>
Fam Zheng Sept. 29, 2016, 2:21 a.m. UTC | #2
On Wed, 09/28 18:11, Max Reitz wrote:
> On 28.09.2016 09:04, Fam Zheng wrote:
> > Handling this is similar to what is done to the L2 entry in the case of
> > compressed clusters.
> > 
> > Signed-off-by: Fam Zheng <famz@redhat.com>
> > ---
> >  block/qcow2-cluster.c | 9 +++++----
> >  block/qcow2.c         | 3 ++-
> >  block/qcow2.h         | 3 ++-
> >  3 files changed, 9 insertions(+), 6 deletions(-)
> > 
> > diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> > index 61d1ffd..928c1e2 100644
> > --- a/block/qcow2-cluster.c
> > +++ b/block/qcow2-cluster.c
> > @@ -1558,7 +1558,7 @@ fail:
> >   * clusters.
> >   */
> >  static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
> > -                          uint64_t nb_clusters)
> > +                          uint64_t nb_clusters, int flags)
> >  {
> >      BDRVQcow2State *s = bs->opaque;
> >      uint64_t *l2_table;
> > @@ -1582,7 +1582,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
> >  
> >          /* Update L2 entries */
> >          qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
> > -        if (old_offset & QCOW_OFLAG_COMPRESSED) {
> > +        if (old_offset & QCOW_OFLAG_COMPRESSED || flags & BDRV_REQ_MAY_UNMAP) {
> >              l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
> >              qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
> 
> I don't quite understand the reasoning behind this. How is this more
> efficient than just using the existing path where we don't discard anything?

It's more efficient in disk space. I didn't mention because it is so not
specific to this, but: what virt-sparsify does is creating an overlay -> fstrim
it -> qemu-img commit. This flow revealed to me that BDRV_REQ_MAY_UNMAP should
have been honored this way (after a hint of "how" by Kevin).

> 
> Note that BDRV_REQ_MAY_UNMAP does not mean "Yes, please discard" but
> just "You may discard if it's easier for you". But it's actually not
> easier for us, so I don't see why we're doing it.
> 
> As far as I can guess you actually want some way to tell a block driver
> to actually make an effort to discard clusters as long they then read
> back as zero (which is why you cannot simply use bdrv_pdiscard()).
> However, I think this would require a new flag called
> BDRV_REQ_SHOULD_UNMAP (which should imply BDRV_REQ_MAY_UNMAP).

This flag doesn't make sense to me, if the protocol doesn't know how to unmap,
it can ignore BDRV_REQ_MAY_UNMAP, but not BDRV_REQ_SHOULD_UNMAP. It just
complicates things a little.

Fam
Paolo Bonzini Sept. 29, 2016, 7:58 a.m. UTC | #3
On 29/09/2016 04:21, Fam Zheng wrote:
> On Wed, 09/28 18:11, Max Reitz wrote:
>> Note that BDRV_REQ_MAY_UNMAP does not mean "Yes, please discard" but
>> just "You may discard if it's easier for you". But it's actually not
>> easier for us, so I don't see why we're doing it.
>>
>> As far as I can guess you actually want some way to tell a block driver
>> to actually make an effort to discard clusters as long they then read
>> back as zero (which is why you cannot simply use bdrv_pdiscard()).
>> However, I think this would require a new flag called
>> BDRV_REQ_SHOULD_UNMAP (which should imply BDRV_REQ_MAY_UNMAP).
> 
> This flag doesn't make sense to me, if the protocol doesn't know how to unmap,
> it can ignore BDRV_REQ_MAY_UNMAP, but not BDRV_REQ_SHOULD_UNMAP. It just
> complicates things a little.

I don't think we actually have a use for a "MAY" unmap flag.  Either we
keep the not-so-perfect name or we replace MAY_UNMAP with "should" or
"want" or "would_like" unmap...  But Fam's patch does do what was
intended for the flag (which is the equivalent of the UNMAP bit in the
SCSI WRITE SAME command).

Paolo
Fam Zheng Sept. 29, 2016, 8:10 a.m. UTC | #4
On Thu, 09/29 09:58, Paolo Bonzini wrote:
> 
> 
> On 29/09/2016 04:21, Fam Zheng wrote:
> > On Wed, 09/28 18:11, Max Reitz wrote:
> >> Note that BDRV_REQ_MAY_UNMAP does not mean "Yes, please discard" but
> >> just "You may discard if it's easier for you". But it's actually not
> >> easier for us, so I don't see why we're doing it.
> >>
> >> As far as I can guess you actually want some way to tell a block driver
> >> to actually make an effort to discard clusters as long they then read
> >> back as zero (which is why you cannot simply use bdrv_pdiscard()).
> >> However, I think this would require a new flag called
> >> BDRV_REQ_SHOULD_UNMAP (which should imply BDRV_REQ_MAY_UNMAP).
> > 
> > This flag doesn't make sense to me, if the protocol doesn't know how to unmap,
> > it can ignore BDRV_REQ_MAY_UNMAP, but not BDRV_REQ_SHOULD_UNMAP. It just
> > complicates things a little.
> 
> I don't think we actually have a use for a "MAY" unmap flag.  Either we
> keep the not-so-perfect name or we replace MAY_UNMAP with "should" or
> "want" or "would_like" unmap...  But Fam's patch does do what was
> intended for the flag (which is the equivalent of the UNMAP bit in the
> SCSI WRITE SAME command).

After reading rfc2119, now I agree that "SHOULD" is better. :)

Fam
Kevin Wolf Sept. 29, 2016, 9:29 a.m. UTC | #5
Am 28.09.2016 um 09:04 hat Fam Zheng geschrieben:
> Handling this is similar to what is done to the L2 entry in the case of
> compressed clusters.
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---
>  block/qcow2-cluster.c | 9 +++++----
>  block/qcow2.c         | 3 ++-
>  block/qcow2.h         | 3 ++-
>  3 files changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 61d1ffd..928c1e2 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -1558,7 +1558,7 @@ fail:
>   * clusters.
>   */
>  static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
> -                          uint64_t nb_clusters)
> +                          uint64_t nb_clusters, int flags)
>  {
>      BDRVQcow2State *s = bs->opaque;
>      uint64_t *l2_table;
> @@ -1582,7 +1582,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
>  
>          /* Update L2 entries */
>          qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
> -        if (old_offset & QCOW_OFLAG_COMPRESSED) {
> +        if (old_offset & QCOW_OFLAG_COMPRESSED || flags & BDRV_REQ_MAY_UNMAP) {
>              l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
>              qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
>          } else {

I don't think we should always do this for BDRV_REQ_MAY_UNMAP. The user
may want to keep the image fully allocated even if the guest (or block
job etc.) thinks this can be discarded.

When we discussed this in another email thread (?), I think I suggested
checking whether the image was opened with pass-discard-request=on. If
so, go ahead and deallocate the cluster, otherwise keep it allocated.

In those cases where pass-discard-request=off, it doesn't make a lot of
sense to deallocate the cluster anyway because it won't shrink the file
size.

Kevin
Fam Zheng Sept. 29, 2016, 9:55 a.m. UTC | #6
On Thu, 09/29 11:29, Kevin Wolf wrote:
> Am 28.09.2016 um 09:04 hat Fam Zheng geschrieben:
> > Handling this is similar to what is done to the L2 entry in the case of
> > compressed clusters.
> > 
> > Signed-off-by: Fam Zheng <famz@redhat.com>
> > ---
> >  block/qcow2-cluster.c | 9 +++++----
> >  block/qcow2.c         | 3 ++-
> >  block/qcow2.h         | 3 ++-
> >  3 files changed, 9 insertions(+), 6 deletions(-)
> > 
> > diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> > index 61d1ffd..928c1e2 100644
> > --- a/block/qcow2-cluster.c
> > +++ b/block/qcow2-cluster.c
> > @@ -1558,7 +1558,7 @@ fail:
> >   * clusters.
> >   */
> >  static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
> > -                          uint64_t nb_clusters)
> > +                          uint64_t nb_clusters, int flags)
> >  {
> >      BDRVQcow2State *s = bs->opaque;
> >      uint64_t *l2_table;
> > @@ -1582,7 +1582,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
> >  
> >          /* Update L2 entries */
> >          qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
> > -        if (old_offset & QCOW_OFLAG_COMPRESSED) {
> > +        if (old_offset & QCOW_OFLAG_COMPRESSED || flags & BDRV_REQ_MAY_UNMAP) {
> >              l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
> >              qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
> >          } else {
> 
> I don't think we should always do this for BDRV_REQ_MAY_UNMAP. The user
> may want to keep the image fully allocated even if the guest (or block
> job etc.) thinks this can be discarded.
> 
> When we discussed this in another email thread (?), I think I suggested
> checking whether the image was opened with pass-discard-request=on. If
> so, go ahead and deallocate the cluster, otherwise keep it allocated.
> 
> In those cases where pass-discard-request=off, it doesn't make a lot of
> sense to deallocate the cluster anyway because it won't shrink the file
> size.

I think this patch does what you mean:

    $ strace -f -e fallocate qemu-io \
        -c 'open -o pass-discard-request=on /var/tmp/test' \
        -c 'write 0 1M' \
        -c 'write -u -z 0 1M' \
        2>&1 >/dev/null | \
        grep fallocate
    [pid 17548] fallocate(17, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 327680, 1048576) = 0
    $ strace -f -e fallocate qemu-io \
        -c 'open -o pass-discard-request=off /var/tmp/test' \
        -c 'write 0 1M' \
        -c 'write -u -z 0 1M' \
        2>&1 >/dev/null | \
        grep fallocate

Because there is another check of pass-discard-request value in
update_refcount:

        if (refcount == 0 && s->discard_passthrough[type]) {
            update_refcount_discard(bs, cluster_offset, s->cluster_size);
        }

Fam
Kevin Wolf Sept. 29, 2016, 10:39 a.m. UTC | #7
Am 29.09.2016 um 11:55 hat Fam Zheng geschrieben:
> On Thu, 09/29 11:29, Kevin Wolf wrote:
> > Am 28.09.2016 um 09:04 hat Fam Zheng geschrieben:
> > > Handling this is similar to what is done to the L2 entry in the case of
> > > compressed clusters.
> > > 
> > > Signed-off-by: Fam Zheng <famz@redhat.com>
> > > ---
> > >  block/qcow2-cluster.c | 9 +++++----
> > >  block/qcow2.c         | 3 ++-
> > >  block/qcow2.h         | 3 ++-
> > >  3 files changed, 9 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> > > index 61d1ffd..928c1e2 100644
> > > --- a/block/qcow2-cluster.c
> > > +++ b/block/qcow2-cluster.c
> > > @@ -1558,7 +1558,7 @@ fail:
> > >   * clusters.
> > >   */
> > >  static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
> > > -                          uint64_t nb_clusters)
> > > +                          uint64_t nb_clusters, int flags)
> > >  {
> > >      BDRVQcow2State *s = bs->opaque;
> > >      uint64_t *l2_table;
> > > @@ -1582,7 +1582,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
> > >  
> > >          /* Update L2 entries */
> > >          qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
> > > -        if (old_offset & QCOW_OFLAG_COMPRESSED) {
> > > +        if (old_offset & QCOW_OFLAG_COMPRESSED || flags & BDRV_REQ_MAY_UNMAP) {
> > >              l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
> > >              qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
> > >          } else {
> > 
> > I don't think we should always do this for BDRV_REQ_MAY_UNMAP. The user
> > may want to keep the image fully allocated even if the guest (or block
> > job etc.) thinks this can be discarded.
> > 
> > When we discussed this in another email thread (?), I think I suggested
> > checking whether the image was opened with pass-discard-request=on. If
> > so, go ahead and deallocate the cluster, otherwise keep it allocated.
> > 
> > In those cases where pass-discard-request=off, it doesn't make a lot of
> > sense to deallocate the cluster anyway because it won't shrink the file
> > size.
> 
> I think this patch does what you mean:
> 
>     $ strace -f -e fallocate qemu-io \
>         -c 'open -o pass-discard-request=on /var/tmp/test' \
>         -c 'write 0 1M' \
>         -c 'write -u -z 0 1M' \
>         2>&1 >/dev/null | \
>         grep fallocate
>     [pid 17548] fallocate(17, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 327680, 1048576) = 0
>     $ strace -f -e fallocate qemu-io \
>         -c 'open -o pass-discard-request=off /var/tmp/test' \
>         -c 'write 0 1M' \
>         -c 'write -u -z 0 1M' \
>         2>&1 >/dev/null | \
>         grep fallocate
> 
> Because there is another check of pass-discard-request value in
> update_refcount:
> 
>         if (refcount == 0 && s->discard_passthrough[type]) {
>             update_refcount_discard(bs, cluster_offset, s->cluster_size);
>         }

What I mean is that in the second case, you're still uselessly
deallocating the cluster on the qcow2 level while you can't reclaim it
on the filesystem level. So it would be better to leave it allocated in
qcow2, too, so that you don't get an expensive reallocation the next
time you write to it.

Kevin
Paolo Bonzini Sept. 29, 2016, 12:14 p.m. UTC | #8
On 29/09/2016 12:39, Kevin Wolf wrote:
>> > 
>> > Because there is another check of pass-discard-request value in
>> > update_refcount:
>> > 
>> >         if (refcount == 0 && s->discard_passthrough[type]) {
>> >             update_refcount_discard(bs, cluster_offset, s->cluster_size);
>> >         }
> What I mean is that in the second case, you're still uselessly
> deallocating the cluster on the qcow2 level while you can't reclaim it
> on the filesystem level. So it would be better to leave it allocated in
> qcow2, too, so that you don't get an expensive reallocation the next
> time you write to it.

But if you do a qemu-img convert, the deallocated cluster wouldn't be in
the destination.

Paolo
Kevin Wolf Sept. 29, 2016, 12:48 p.m. UTC | #9
Am 29.09.2016 um 14:14 hat Paolo Bonzini geschrieben:
> On 29/09/2016 12:39, Kevin Wolf wrote:
> >> > Because there is another check of pass-discard-request value in
> >> > update_refcount:
> >> > 
> >> >         if (refcount == 0 && s->discard_passthrough[type]) {
> >> >             update_refcount_discard(bs, cluster_offset, s->cluster_size);
> >> >         }
> > What I mean is that in the second case, you're still uselessly
> > deallocating the cluster on the qcow2 level while you can't reclaim it
> > on the filesystem level. So it would be better to leave it allocated in
> > qcow2, too, so that you don't get an expensive reallocation the next
> > time you write to it.
> 
> But if you do a qemu-img convert, the deallocated cluster wouldn't be in
> the destination.

Right. I still think that there has to be an option to keep the image
fully allocated. Perhaps what we really need to check is BDRV_O_UNMAP.

Kevin
Paolo Bonzini Sept. 29, 2016, 12:50 p.m. UTC | #10
On 29/09/2016 14:48, Kevin Wolf wrote:
> Am 29.09.2016 um 14:14 hat Paolo Bonzini geschrieben:
>> On 29/09/2016 12:39, Kevin Wolf wrote:
>>>>> Because there is another check of pass-discard-request value in
>>>>> update_refcount:
>>>>>
>>>>>         if (refcount == 0 && s->discard_passthrough[type]) {
>>>>>             update_refcount_discard(bs, cluster_offset, s->cluster_size);
>>>>>         }
>>> What I mean is that in the second case, you're still uselessly
>>> deallocating the cluster on the qcow2 level while you can't reclaim it
>>> on the filesystem level. So it would be better to leave it allocated in
>>> qcow2, too, so that you don't get an expensive reallocation the next
>>> time you write to it.
>>
>> But if you do a qemu-img convert, the deallocated cluster wouldn't be in
>> the destination.
> 
> Right. I still think that there has to be an option to keep the image
> fully allocated. Perhaps what we really need to check is BDRV_O_UNMAP.

Duh, of course it is.

Paolo
Paolo Bonzini Sept. 29, 2016, 12:56 p.m. UTC | #11
On 29/09/2016 14:50, Paolo Bonzini wrote:
> 
> 
> On 29/09/2016 14:48, Kevin Wolf wrote:
>> Am 29.09.2016 um 14:14 hat Paolo Bonzini geschrieben:
>>> On 29/09/2016 12:39, Kevin Wolf wrote:
>>>>>> Because there is another check of pass-discard-request value in
>>>>>> update_refcount:
>>>>>>
>>>>>>         if (refcount == 0 && s->discard_passthrough[type]) {
>>>>>>             update_refcount_discard(bs, cluster_offset, s->cluster_size);
>>>>>>         }
>>>> What I mean is that in the second case, you're still uselessly
>>>> deallocating the cluster on the qcow2 level while you can't reclaim it
>>>> on the filesystem level. So it would be better to leave it allocated in
>>>> qcow2, too, so that you don't get an expensive reallocation the next
>>>> time you write to it.
>>>
>>> But if you do a qemu-img convert, the deallocated cluster wouldn't be in
>>> the destination.
>>
>> Right. I still think that there has to be an option to keep the image
>> fully allocated. Perhaps what we really need to check is BDRV_O_UNMAP.
> 
> Duh, of course it is.

... and it's handled in bdrv_co_pwrite_zeroes, so Fam's patch should be
okay:

    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_pwritev(child, offset, count, NULL,
                           BDRV_REQ_ZERO_WRITE | flags);

Paolo
Max Reitz Oct. 1, 2016, 1:08 p.m. UTC | #12
On 29.09.2016 10:10, Fam Zheng wrote:
> On Thu, 09/29 09:58, Paolo Bonzini wrote:
>>
>>
>> On 29/09/2016 04:21, Fam Zheng wrote:
>>> On Wed, 09/28 18:11, Max Reitz wrote:
>>>> Note that BDRV_REQ_MAY_UNMAP does not mean "Yes, please discard" but
>>>> just "You may discard if it's easier for you". But it's actually not
>>>> easier for us, so I don't see why we're doing it.
>>>>
>>>> As far as I can guess you actually want some way to tell a block driver
>>>> to actually make an effort to discard clusters as long they then read
>>>> back as zero (which is why you cannot simply use bdrv_pdiscard()).
>>>> However, I think this would require a new flag called
>>>> BDRV_REQ_SHOULD_UNMAP (which should imply BDRV_REQ_MAY_UNMAP).
>>>
>>> This flag doesn't make sense to me, if the protocol doesn't know how to unmap,
>>> it can ignore BDRV_REQ_MAY_UNMAP, but not BDRV_REQ_SHOULD_UNMAP. It just
>>> complicates things a little.
>>
>> I don't think we actually have a use for a "MAY" unmap flag.  Either we
>> keep the not-so-perfect name or we replace MAY_UNMAP with "should" or
>> "want" or "would_like" unmap...  But Fam's patch does do what was
>> intended for the flag (which is the equivalent of the UNMAP bit in the
>> SCSI WRITE SAME command).
> 
> After reading rfc2119, now I agree that "SHOULD" is better. :)

That's OK with me, then.

Max
Fam Zheng Oct. 12, 2016, 1:14 a.m. UTC | #13
On Wed, 09/28 15:04, Fam Zheng wrote:
> Handling this is similar to what is done to the L2 entry in the case of
> compressed clusters.

Kevin, Max, is there anything else I need to do before this patch can be
applied?

Fam
Kevin Wolf Oct. 12, 2016, 8:42 a.m. UTC | #14
Am 12.10.2016 um 03:14 hat Fam Zheng geschrieben:
> On Wed, 09/28 15:04, Fam Zheng wrote:
> > Handling this is similar to what is done to the L2 entry in the case of
> > compressed clusters.
> 
> Kevin, Max, is there anything else I need to do before this patch can be
> applied?

Hm, actually, it looks like we had a few discussions, but came to the
conclusions that the patch is alright.

Thanks, applied to the block branch.

Kevin
diff mbox

Patch

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 61d1ffd..928c1e2 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1558,7 +1558,7 @@  fail:
  * clusters.
  */
 static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
-                          uint64_t nb_clusters)
+                          uint64_t nb_clusters, int flags)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t *l2_table;
@@ -1582,7 +1582,7 @@  static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
 
         /* Update L2 entries */
         qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
-        if (old_offset & QCOW_OFLAG_COMPRESSED) {
+        if (old_offset & QCOW_OFLAG_COMPRESSED || flags & BDRV_REQ_MAY_UNMAP) {
             l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
             qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
         } else {
@@ -1595,7 +1595,8 @@  static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
     return nb_clusters;
 }
 
-int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
+int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
+                        int flags)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t nb_clusters;
@@ -1612,7 +1613,7 @@  int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
     s->cache_discards = true;
 
     while (nb_clusters > 0) {
-        ret = zero_single_l2(bs, offset, nb_clusters);
+        ret = zero_single_l2(bs, offset, nb_clusters, flags);
         if (ret < 0) {
             goto fail;
         }
diff --git a/block/qcow2.c b/block/qcow2.c
index 0e53a4d..474f244 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1154,6 +1154,7 @@  static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
 
     /* Initialise locks */
     qemu_co_mutex_init(&s->lock);
+    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
 
     /* Repair image if dirty */
     if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only &&
@@ -2476,7 +2477,7 @@  static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
     trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count);
 
     /* Whatever is left can use real zero clusters */
-    ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS);
+    ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS, flags);
     qemu_co_mutex_unlock(&s->lock);
 
     return ret;
diff --git a/block/qcow2.h b/block/qcow2.h
index 9ce5a37..92203a8 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -547,7 +547,8 @@  uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
 int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
     int nb_sectors, enum qcow2_discard_type type, bool full_discard);
-int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);
+int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
+                        int flags);
 
 int qcow2_expand_zero_clusters(BlockDriverState *bs,
                                BlockDriverAmendStatusCB *status_cb,