diff mbox

[17/17] block: Make bdrv_is_allocated_above() byte-based

Message ID 20170411222945.11741-18-eblake@redhat.com
State New
Headers show

Commit Message

Eric Blake April 11, 2017, 10:29 p.m. UTC
We are gradually moving away from sector-based interfaces, towards
byte-based.  In the common case, allocation is unlikely to ever use
values that are not naturally sector-aligned, but it is possible
that byte-based values will let us be more precise about allocation
at the end of an unaligned file that can do byte-based access.

Changing the signature of the function to use int64_t *pnum ensures
that the compiler enforces that all callers are updated.  For now,
the io.c layer still assert()s that all callers are sector-aligned,
but that can be relaxed when a later patch implements byte-based
block status.  Therefore, for the most part this patch is just the
addition of scaling at the callers followed by inverse scaling at
bdrv_is_allocated().  But some code, particularly stream_run(),
gets a lot simpler because it no longer has to mess with sectors.

For ease of review, bdrv_is_allocated() was tackled separately.

Signed-off-by: Eric Blake <eblake@redhat.com>
---
 include/block/block.h |  2 +-
 block/commit.c        | 20 ++++++++------------
 block/io.c            | 36 +++++++++++++++---------------------
 block/mirror.c        |  5 ++++-
 block/replication.c   | 17 ++++++++++++-----
 block/stream.c        | 21 +++++++++------------
 qemu-img.c            | 10 +++++++---
 7 files changed, 56 insertions(+), 55 deletions(-)

Comments

John Snow April 24, 2017, 11:06 p.m. UTC | #1
On 04/11/2017 06:29 PM, Eric Blake wrote:
> We are gradually moving away from sector-based interfaces, towards
> byte-based.  In the common case, allocation is unlikely to ever use
> values that are not naturally sector-aligned, but it is possible
> that byte-based values will let us be more precise about allocation
> at the end of an unaligned file that can do byte-based access.
> 
> Changing the signature of the function to use int64_t *pnum ensures
> that the compiler enforces that all callers are updated.  For now,
> the io.c layer still assert()s that all callers are sector-aligned,
> but that can be relaxed when a later patch implements byte-based
> block status.  Therefore, for the most part this patch is just the
> addition of scaling at the callers followed by inverse scaling at
> bdrv_is_allocated().  But some code, particularly stream_run(),
> gets a lot simpler because it no longer has to mess with sectors.
> 
> For ease of review, bdrv_is_allocated() was tackled separately.
> 
> Signed-off-by: Eric Blake <eblake@redhat.com>
> ---
>  include/block/block.h |  2 +-
>  block/commit.c        | 20 ++++++++------------
>  block/io.c            | 36 +++++++++++++++---------------------
>  block/mirror.c        |  5 ++++-
>  block/replication.c   | 17 ++++++++++++-----
>  block/stream.c        | 21 +++++++++------------
>  qemu-img.c            | 10 +++++++---
>  7 files changed, 56 insertions(+), 55 deletions(-)
> 
> diff --git a/include/block/block.h b/include/block/block.h
> index 8641149..740cb86 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -425,7 +425,7 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs,
>  int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                        int64_t *pnum);
>  int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
> -                            int64_t sector_num, int nb_sectors, int *pnum);
> +                            int64_t offset, int64_t bytes, int64_t *pnum);
> 
>  bool bdrv_is_read_only(BlockDriverState *bs);
>  bool bdrv_is_sg(BlockDriverState *bs);
> diff --git a/block/commit.c b/block/commit.c
> index 4d6bb2a..989de7d 100644
> --- a/block/commit.c
> +++ b/block/commit.c
> @@ -132,7 +132,7 @@ static void coroutine_fn commit_run(void *opaque)
>      int64_t offset;
>      uint64_t delay_ns = 0;
>      int ret = 0;
> -    int n = 0; /* sectors */
> +    int64_t n = 0; /* bytes */
>      void *buf = NULL;
>      int bytes_written = 0;
>      int64_t base_len;
> @@ -157,7 +157,7 @@ static void coroutine_fn commit_run(void *opaque)
> 
>      buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);
> 
> -    for (offset = 0; offset < s->common.len; offset += n * BDRV_SECTOR_SIZE) {
> +    for (offset = 0; offset < s->common.len; offset += n) {
>          bool copy;
> 
>          /* Note that even when no rate limit is applied we need to yield
> @@ -169,15 +169,12 @@ static void coroutine_fn commit_run(void *opaque)
>          }
>          /* Copy if allocated above the base */
>          ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base),
> -                                      offset / BDRV_SECTOR_SIZE,
> -                                      COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
> -                                      &n);
> +                                      offset, COMMIT_BUFFER_SIZE, &n);
>          copy = (ret == 1);
> -        trace_commit_one_iteration(s, offset, n * BDRV_SECTOR_SIZE, ret);
> +        trace_commit_one_iteration(s, offset, n, ret);
>          if (copy) {
> -            ret = commit_populate(s->top, s->base, offset,
> -                                  n * BDRV_SECTOR_SIZE, buf);
> -            bytes_written += n * BDRV_SECTOR_SIZE;
> +            ret = commit_populate(s->top, s->base, offset, n, buf);
> +            bytes_written += n;
>          }
>          if (ret < 0) {
>              BlockErrorAction action =
> @@ -190,11 +187,10 @@ static void coroutine_fn commit_run(void *opaque)
>              }
>          }
>          /* Publish progress */
> -        s->common.offset += n * BDRV_SECTOR_SIZE;
> +        s->common.offset += n;
> 
>          if (copy && s->common.speed) {
> -            delay_ns = ratelimit_calculate_delay(&s->limit,
> -                                                 n * BDRV_SECTOR_SIZE);
> +            delay_ns = ratelimit_calculate_delay(&s->limit, n);
>          }
>      }
> 
> diff --git a/block/io.c b/block/io.c
> index 438a493..9218329 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -1930,52 +1930,46 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
>  /*
>   * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
>   *
> - * Return true if the given sector is allocated in any image between
> + * Return true if the given offset is allocated in any image between

perhaps "range" instead of "offset"?

>   * BASE and TOP (inclusive).  BASE can be NULL to check if the given
> - * sector is allocated in any image of the chain.  Return false otherwise,
> + * offset is allocated in any image of the chain.  Return false otherwise,
>   * or negative errno on failure.
>   *
> - * 'pnum' is set to the number of sectors (including and immediately following
> - *  the specified sector) that are known to be in the same
> + * 'pnum' is set to the number of bytes (including and immediately following
> + *  the specified offset) that are known to be in the same
>   *  allocated/unallocated state.
>   *
>   */
>  int bdrv_is_allocated_above(BlockDriverState *top,
>                              BlockDriverState *base,
> -                            int64_t sector_num,
> -                            int nb_sectors, int *pnum)
> +                            int64_t offset, int64_t bytes, int64_t *pnum)
>  {
>      BlockDriverState *intermediate;
> -    int ret, n = nb_sectors;
> +    int ret;
> +    int64_t n = bytes;
> 
>      intermediate = top;
>      while (intermediate && intermediate != base) {
>          int64_t pnum_inter;
> -        int psectors_inter;
> 
> -        ret = bdrv_is_allocated(intermediate, sector_num * BDRV_SECTOR_SIZE,
> -                                nb_sectors * BDRV_SECTOR_SIZE,
> -                                &pnum_inter);
> +        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
>          if (ret < 0) {
>              return ret;
>          }
> -        assert(pnum_inter < INT_MAX * BDRV_SECTOR_SIZE);
> -        psectors_inter = pnum_inter >> BDRV_SECTOR_BITS;
>          if (ret) {
> -            *pnum = psectors_inter;
> +            *pnum = pnum_inter;
>              return 1;
>          }
> 
>          /*
> -         * [sector_num, nb_sectors] is unallocated on top but intermediate
> -         * might have
> -         *
> -         * [sector_num+x, nr_sectors] allocated.
> +         * [offset, bytes] is unallocated on top but intermediate
> +         * might have [offset+x, bytes-x] allocated.
>           */
> -        if (n > psectors_inter &&
> +        if (n > pnum_inter &&
>              (intermediate == top ||
> -             sector_num + psectors_inter < intermediate->total_sectors)) {



> -            n = psectors_inter;
> +             offset + pnum_inter < (intermediate->total_sectors *
> +                                    BDRV_SECTOR_SIZE))) {

Naive question: not worth using either bdrv_getlength for bytes, or the
bdrv_nb_sectors helpers?

(Not sure if this is appropriate due to the variable length calls which
might disqualify it from internal usage)

> +            n = pnum_inter;
>          }
> 
>          intermediate = backing_bs(intermediate);
> diff --git a/block/mirror.c b/block/mirror.c
> index 8de0492..c92335a 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -609,6 +609,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
>      BlockDriverState *bs = s->source;
>      BlockDriverState *target_bs = blk_bs(s->target);
>      int ret, n;
> +    int64_t count;
> 
>      end = s->bdev_length / BDRV_SECTOR_SIZE;
> 
> @@ -658,11 +659,13 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
>              return 0;
>          }
> 
> -        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
> +        ret = bdrv_is_allocated_above(bs, base, sector_num * BDRV_SECTOR_SIZE,
> +                                      nb_sectors * BDRV_SECTOR_SIZE, &count);
>          if (ret < 0) {
>              return ret;
>          }
> 
> +        n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
>          assert(n > 0);
>          if (ret == 1) {
>              bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
> diff --git a/block/replication.c b/block/replication.c
> index 414ecc4..605d90f 100644
> --- a/block/replication.c
> +++ b/block/replication.c
> @@ -264,7 +264,8 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs,
>      BdrvChild *top = bs->file;
>      BdrvChild *base = s->secondary_disk;
>      BdrvChild *target;
> -    int ret, n;
> +    int ret;
> +    int64_t n;
> 
>      ret = replication_get_io_status(s);
>      if (ret < 0) {
> @@ -283,14 +284,20 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs,
>       */
>      qemu_iovec_init(&hd_qiov, qiov->niov);
>      while (remaining_sectors > 0) {
> -        ret = bdrv_is_allocated_above(top->bs, base->bs, sector_num,
> -                                      remaining_sectors, &n);
> +        int64_t count;
> +
> +        ret = bdrv_is_allocated_above(top->bs, base->bs,
> +                                      sector_num * BDRV_SECTOR_SIZE,
> +                                      remaining_sectors * BDRV_SECTOR_SIZE,
> +                                      &count);
>          if (ret < 0) {
>              goto out1;
>          }
> 
> +        assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE));
> +        n = count >> BDRV_SECTOR_BITS;
>          qemu_iovec_reset(&hd_qiov);
> -        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, n * BDRV_SECTOR_SIZE);
> +        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count);
> 
>          target = ret ? top : base;
>          ret = bdrv_co_writev(target, sector_num, n, &hd_qiov);
> @@ -300,7 +307,7 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs,
> 
>          remaining_sectors -= n;
>          sector_num += n;
> -        bytes_done += n * BDRV_SECTOR_SIZE;
> +        bytes_done += count;
>      }
> 
>  out1:
> diff --git a/block/stream.c b/block/stream.c
> index 85502eb..9033655 100644
> --- a/block/stream.c
> +++ b/block/stream.c
> @@ -112,7 +112,7 @@ static void coroutine_fn stream_run(void *opaque)
>      uint64_t delay_ns = 0;
>      int error = 0;
>      int ret = 0;
> -    int n = 0; /* sectors */
> +    int64_t n = 0; /* bytes */
>      void *buf;
> 
>      if (!bs->backing) {
> @@ -136,9 +136,8 @@ static void coroutine_fn stream_run(void *opaque)
>          bdrv_enable_copy_on_read(bs);
>      }
> 
> -    for ( ; offset < s->common.len; offset += n * BDRV_SECTOR_SIZE) {
> +    for ( ; offset < s->common.len; offset += n) {
>          bool copy;
> -        int64_t count = 0;
> 
>          /* Note that even when no rate limit is applied we need to yield
>           * with no pending I/O here so that bdrv_drain_all() returns.
> @@ -150,26 +149,25 @@ static void coroutine_fn stream_run(void *opaque)
> 
>          copy = false;
> 
> -        ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &count);
> -        n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
> +        ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &n);
>          if (ret == 1) {
>              /* Allocated in the top, no need to copy.  */
>          } else if (ret >= 0) {
>              /* Copy if allocated in the intermediate images.  Limit to the
>               * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE).  */
>              ret = bdrv_is_allocated_above(backing_bs(bs), base,
> -                                          offset / BDRV_SECTOR_SIZE, n, &n);
> +                                          offset, n, &n);
> 
>              /* Finish early if end of backing file has been reached */
>              if (ret == 0 && n == 0) {
> -                n = (s->common.len - offset) / BDRV_SECTOR_SIZE;
> +                n = s->common.len - offset;
>              }
> 
>              copy = (ret == 1);
>          }
> -        trace_stream_one_iteration(s, offset, n * BDRV_SECTOR_SIZE, ret);
> +        trace_stream_one_iteration(s, offset, n, ret);
>          if (copy) {
> -            ret = stream_populate(blk, offset, n * BDRV_SECTOR_SIZE, buf);
> +            ret = stream_populate(blk, offset, n, buf);
>          }
>          if (ret < 0) {
>              BlockErrorAction action =
> @@ -188,10 +186,9 @@ static void coroutine_fn stream_run(void *opaque)
>          ret = 0;
> 
>          /* Publish progress */
> -        s->common.offset += n * BDRV_SECTOR_SIZE;
> +        s->common.offset += n;
>          if (copy && s->common.speed) {
> -            delay_ns = ratelimit_calculate_delay(&s->limit,
> -                                                 n * BDRV_SECTOR_SIZE);
> +            delay_ns = ratelimit_calculate_delay(&s->limit, n);
>          }
>      }
> 
> diff --git a/qemu-img.c b/qemu-img.c
> index 2f21d69..d96b4d6 100644
> --- a/qemu-img.c
> +++ b/qemu-img.c
> @@ -1448,12 +1448,16 @@ static int img_compare(int argc, char **argv)
>          }
> 
>          for (;;) {
> +            int64_t count;
> +
>              nb_sectors = sectors_to_process(total_sectors_over, sector_num);
>              if (nb_sectors <= 0) {
>                  break;
>              }
> -            ret = bdrv_is_allocated_above(blk_bs(blk_over), NULL, sector_num,
> -                                          nb_sectors, &pnum);
> +            ret = bdrv_is_allocated_above(blk_bs(blk_over), NULL,
> +                                          sector_num * BDRV_SECTOR_SIZE,
> +                                          nb_sectors * BDRV_SECTOR_SIZE,
> +                                          &count);
>              if (ret < 0) {
>                  ret = 3;
>                  error_report("Sector allocation test failed for %s",
> @@ -1461,7 +1465,7 @@ static int img_compare(int argc, char **argv)
>                  goto out;
> 
>              }
> -            nb_sectors = pnum;
> +            nb_sectors = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
>              if (ret) {
>                  ret = check_empty_sectors(blk_over, sector_num, nb_sectors,
>                                            filename_over, buf1, quiet);
> 

Reviewed-by: John Snow <jsnow@redhat.com>
Eric Blake April 25, 2017, 1:48 a.m. UTC | #2
On 04/24/2017 06:06 PM, John Snow wrote:
> 
> 
> On 04/11/2017 06:29 PM, Eric Blake wrote:
>> We are gradually moving away from sector-based interfaces, towards
>> byte-based.  In the common case, allocation is unlikely to ever use
>> values that are not naturally sector-aligned, but it is possible
>> that byte-based values will let us be more precise about allocation
>> at the end of an unaligned file that can do byte-based access.
>>
>> Changing the signature of the function to use int64_t *pnum ensures
>> that the compiler enforces that all callers are updated.  For now,
>> the io.c layer still assert()s that all callers are sector-aligned,
>> but that can be relaxed when a later patch implements byte-based
>> block status.  Therefore, for the most part this patch is just the
>> addition of scaling at the callers followed by inverse scaling at
>> bdrv_is_allocated().  But some code, particularly stream_run(),
>> gets a lot simpler because it no longer has to mess with sectors.
>>

>> +++ b/block/io.c
>> @@ -1930,52 +1930,46 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
>>  /*
>>   * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
>>   *
>> - * Return true if the given sector is allocated in any image between
>> + * Return true if the given offset is allocated in any image between
> 
> perhaps "range" instead of "offset"?
> 
>>   * BASE and TOP (inclusive).  BASE can be NULL to check if the given
>> - * sector is allocated in any image of the chain.  Return false otherwise,
>> + * offset is allocated in any image of the chain.  Return false otherwise,
>>   * or negative errno on failure.

Seems reasonable.


>>          /*
>> -         * [sector_num, nb_sectors] is unallocated on top but intermediate
>> -         * might have
>> -         *
>> -         * [sector_num+x, nr_sectors] allocated.
>> +         * [offset, bytes] is unallocated on top but intermediate
>> +         * might have [offset+x, bytes-x] allocated.
>>           */
>> -        if (n > psectors_inter &&
>> +        if (n > pnum_inter &&
>>              (intermediate == top ||
>> -             sector_num + psectors_inter < intermediate->total_sectors)) {
> 
> 
> 
>> -            n = psectors_inter;
>> +             offset + pnum_inter < (intermediate->total_sectors *
>> +                                    BDRV_SECTOR_SIZE))) {
> 
> Naive question: not worth using either bdrv_getlength for bytes, or the
> bdrv_nb_sectors helpers?

bdrv_getlength(intermediate) should indeed be the same as
intermediate->total_sectors * BDRV_SECTOR_SIZE (for now - ultimately it
would be nice to track a byte length rather than a sector length for
images). I can make that cleanup for v2.

> 
> Reviewed-by: John Snow <jsnow@redhat.com>
>
Eric Blake May 10, 2017, 3:42 p.m. UTC | #3
On 04/24/2017 08:48 PM, Eric Blake wrote:
> On 04/24/2017 06:06 PM, John Snow wrote:
>>
>>
>> On 04/11/2017 06:29 PM, Eric Blake wrote:
>>> We are gradually moving away from sector-based interfaces, towards
>>> byte-based.  In the common case, allocation is unlikely to ever use
>>> values that are not naturally sector-aligned, but it is possible
>>> that byte-based values will let us be more precise about allocation
>>> at the end of an unaligned file that can do byte-based access.
>>>
>>> Changing the signature of the function to use int64_t *pnum ensures
>>> that the compiler enforces that all callers are updated.  For now,
>>> the io.c layer still assert()s that all callers are sector-aligned,
>>> but that can be relaxed when a later patch implements byte-based
>>> block status.  Therefore, for the most part this patch is just the
>>> addition of scaling at the callers followed by inverse scaling at
>>> bdrv_is_allocated().  But some code, particularly stream_run(),
>>> gets a lot simpler because it no longer has to mess with sectors.
>>>
> 
>>> +++ b/block/io.c
>>> @@ -1930,52 +1930,46 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
>>>  /*
>>>   * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
>>>   *
>>> - * Return true if the given sector is allocated in any image between
>>> + * Return true if the given offset is allocated in any image between
>>
>> perhaps "range" instead of "offset"?
>>
>>>   * BASE and TOP (inclusive).  BASE can be NULL to check if the given
>>> - * sector is allocated in any image of the chain.  Return false otherwise,
>>> + * offset is allocated in any image of the chain.  Return false otherwise,
>>>   * or negative errno on failure.
> 
> Seems reasonable.

Actually, not quite. Suppose we have the following sector allocations:

backing: -- -- XX --
active:  -- -- -- XX

bdrv_is_allocated_above(active, NULL, 0, 2048, &num)

will return 0 with num set to 1024 (offset is not allocated, and the
not-allocated range is at least 1024 bytes).  But calling

bdrv_is_allocated_above(backing, NULL, 1024, 1024, &num)

will return 1 with num set to 512 (not the entire range is allocated;
only the 512-byte prefix of the range IS allocated).  Meanwhile, note that:

bdrv_is_allocated_above(active, NULL, 1024, 1024, &num)

will ALSO return 1 with num set to 512, even though it would be nicer if
it could return 1024 (from the active layer, all 1024 bytes of the given
range ARE allocated, just not from the same location).  So callers have
to manually coalesce multiple bdrv_is_allocated_above() calls to get a
full picture of what is allocated.

The same is ALSO true if you have a fragmented image.  For example:

$ qemu-img create -f qcow2 -o cluster_size=1m file3 10m
$ qemu-io -f qcow2 -c 'w 0 5m' -c 'discard 0 2m' -c 'w 1m 1m' \
  -c 'w 0 1m' -c 'w 8m 2m' file3

The image is now fragmented (the clusters at 0 and 1m swapped mappings):

[{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data":
true, "offset": 6291456},
{ "start": 1048576, "length": 1048576, "depth": 0, "zero": false,
"data": true, "offset": 5242880},
{ "start": 2097152, "length": 3145728, "depth": 0, "zero": false,
"data": true, "offset": 7340032},
{ "start": 5242880, "length": 3145728, "depth": 0, "zero": true, "data":
false},
{ "start": 8388608, "length": 2097152, "depth": 0, "zero": false,
"data": true, "offset": 10485760}]


'qemu-io alloc' and 'qemu-io map' are callers which coalesce similar
results:

$ ./qemu-io -f qcow2 file3
qemu-io> alloc 0 10m
7340032/10485760 bytes allocated at offset 0 bytes
qemu-io> map
5 MiB (0x500000) bytes     allocated at offset 0 bytes (0x0)
3 MiB (0x300000) bytes not allocated at offset 5 MiB (0x500000)
2 MiB (0x200000) bytes     allocated at offset 8 MiB (0x800000)

although tracing under gdb showed 5 calls to bdrv_is_allocated() during
'alloc' (at [0, 10m], [1m, 9m], [2m, 8m], [5m, 5m], [8m, 2m]), and 8
calls during 'map' (at [0, 10m], [1m, 9m], [2m, 8m], [5m, 5m], [5m, 5m],
[8m, 2m], [8m, 2m], [10m, 0]).  Part of that is that the qemu-io map
implementation is rather inefficient - it makes a wasted query for 0
bytes, and at every transition between allocated and unallocated it ends
up asking for status on the same offset a second time around, rather
than remembering what it already learned on the previous iteration.

It might be worth followup patches to improve the efficiency of qemu-io
map; but also to change semantics such that bdrv_is_allocated_above()
gives the largest answer possible, rather than making the callers
coalesce identical answers.  Part of that would include teaching
.bdrv_co_get_block_status() new semantics: if file==NULL, then don't
return BDRV_BLOCK_OFFSET_VALID, but merely grab as much
BDRV_BLOCK_DATA/BDRV_BLOCK_ZERO information as possible (in spite of
fragmentation) [in the example above, it would mean returning
BDRV_BLOCK_DATA for 5m, rather than three separate returns of 1m/1m/3m];
as well as teaching bdrv_is_allocated_above() to concatenate all answers
along the backing chain until it reaches an actual change in status.
Looks like I've just added more work to my queue.

That said, there still has to be coalescing somewhere.  For example,
qcow2 images can easily return status for any range covered by a single
L2 cluster, but intentionally quits its response at the end of an L2
cluster because the cost of loading up the next cluster (with the
potential of dropping locks and racing with other threads doing writes)
becomes too hard to control within the qcow2 layer, and callers may need
to realize that the larger a request is on an actively-changing image,
the more likely the overall response can be inaccurate by the time
multiple sub-queries have been coalesced.
Eric Blake May 10, 2017, 10:11 p.m. UTC | #4
On 04/24/2017 08:48 PM, Eric Blake wrote:

>>
>>
>>> -            n = psectors_inter;
>>> +             offset + pnum_inter < (intermediate->total_sectors *
>>> +                                    BDRV_SECTOR_SIZE))) {
>>
>> Naive question: not worth using either bdrv_getlength for bytes, or the
>> bdrv_nb_sectors helpers?
> 
> bdrv_getlength(intermediate) should indeed be the same as
> intermediate->total_sectors * BDRV_SECTOR_SIZE (for now - ultimately it
> would be nice to track a byte length rather than a sector length for
> images). I can make that cleanup for v2.

This one's tricky.  Calling bdrv_nb_sectors()/bdrv_getlength() (the two
are identical other than scale) guarantees that you have a sane answer
for a variably-sized BDS, but can fail with -ENOMEDIUM.  Which, given
the position in the 'if' clause, makes it a difficult rewrite to
properly catch.  On the other hand, since we just barely called
bdrv_is_allocated(intermediate), which in turn called
bdrv_co_get_block_status(), and that calls bdrv_nb_sectors(), we are
assured that intermediate->total_sectors has not changed in the meantime.

So my options are to either add a big comment stating why we are safe,
or to use bdrv_getlength() anyways but with proper error checking.  Best
done as a separate patch from the conversion in scale; so I'll do that
for v2.
diff mbox

Patch

diff --git a/include/block/block.h b/include/block/block.h
index 8641149..740cb86 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -425,7 +425,7 @@  int64_t bdrv_get_block_status_above(BlockDriverState *bs,
 int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
                       int64_t *pnum);
 int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
-                            int64_t sector_num, int nb_sectors, int *pnum);
+                            int64_t offset, int64_t bytes, int64_t *pnum);

 bool bdrv_is_read_only(BlockDriverState *bs);
 bool bdrv_is_sg(BlockDriverState *bs);
diff --git a/block/commit.c b/block/commit.c
index 4d6bb2a..989de7d 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -132,7 +132,7 @@  static void coroutine_fn commit_run(void *opaque)
     int64_t offset;
     uint64_t delay_ns = 0;
     int ret = 0;
-    int n = 0; /* sectors */
+    int64_t n = 0; /* bytes */
     void *buf = NULL;
     int bytes_written = 0;
     int64_t base_len;
@@ -157,7 +157,7 @@  static void coroutine_fn commit_run(void *opaque)

     buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);

-    for (offset = 0; offset < s->common.len; offset += n * BDRV_SECTOR_SIZE) {
+    for (offset = 0; offset < s->common.len; offset += n) {
         bool copy;

         /* Note that even when no rate limit is applied we need to yield
@@ -169,15 +169,12 @@  static void coroutine_fn commit_run(void *opaque)
         }
         /* Copy if allocated above the base */
         ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base),
-                                      offset / BDRV_SECTOR_SIZE,
-                                      COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
-                                      &n);
+                                      offset, COMMIT_BUFFER_SIZE, &n);
         copy = (ret == 1);
-        trace_commit_one_iteration(s, offset, n * BDRV_SECTOR_SIZE, ret);
+        trace_commit_one_iteration(s, offset, n, ret);
         if (copy) {
-            ret = commit_populate(s->top, s->base, offset,
-                                  n * BDRV_SECTOR_SIZE, buf);
-            bytes_written += n * BDRV_SECTOR_SIZE;
+            ret = commit_populate(s->top, s->base, offset, n, buf);
+            bytes_written += n;
         }
         if (ret < 0) {
             BlockErrorAction action =
@@ -190,11 +187,10 @@  static void coroutine_fn commit_run(void *opaque)
             }
         }
         /* Publish progress */
-        s->common.offset += n * BDRV_SECTOR_SIZE;
+        s->common.offset += n;

         if (copy && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit,
-                                                 n * BDRV_SECTOR_SIZE);
+            delay_ns = ratelimit_calculate_delay(&s->limit, n);
         }
     }

diff --git a/block/io.c b/block/io.c
index 438a493..9218329 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1930,52 +1930,46 @@  int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
 /*
  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
  *
- * Return true if the given sector is allocated in any image between
+ * Return true if the given offset is allocated in any image between
  * BASE and TOP (inclusive).  BASE can be NULL to check if the given
- * sector is allocated in any image of the chain.  Return false otherwise,
+ * offset is allocated in any image of the chain.  Return false otherwise,
  * or negative errno on failure.
  *
- * 'pnum' is set to the number of sectors (including and immediately following
- *  the specified sector) that are known to be in the same
+ * 'pnum' is set to the number of bytes (including and immediately following
+ *  the specified offset) that are known to be in the same
  *  allocated/unallocated state.
  *
  */
 int bdrv_is_allocated_above(BlockDriverState *top,
                             BlockDriverState *base,
-                            int64_t sector_num,
-                            int nb_sectors, int *pnum)
+                            int64_t offset, int64_t bytes, int64_t *pnum)
 {
     BlockDriverState *intermediate;
-    int ret, n = nb_sectors;
+    int ret;
+    int64_t n = bytes;

     intermediate = top;
     while (intermediate && intermediate != base) {
         int64_t pnum_inter;
-        int psectors_inter;

-        ret = bdrv_is_allocated(intermediate, sector_num * BDRV_SECTOR_SIZE,
-                                nb_sectors * BDRV_SECTOR_SIZE,
-                                &pnum_inter);
+        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
         if (ret < 0) {
             return ret;
         }
-        assert(pnum_inter < INT_MAX * BDRV_SECTOR_SIZE);
-        psectors_inter = pnum_inter >> BDRV_SECTOR_BITS;
         if (ret) {
-            *pnum = psectors_inter;
+            *pnum = pnum_inter;
             return 1;
         }

         /*
-         * [sector_num, nb_sectors] is unallocated on top but intermediate
-         * might have
-         *
-         * [sector_num+x, nr_sectors] allocated.
+         * [offset, bytes] is unallocated on top but intermediate
+         * might have [offset+x, bytes-x] allocated.
          */
-        if (n > psectors_inter &&
+        if (n > pnum_inter &&
             (intermediate == top ||
-             sector_num + psectors_inter < intermediate->total_sectors)) {
-            n = psectors_inter;
+             offset + pnum_inter < (intermediate->total_sectors *
+                                    BDRV_SECTOR_SIZE))) {
+            n = pnum_inter;
         }

         intermediate = backing_bs(intermediate);
diff --git a/block/mirror.c b/block/mirror.c
index 8de0492..c92335a 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -609,6 +609,7 @@  static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
     BlockDriverState *bs = s->source;
     BlockDriverState *target_bs = blk_bs(s->target);
     int ret, n;
+    int64_t count;

     end = s->bdev_length / BDRV_SECTOR_SIZE;

@@ -658,11 +659,13 @@  static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
             return 0;
         }

-        ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n);
+        ret = bdrv_is_allocated_above(bs, base, sector_num * BDRV_SECTOR_SIZE,
+                                      nb_sectors * BDRV_SECTOR_SIZE, &count);
         if (ret < 0) {
             return ret;
         }

+        n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
         assert(n > 0);
         if (ret == 1) {
             bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n);
diff --git a/block/replication.c b/block/replication.c
index 414ecc4..605d90f 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -264,7 +264,8 @@  static coroutine_fn int replication_co_writev(BlockDriverState *bs,
     BdrvChild *top = bs->file;
     BdrvChild *base = s->secondary_disk;
     BdrvChild *target;
-    int ret, n;
+    int ret;
+    int64_t n;

     ret = replication_get_io_status(s);
     if (ret < 0) {
@@ -283,14 +284,20 @@  static coroutine_fn int replication_co_writev(BlockDriverState *bs,
      */
     qemu_iovec_init(&hd_qiov, qiov->niov);
     while (remaining_sectors > 0) {
-        ret = bdrv_is_allocated_above(top->bs, base->bs, sector_num,
-                                      remaining_sectors, &n);
+        int64_t count;
+
+        ret = bdrv_is_allocated_above(top->bs, base->bs,
+                                      sector_num * BDRV_SECTOR_SIZE,
+                                      remaining_sectors * BDRV_SECTOR_SIZE,
+                                      &count);
         if (ret < 0) {
             goto out1;
         }

+        assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE));
+        n = count >> BDRV_SECTOR_BITS;
         qemu_iovec_reset(&hd_qiov);
-        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, n * BDRV_SECTOR_SIZE);
+        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count);

         target = ret ? top : base;
         ret = bdrv_co_writev(target, sector_num, n, &hd_qiov);
@@ -300,7 +307,7 @@  static coroutine_fn int replication_co_writev(BlockDriverState *bs,

         remaining_sectors -= n;
         sector_num += n;
-        bytes_done += n * BDRV_SECTOR_SIZE;
+        bytes_done += count;
     }

 out1:
diff --git a/block/stream.c b/block/stream.c
index 85502eb..9033655 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -112,7 +112,7 @@  static void coroutine_fn stream_run(void *opaque)
     uint64_t delay_ns = 0;
     int error = 0;
     int ret = 0;
-    int n = 0; /* sectors */
+    int64_t n = 0; /* bytes */
     void *buf;

     if (!bs->backing) {
@@ -136,9 +136,8 @@  static void coroutine_fn stream_run(void *opaque)
         bdrv_enable_copy_on_read(bs);
     }

-    for ( ; offset < s->common.len; offset += n * BDRV_SECTOR_SIZE) {
+    for ( ; offset < s->common.len; offset += n) {
         bool copy;
-        int64_t count = 0;

         /* Note that even when no rate limit is applied we need to yield
          * with no pending I/O here so that bdrv_drain_all() returns.
@@ -150,26 +149,25 @@  static void coroutine_fn stream_run(void *opaque)

         copy = false;

-        ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &count);
-        n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
+        ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &n);
         if (ret == 1) {
             /* Allocated in the top, no need to copy.  */
         } else if (ret >= 0) {
-            /* Copy if allocated in the intermediate images.  Limit to the
-             * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE).  */
+            /* Copy if allocated in the intermediate images.  Limit to the
+             * known-unallocated area [offset, offset+n).  */
             ret = bdrv_is_allocated_above(backing_bs(bs), base,
-                                          offset / BDRV_SECTOR_SIZE, n, &n);
+                                          offset, n, &n);

             /* Finish early if end of backing file has been reached */
             if (ret == 0 && n == 0) {
-                n = (s->common.len - offset) / BDRV_SECTOR_SIZE;
+                n = s->common.len - offset;
             }

             copy = (ret == 1);
         }
-        trace_stream_one_iteration(s, offset, n * BDRV_SECTOR_SIZE, ret);
+        trace_stream_one_iteration(s, offset, n, ret);
         if (copy) {
-            ret = stream_populate(blk, offset, n * BDRV_SECTOR_SIZE, buf);
+            ret = stream_populate(blk, offset, n, buf);
         }
         if (ret < 0) {
             BlockErrorAction action =
@@ -188,10 +186,9 @@  static void coroutine_fn stream_run(void *opaque)
         ret = 0;

         /* Publish progress */
-        s->common.offset += n * BDRV_SECTOR_SIZE;
+        s->common.offset += n;
         if (copy && s->common.speed) {
-            delay_ns = ratelimit_calculate_delay(&s->limit,
-                                                 n * BDRV_SECTOR_SIZE);
+            delay_ns = ratelimit_calculate_delay(&s->limit, n);
         }
     }

diff --git a/qemu-img.c b/qemu-img.c
index 2f21d69..d96b4d6 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1448,12 +1448,16 @@  static int img_compare(int argc, char **argv)
         }

         for (;;) {
+            int64_t count;
+
             nb_sectors = sectors_to_process(total_sectors_over, sector_num);
             if (nb_sectors <= 0) {
                 break;
             }
-            ret = bdrv_is_allocated_above(blk_bs(blk_over), NULL, sector_num,
-                                          nb_sectors, &pnum);
+            ret = bdrv_is_allocated_above(blk_bs(blk_over), NULL,
+                                          sector_num * BDRV_SECTOR_SIZE,
+                                          nb_sectors * BDRV_SECTOR_SIZE,
+                                          &count);
             if (ret < 0) {
                 ret = 3;
                 error_report("Sector allocation test failed for %s",
@@ -1461,7 +1465,7 @@  static int img_compare(int argc, char **argv)
                 goto out;

             }
-            nb_sectors = pnum;
+            nb_sectors = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE);
             if (ret) {
                 ret = check_empty_sectors(blk_over, sector_num, nb_sectors,
                                           filename_over, buf1, quiet);