Message ID | 20170411222945.11741-18-eblake@redhat.com |
---|---|
State | New |
Headers | show |
On 04/11/2017 06:29 PM, Eric Blake wrote: > We are gradually moving away from sector-based interfaces, towards > byte-based. In the common case, allocation is unlikely to ever use > values that are not naturally sector-aligned, but it is possible > that byte-based values will let us be more precise about allocation > at the end of an unaligned file that can do byte-based access. > > Changing the signature of the function to use int64_t *pnum ensures > that the compiler enforces that all callers are updated. For now, > the io.c layer still assert()s that all callers are sector-aligned, > but that can be relaxed when a later patch implements byte-based > block status. Therefore, for the most part this patch is just the > addition of scaling at the callers followed by inverse scaling at > bdrv_is_allocated(). But some code, particularly stream_run(), > gets a lot simpler because it no longer has to mess with sectors. > > For ease of review, bdrv_is_allocated() was tackled separately. > > Signed-off-by: Eric Blake <eblake@redhat.com> > --- > include/block/block.h | 2 +- > block/commit.c | 20 ++++++++------------ > block/io.c | 36 +++++++++++++++--------------------- > block/mirror.c | 5 ++++- > block/replication.c | 17 ++++++++++++----- > block/stream.c | 21 +++++++++------------ > qemu-img.c | 10 +++++++--- > 7 files changed, 56 insertions(+), 55 deletions(-) > > diff --git a/include/block/block.h b/include/block/block.h > index 8641149..740cb86 100644 > --- a/include/block/block.h > +++ b/include/block/block.h > @@ -425,7 +425,7 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs, > int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, > int64_t *pnum); > int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, > - int64_t sector_num, int nb_sectors, int *pnum); > + int64_t offset, int64_t bytes, int64_t *pnum); > > bool bdrv_is_read_only(BlockDriverState *bs); > bool bdrv_is_sg(BlockDriverState *bs); > diff --git 
a/block/commit.c b/block/commit.c > index 4d6bb2a..989de7d 100644 > --- a/block/commit.c > +++ b/block/commit.c > @@ -132,7 +132,7 @@ static void coroutine_fn commit_run(void *opaque) > int64_t offset; > uint64_t delay_ns = 0; > int ret = 0; > - int n = 0; /* sectors */ > + int64_t n = 0; /* bytes */ > void *buf = NULL; > int bytes_written = 0; > int64_t base_len; > @@ -157,7 +157,7 @@ static void coroutine_fn commit_run(void *opaque) > > buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE); > > - for (offset = 0; offset < s->common.len; offset += n * BDRV_SECTOR_SIZE) { > + for (offset = 0; offset < s->common.len; offset += n) { > bool copy; > > /* Note that even when no rate limit is applied we need to yield > @@ -169,15 +169,12 @@ static void coroutine_fn commit_run(void *opaque) > } > /* Copy if allocated above the base */ > ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), > - offset / BDRV_SECTOR_SIZE, > - COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, > - &n); > + offset, COMMIT_BUFFER_SIZE, &n); > copy = (ret == 1); > - trace_commit_one_iteration(s, offset, n * BDRV_SECTOR_SIZE, ret); > + trace_commit_one_iteration(s, offset, n, ret); > if (copy) { > - ret = commit_populate(s->top, s->base, offset, > - n * BDRV_SECTOR_SIZE, buf); > - bytes_written += n * BDRV_SECTOR_SIZE; > + ret = commit_populate(s->top, s->base, offset, n, buf); > + bytes_written += n; > } > if (ret < 0) { > BlockErrorAction action = > @@ -190,11 +187,10 @@ static void coroutine_fn commit_run(void *opaque) > } > } > /* Publish progress */ > - s->common.offset += n * BDRV_SECTOR_SIZE; > + s->common.offset += n; > > if (copy && s->common.speed) { > - delay_ns = ratelimit_calculate_delay(&s->limit, > - n * BDRV_SECTOR_SIZE); > + delay_ns = ratelimit_calculate_delay(&s->limit, n); > } > } > > diff --git a/block/io.c b/block/io.c > index 438a493..9218329 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -1930,52 +1930,46 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t 
offset, > /* > * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] > * > - * Return true if the given sector is allocated in any image between > + * Return true if the given offset is allocated in any image between perhaps "range" instead of "offset"? > * BASE and TOP (inclusive). BASE can be NULL to check if the given > - * sector is allocated in any image of the chain. Return false otherwise, > + * offset is allocated in any image of the chain. Return false otherwise, > * or negative errno on failure. > * > - * 'pnum' is set to the number of sectors (including and immediately following > - * the specified sector) that are known to be in the same > + * 'pnum' is set to the number of bytes (including and immediately following > + * the specified offset) that are known to be in the same > * allocated/unallocated state. > * > */ > int bdrv_is_allocated_above(BlockDriverState *top, > BlockDriverState *base, > - int64_t sector_num, > - int nb_sectors, int *pnum) > + int64_t offset, int64_t bytes, int64_t *pnum) > { > BlockDriverState *intermediate; > - int ret, n = nb_sectors; > + int ret; > + int64_t n = bytes; > > intermediate = top; > while (intermediate && intermediate != base) { > int64_t pnum_inter; > - int psectors_inter; > > - ret = bdrv_is_allocated(intermediate, sector_num * BDRV_SECTOR_SIZE, > - nb_sectors * BDRV_SECTOR_SIZE, > - &pnum_inter); > + ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter); > if (ret < 0) { > return ret; > } > - assert(pnum_inter < INT_MAX * BDRV_SECTOR_SIZE); > - psectors_inter = pnum_inter >> BDRV_SECTOR_BITS; > if (ret) { > - *pnum = psectors_inter; > + *pnum = pnum_inter; > return 1; > } > > /* > - * [sector_num, nb_sectors] is unallocated on top but intermediate > - * might have > - * > - * [sector_num+x, nr_sectors] allocated. > + * [offset, bytes] is unallocated on top but intermediate > + * might have [offset+x, bytes-x] allocated. 
> */ > - if (n > psectors_inter && > + if (n > pnum_inter && > (intermediate == top || > - sector_num + psectors_inter < intermediate->total_sectors)) { > - n = psectors_inter; > + offset + pnum_inter < (intermediate->total_sectors * > + BDRV_SECTOR_SIZE))) { Naive question: not worth using either bdrv_getlength for bytes, or the bdrv_nb_sectors helpers? (Not sure if this is appropriate due to the variable length calls which might disqualify it from internal usage) > + n = pnum_inter; > } > > intermediate = backing_bs(intermediate); > diff --git a/block/mirror.c b/block/mirror.c > index 8de0492..c92335a 100644 > --- a/block/mirror.c > +++ b/block/mirror.c > @@ -609,6 +609,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) > BlockDriverState *bs = s->source; > BlockDriverState *target_bs = blk_bs(s->target); > int ret, n; > + int64_t count; > > end = s->bdev_length / BDRV_SECTOR_SIZE; > > @@ -658,11 +659,13 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) > return 0; > } > > - ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n); > + ret = bdrv_is_allocated_above(bs, base, sector_num * BDRV_SECTOR_SIZE, > + nb_sectors * BDRV_SECTOR_SIZE, &count); > if (ret < 0) { > return ret; > } > > + n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE); > assert(n > 0); > if (ret == 1) { > bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); > diff --git a/block/replication.c b/block/replication.c > index 414ecc4..605d90f 100644 > --- a/block/replication.c > +++ b/block/replication.c > @@ -264,7 +264,8 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs, > BdrvChild *top = bs->file; > BdrvChild *base = s->secondary_disk; > BdrvChild *target; > - int ret, n; > + int ret; > + int64_t n; > > ret = replication_get_io_status(s); > if (ret < 0) { > @@ -283,14 +284,20 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs, > */ > qemu_iovec_init(&hd_qiov, qiov->niov); > while (remaining_sectors > 0) { > - ret = 
bdrv_is_allocated_above(top->bs, base->bs, sector_num, > - remaining_sectors, &n); > + int64_t count; > + > + ret = bdrv_is_allocated_above(top->bs, base->bs, > + sector_num * BDRV_SECTOR_SIZE, > + remaining_sectors * BDRV_SECTOR_SIZE, > + &count); > if (ret < 0) { > goto out1; > } > > + assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE)); > + n = count >> BDRV_SECTOR_BITS; > qemu_iovec_reset(&hd_qiov); > - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, n * BDRV_SECTOR_SIZE); > + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count); > > target = ret ? top : base; > ret = bdrv_co_writev(target, sector_num, n, &hd_qiov); > @@ -300,7 +307,7 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs, > > remaining_sectors -= n; > sector_num += n; > - bytes_done += n * BDRV_SECTOR_SIZE; > + bytes_done += count; > } > > out1: > diff --git a/block/stream.c b/block/stream.c > index 85502eb..9033655 100644 > --- a/block/stream.c > +++ b/block/stream.c > @@ -112,7 +112,7 @@ static void coroutine_fn stream_run(void *opaque) > uint64_t delay_ns = 0; > int error = 0; > int ret = 0; > - int n = 0; /* sectors */ > + int64_t n = 0; /* bytes */ > void *buf; > > if (!bs->backing) { > @@ -136,9 +136,8 @@ static void coroutine_fn stream_run(void *opaque) > bdrv_enable_copy_on_read(bs); > } > > - for ( ; offset < s->common.len; offset += n * BDRV_SECTOR_SIZE) { > + for ( ; offset < s->common.len; offset += n) { > bool copy; > - int64_t count = 0; > > /* Note that even when no rate limit is applied we need to yield > * with no pending I/O here so that bdrv_drain_all() returns. > @@ -150,26 +149,25 @@ static void coroutine_fn stream_run(void *opaque) > > copy = false; > > - ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &count); > - n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE); > + ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &n); > if (ret == 1) { > /* Allocated in the top, no need to copy. 
*/ > } else if (ret >= 0) { > /* Copy if allocated in the intermediate images. Limit to the > * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE). */ > ret = bdrv_is_allocated_above(backing_bs(bs), base, > - offset / BDRV_SECTOR_SIZE, n, &n); > + offset, n, &n); > > /* Finish early if end of backing file has been reached */ > if (ret == 0 && n == 0) { > - n = (s->common.len - offset) / BDRV_SECTOR_SIZE; > + n = s->common.len - offset; > } > > copy = (ret == 1); > } > - trace_stream_one_iteration(s, offset, n * BDRV_SECTOR_SIZE, ret); > + trace_stream_one_iteration(s, offset, n, ret); > if (copy) { > - ret = stream_populate(blk, offset, n * BDRV_SECTOR_SIZE, buf); > + ret = stream_populate(blk, offset, n, buf); > } > if (ret < 0) { > BlockErrorAction action = > @@ -188,10 +186,9 @@ static void coroutine_fn stream_run(void *opaque) > ret = 0; > > /* Publish progress */ > - s->common.offset += n * BDRV_SECTOR_SIZE; > + s->common.offset += n; > if (copy && s->common.speed) { > - delay_ns = ratelimit_calculate_delay(&s->limit, > - n * BDRV_SECTOR_SIZE); > + delay_ns = ratelimit_calculate_delay(&s->limit, n); > } > } > > diff --git a/qemu-img.c b/qemu-img.c > index 2f21d69..d96b4d6 100644 > --- a/qemu-img.c > +++ b/qemu-img.c > @@ -1448,12 +1448,16 @@ static int img_compare(int argc, char **argv) > } > > for (;;) { > + int64_t count; > + > nb_sectors = sectors_to_process(total_sectors_over, sector_num); > if (nb_sectors <= 0) { > break; > } > - ret = bdrv_is_allocated_above(blk_bs(blk_over), NULL, sector_num, > - nb_sectors, &pnum); > + ret = bdrv_is_allocated_above(blk_bs(blk_over), NULL, > + sector_num * BDRV_SECTOR_SIZE, > + nb_sectors * BDRV_SECTOR_SIZE, > + &count); > if (ret < 0) { > ret = 3; > error_report("Sector allocation test failed for %s", > @@ -1461,7 +1465,7 @@ static int img_compare(int argc, char **argv) > goto out; > > } > - nb_sectors = pnum; > + nb_sectors = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE); > if (ret) { > ret = 
check_empty_sectors(blk_over, sector_num, nb_sectors, > filename_over, buf1, quiet); > Reviewed-by: John Snow <jsnow@redhat.com>
On 04/24/2017 06:06 PM, John Snow wrote: > > > On 04/11/2017 06:29 PM, Eric Blake wrote: >> We are gradually moving away from sector-based interfaces, towards >> byte-based. In the common case, allocation is unlikely to ever use >> values that are not naturally sector-aligned, but it is possible >> that byte-based values will let us be more precise about allocation >> at the end of an unaligned file that can do byte-based access. >> >> Changing the signature of the function to use int64_t *pnum ensures >> that the compiler enforces that all callers are updated. For now, >> the io.c layer still assert()s that all callers are sector-aligned, >> but that can be relaxed when a later patch implements byte-based >> block status. Therefore, for the most part this patch is just the >> addition of scaling at the callers followed by inverse scaling at >> bdrv_is_allocated(). But some code, particularly stream_run(), >> gets a lot simpler because it no longer has to mess with sectors. >> >> +++ b/block/io.c >> @@ -1930,52 +1930,46 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, >> /* >> * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] >> * >> - * Return true if the given sector is allocated in any image between >> + * Return true if the given offset is allocated in any image between > > perhaps "range" instead of "offset"? > >> * BASE and TOP (inclusive). BASE can be NULL to check if the given >> - * sector is allocated in any image of the chain. Return false otherwise, >> + * offset is allocated in any image of the chain. Return false otherwise, >> * or negative errno on failure. Seems reasonable. >> /* >> - * [sector_num, nb_sectors] is unallocated on top but intermediate >> - * might have >> - * >> - * [sector_num+x, nr_sectors] allocated. >> + * [offset, bytes] is unallocated on top but intermediate >> + * might have [offset+x, bytes-x] allocated. 
>> */ >> - if (n > psectors_inter && >> + if (n > pnum_inter && >> (intermediate == top || >> - sector_num + psectors_inter < intermediate->total_sectors)) { > > > >> - n = psectors_inter; >> + offset + pnum_inter < (intermediate->total_sectors * >> + BDRV_SECTOR_SIZE))) { > > Naive question: not worth using either bdrv_getlength for bytes, or the > bdrv_nb_sectors helpers? bdrv_getlength(intermediate) should indeed be the same as intermediate->total_sectors * BDRV_SECTOR_SIZE (for now - ultimately it would be nice to track a byte length rather than a sector length for images). I can make that cleanup for v2. > > Reviewed-by: John Snow <jsnow@redhat.com> >
On 04/24/2017 08:48 PM, Eric Blake wrote: > On 04/24/2017 06:06 PM, John Snow wrote: >> >> >> On 04/11/2017 06:29 PM, Eric Blake wrote: >>> We are gradually moving away from sector-based interfaces, towards >>> byte-based. In the common case, allocation is unlikely to ever use >>> values that are not naturally sector-aligned, but it is possible >>> that byte-based values will let us be more precise about allocation >>> at the end of an unaligned file that can do byte-based access. >>> >>> Changing the signature of the function to use int64_t *pnum ensures >>> that the compiler enforces that all callers are updated. For now, >>> the io.c layer still assert()s that all callers are sector-aligned, >>> but that can be relaxed when a later patch implements byte-based >>> block status. Therefore, for the most part this patch is just the >>> addition of scaling at the callers followed by inverse scaling at >>> bdrv_is_allocated(). But some code, particularly stream_run(), >>> gets a lot simpler because it no longer has to mess with sectors. >>> > >>> +++ b/block/io.c >>> @@ -1930,52 +1930,46 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, >>> /* >>> * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] >>> * >>> - * Return true if the given sector is allocated in any image between >>> + * Return true if the given offset is allocated in any image between >> >> perhaps "range" instead of "offset"? >> >>> * BASE and TOP (inclusive). BASE can be NULL to check if the given >>> - * sector is allocated in any image of the chain. Return false otherwise, >>> + * offset is allocated in any image of the chain. Return false otherwise, >>> * or negative errno on failure. > > Seems reasonable. Actually, not quite. 
Suppose we have the following sector allocations: backing: -- -- XX -- active: -- -- -- XX bdrv_is_allocated_above(active, NULL, 0, 2048, &num) will return 0 with num set to 1024 (offset is not allocated, and the not-allocated range is at least 1024 bytes). But calling bdrv_is_allocated_above(backing, NULL, 1024, 1024, &num) will return 1 with num set to 512 (not the entire range is allocated, but the 512-byte prefix of the range IS allocated). Meanwhile, note that: bdrv_is_allocated_above(active, NULL, 1024, 1024, &num) will ALSO return 1 with num set to 512, even though it would be nicer if it could return 1024 (from the active layer, all 1024 bytes of the given range ARE allocated, just not from the same location). So callers have to manually coalesce multiple bdrv_is_allocated_above() calls to get a full picture of what is allocated. The same is ALSO true if you have a fragmented image. For example: $ qemu-img create -f qcow2 -o cluster_size=1m file3 10m $ qemu-io -f qcow2 -c 'w 0 5m' -c 'discard 0 2m' -c 'w 1m 1m' \ -c 'w 0 1m' -c 'w 8m 2m' file3 The image is now fragmented (the clusters at 0 and 1m swapped mappings): [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": true, "offset": 6291456}, { "start": 1048576, "length": 1048576, "depth": 0, "zero": false, "data": true, "offset": 5242880}, { "start": 2097152, "length": 3145728, "depth": 0, "zero": false, "data": true, "offset": 7340032}, { "start": 5242880, "length": 3145728, "depth": 0, "zero": true, "data": false}, { "start": 8388608, "length": 2097152, "depth": 0, "zero": false, "data": true, "offset": 10485760}] 'qemu-io alloc' and 'qemu-io map' are callers which coalesce similar results: $ ./qemu-io -f qcow2 file3 qemu-io> alloc 0 10m 7340032/10485760 bytes allocated at offset 0 bytes qemu-io> map 5 MiB (0x500000) bytes allocated at offset 0 bytes (0x0) 3 MiB (0x300000) bytes not allocated at offset 5 MiB (0x500000) 2 MiB (0x200000) bytes allocated at offset 8 MiB (0x800000) although 
tracing under gdb showed 5 calls to bdrv_is_allocated() during 'alloc' (at [0, 10m], [1m, 9m], [2m, 8m], [5m, 5m], [8m, 2m]), and 8 calls during 'map' (at [0, 10m], [1m, 9m], [2m, 8m], [5m, 5m], [5m, 5m], [8m, 2m], [8m, 2m], [10m, 0]). Part of that is that the qemu-io map implementation is rather inefficient - it makes a wasted query for 0 bytes, and at every transition between allocated and unallocated it ends up asking for status on the same offset a second time around, rather than remembering what it already learned on the previous iteration. It might be worth followup patches to improve the efficiency of qemu-io map; but also to change semantics such that bdrv_is_allocated_above() gives the largest answer possible, rather than making the callers coalesce identical answers. Part of that would include teaching .bdrv_co_get_block_status() new semantics: if file==NULL, then don't return BDRV_BLOCK_OFFSET_VALID, but merely grab as much BDRV_BLOCK_DATA/BDRV_BLOCK_ZERO information as possible (in spite of fragmentation) [in the example above, it would mean returning BDRV_BLOCK_DATA for 5m, rather than three separate returns of 1m/1m/3m]; as well as teaching bdrv_is_allocated_above() to concatenate all answers along the backing chain until it reaches an actual change in status. Looks like I've just added more work to my queue. That said, there still has to be coalescing somewhere. For example, qcow2 images can easily return status for any range covered by a single L2 cluster, but intentionally quits its response at the end of an L2 cluster because the cost of loading up the next cluster (with the potential of dropping locks and racing with other threads doing writes) becomes too hard to control within the qcow2 layer, and callers may need to realize that the larger a request is on an actively-changing image, the more likely the overall response can be inaccurate by the time multiple sub-queries have been coalesced.
On 04/24/2017 08:48 PM, Eric Blake wrote: >> >> >>> - n = psectors_inter; >>> + offset + pnum_inter < (intermediate->total_sectors * >>> + BDRV_SECTOR_SIZE))) { >> >> Naive question: not worth using either bdrv_getlength for bytes, or the >> bdrv_nb_sectors helpers? > > bdrv_getlength(intermediate) should indeed be the same as > intermediate->total_sectors * BDRV_SECTOR_SIZE (for now - ultimately it > would be nice to track a byte length rather than a sector length for > images). I can make that cleanup for v2. This one's tricky. Calling bdrv_nb_sectors()/bdrv_getlength() (the two are identical other than scale) guarantees that you have a sane answer for a variably-sized BDS, but can fail with -ENOMEDIUM. Which, given the position in the 'if' clause, makes it a difficult rewrite to properly catch. On the other hand, since we just barely called bdrv_is_allocated(intermediate), which in turn called bdrv_co_get_block_status(), and that calls bdrv_nb_sectors(), we are assured that intermediate->total_sectors has not changed in the meantime. So my options are to either add a big comment stating why we are safe, or to use bdrv_getlength() anyways but with proper error checking. Best done as a separate patch from the conversion in scale; so I'll do that for v2.
diff --git a/include/block/block.h b/include/block/block.h index 8641149..740cb86 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -425,7 +425,7 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs, int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, int64_t *pnum); int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, - int64_t sector_num, int nb_sectors, int *pnum); + int64_t offset, int64_t bytes, int64_t *pnum); bool bdrv_is_read_only(BlockDriverState *bs); bool bdrv_is_sg(BlockDriverState *bs); diff --git a/block/commit.c b/block/commit.c index 4d6bb2a..989de7d 100644 --- a/block/commit.c +++ b/block/commit.c @@ -132,7 +132,7 @@ static void coroutine_fn commit_run(void *opaque) int64_t offset; uint64_t delay_ns = 0; int ret = 0; - int n = 0; /* sectors */ + int64_t n = 0; /* bytes */ void *buf = NULL; int bytes_written = 0; int64_t base_len; @@ -157,7 +157,7 @@ static void coroutine_fn commit_run(void *opaque) buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE); - for (offset = 0; offset < s->common.len; offset += n * BDRV_SECTOR_SIZE) { + for (offset = 0; offset < s->common.len; offset += n) { bool copy; /* Note that even when no rate limit is applied we need to yield @@ -169,15 +169,12 @@ static void coroutine_fn commit_run(void *opaque) } /* Copy if allocated above the base */ ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), - offset / BDRV_SECTOR_SIZE, - COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, - &n); + offset, COMMIT_BUFFER_SIZE, &n); copy = (ret == 1); - trace_commit_one_iteration(s, offset, n * BDRV_SECTOR_SIZE, ret); + trace_commit_one_iteration(s, offset, n, ret); if (copy) { - ret = commit_populate(s->top, s->base, offset, - n * BDRV_SECTOR_SIZE, buf); - bytes_written += n * BDRV_SECTOR_SIZE; + ret = commit_populate(s->top, s->base, offset, n, buf); + bytes_written += n; } if (ret < 0) { BlockErrorAction action = @@ -190,11 +187,10 @@ static void coroutine_fn commit_run(void 
*opaque) } } /* Publish progress */ - s->common.offset += n * BDRV_SECTOR_SIZE; + s->common.offset += n; if (copy && s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, - n * BDRV_SECTOR_SIZE); + delay_ns = ratelimit_calculate_delay(&s->limit, n); } } diff --git a/block/io.c b/block/io.c index 438a493..9218329 100644 --- a/block/io.c +++ b/block/io.c @@ -1930,52 +1930,46 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset, /* * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] * - * Return true if the given sector is allocated in any image between + * Return true if the given offset is allocated in any image between * BASE and TOP (inclusive). BASE can be NULL to check if the given - * sector is allocated in any image of the chain. Return false otherwise, + * offset is allocated in any image of the chain. Return false otherwise, * or negative errno on failure. * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same + * 'pnum' is set to the number of bytes (including and immediately following + * the specified offset) that are known to be in the same * allocated/unallocated state. 
* */ int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, - int64_t sector_num, - int nb_sectors, int *pnum) + int64_t offset, int64_t bytes, int64_t *pnum) { BlockDriverState *intermediate; - int ret, n = nb_sectors; + int ret; + int64_t n = bytes; intermediate = top; while (intermediate && intermediate != base) { int64_t pnum_inter; - int psectors_inter; - ret = bdrv_is_allocated(intermediate, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE, - &pnum_inter); + ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter); if (ret < 0) { return ret; } - assert(pnum_inter < INT_MAX * BDRV_SECTOR_SIZE); - psectors_inter = pnum_inter >> BDRV_SECTOR_BITS; if (ret) { - *pnum = psectors_inter; + *pnum = pnum_inter; return 1; } /* - * [sector_num, nb_sectors] is unallocated on top but intermediate - * might have - * - * [sector_num+x, nr_sectors] allocated. + * [offset, bytes] is unallocated on top but intermediate + * might have [offset+x, bytes-x] allocated. 
*/ - if (n > psectors_inter && + if (n > pnum_inter && (intermediate == top || - sector_num + psectors_inter < intermediate->total_sectors)) { - n = psectors_inter; + offset + pnum_inter < (intermediate->total_sectors * + BDRV_SECTOR_SIZE))) { + n = pnum_inter; } intermediate = backing_bs(intermediate); diff --git a/block/mirror.c b/block/mirror.c index 8de0492..c92335a 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -609,6 +609,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) BlockDriverState *bs = s->source; BlockDriverState *target_bs = blk_bs(s->target); int ret, n; + int64_t count; end = s->bdev_length / BDRV_SECTOR_SIZE; @@ -658,11 +659,13 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) return 0; } - ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n); + ret = bdrv_is_allocated_above(bs, base, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, &count); if (ret < 0) { return ret; } + n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE); assert(n > 0); if (ret == 1) { bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); diff --git a/block/replication.c b/block/replication.c index 414ecc4..605d90f 100644 --- a/block/replication.c +++ b/block/replication.c @@ -264,7 +264,8 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs, BdrvChild *top = bs->file; BdrvChild *base = s->secondary_disk; BdrvChild *target; - int ret, n; + int ret; + int64_t n; ret = replication_get_io_status(s); if (ret < 0) { @@ -283,14 +284,20 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs, */ qemu_iovec_init(&hd_qiov, qiov->niov); while (remaining_sectors > 0) { - ret = bdrv_is_allocated_above(top->bs, base->bs, sector_num, - remaining_sectors, &n); + int64_t count; + + ret = bdrv_is_allocated_above(top->bs, base->bs, + sector_num * BDRV_SECTOR_SIZE, + remaining_sectors * BDRV_SECTOR_SIZE, + &count); if (ret < 0) { goto out1; } + assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE)); + n = count >> 
BDRV_SECTOR_BITS; qemu_iovec_reset(&hd_qiov); - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, n * BDRV_SECTOR_SIZE); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count); target = ret ? top : base; ret = bdrv_co_writev(target, sector_num, n, &hd_qiov); @@ -300,7 +307,7 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs, remaining_sectors -= n; sector_num += n; - bytes_done += n * BDRV_SECTOR_SIZE; + bytes_done += count; } out1: diff --git a/block/stream.c b/block/stream.c index 85502eb..9033655 100644 --- a/block/stream.c +++ b/block/stream.c @@ -112,7 +112,7 @@ static void coroutine_fn stream_run(void *opaque) uint64_t delay_ns = 0; int error = 0; int ret = 0; - int n = 0; /* sectors */ + int64_t n = 0; /* bytes */ void *buf; if (!bs->backing) { @@ -136,9 +136,8 @@ static void coroutine_fn stream_run(void *opaque) bdrv_enable_copy_on_read(bs); } - for ( ; offset < s->common.len; offset += n * BDRV_SECTOR_SIZE) { + for ( ; offset < s->common.len; offset += n) { bool copy; - int64_t count = 0; /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. @@ -150,26 +149,25 @@ static void coroutine_fn stream_run(void *opaque) copy = false; - ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &count); - n = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE); + ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &n); if (ret == 1) { /* Allocated in the top, no need to copy. */ } else if (ret >= 0) { /* Copy if allocated in the intermediate images. Limit to the * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE). 
*/ ret = bdrv_is_allocated_above(backing_bs(bs), base, - offset / BDRV_SECTOR_SIZE, n, &n); + offset, n, &n); /* Finish early if end of backing file has been reached */ if (ret == 0 && n == 0) { - n = (s->common.len - offset) / BDRV_SECTOR_SIZE; + n = s->common.len - offset; } copy = (ret == 1); } - trace_stream_one_iteration(s, offset, n * BDRV_SECTOR_SIZE, ret); + trace_stream_one_iteration(s, offset, n, ret); if (copy) { - ret = stream_populate(blk, offset, n * BDRV_SECTOR_SIZE, buf); + ret = stream_populate(blk, offset, n, buf); } if (ret < 0) { BlockErrorAction action = @@ -188,10 +186,9 @@ static void coroutine_fn stream_run(void *opaque) ret = 0; /* Publish progress */ - s->common.offset += n * BDRV_SECTOR_SIZE; + s->common.offset += n; if (copy && s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, - n * BDRV_SECTOR_SIZE); + delay_ns = ratelimit_calculate_delay(&s->limit, n); } } diff --git a/qemu-img.c b/qemu-img.c index 2f21d69..d96b4d6 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1448,12 +1448,16 @@ static int img_compare(int argc, char **argv) } for (;;) { + int64_t count; + nb_sectors = sectors_to_process(total_sectors_over, sector_num); if (nb_sectors <= 0) { break; } - ret = bdrv_is_allocated_above(blk_bs(blk_over), NULL, sector_num, - nb_sectors, &pnum); + ret = bdrv_is_allocated_above(blk_bs(blk_over), NULL, + sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, + &count); if (ret < 0) { ret = 3; error_report("Sector allocation test failed for %s", @@ -1461,7 +1465,7 @@ static int img_compare(int argc, char **argv) goto out; } - nb_sectors = pnum; + nb_sectors = DIV_ROUND_UP(count, BDRV_SECTOR_SIZE); if (ret) { ret = check_empty_sectors(blk_over, sector_num, nb_sectors, filename_over, buf1, quiet);
We are gradually moving away from sector-based interfaces, towards byte-based. In the common case, allocation is unlikely to ever use values that are not naturally sector-aligned, but it is possible that byte-based values will let us be more precise about allocation at the end of an unaligned file that can do byte-based access. Changing the signature of the function to use int64_t *pnum ensures that the compiler enforces that all callers are updated. For now, the io.c layer still assert()s that all callers are sector-aligned, but that can be relaxed when a later patch implements byte-based block status. Therefore, for the most part this patch is just the addition of scaling at the callers followed by inverse scaling at bdrv_is_allocated(). But some code, particularly stream_run(), gets a lot simpler because it no longer has to mess with sectors. For ease of review, bdrv_is_allocated() was tackled separately. Signed-off-by: Eric Blake <eblake@redhat.com> --- include/block/block.h | 2 +- block/commit.c | 20 ++++++++------------ block/io.c | 36 +++++++++++++++--------------------- block/mirror.c | 5 ++++- block/replication.c | 17 ++++++++++++----- block/stream.c | 21 +++++++++------------ qemu-img.c | 10 +++++++--- 7 files changed, 56 insertions(+), 55 deletions(-)