diff mbox series

[v3,03/20] file-posix: Switch to .bdrv_co_block_status()

Message ID 20170914144032.14945-4-eblake@redhat.com
State New
Headers show
Series add byte-based block_status driver callbacks | expand

Commit Message

Eric Blake Sept. 14, 2017, 2:40 p.m. UTC
We are gradually moving away from sector-based interfaces, towards
byte-based.  Update the file protocol driver accordingly.  In mapping
mode, note that the entire file is reported as allocated, so we can
take a shortcut and skip lseek().

Signed-off-by: Eric Blake <eblake@redhat.com>

---
v2: tweak comment, add mapping support
---
 block/file-posix.c | 57 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 25 deletions(-)

Comments

Fam Zheng Sept. 20, 2017, 9:57 a.m. UTC | #1
On Thu, 09/14 09:40, Eric Blake wrote:
> We are gradually moving away from sector-based interfaces, towards
> byte-based.  Update the file protocol driver accordingly.  In mapping
> mode, note that the entire file is reported as allocated, so we can
> take a shortcut and skip lseek().
> 
> Signed-off-by: Eric Blake <eblake@redhat.com>
> 
> ---
> v2: tweak comment, add mapping support
> ---
>  block/file-posix.c | 57 ++++++++++++++++++++++++++++++------------------------
>  1 file changed, 32 insertions(+), 25 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 72ecfbb0e0..6813059867 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -2107,24 +2107,25 @@ static int find_allocation(BlockDriverState *bs, off_t start,
>  }
> 
>  /*
> - * Returns the allocation status of the specified sectors.
> + * Returns the allocation status of the specified offset.
>   *
> - * If 'sector_num' is beyond the end of the disk image the return value is 0
> + * If 'offset' is beyond the end of the disk image the return value is 0
>   * and 'pnum' is set to 0.
>   *
> - * 'pnum' is set to the number of sectors (including and immediately following
> - * the specified sector) that are known to be in the same
> + * 'pnum' is set to the number of bytes (including and immediately following
> + * the specified offset) that are known to be in the same
>   * allocated/unallocated state.
>   *
> - * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
> + * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
>   * beyond the end of the disk image it will be clamped.
>   */
> -static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
> -                                                    int64_t sector_num,
> -                                                    int nb_sectors, int *pnum,
> -                                                    BlockDriverState **file)
> +static int64_t coroutine_fn raw_co_block_status(BlockDriverState *bs,
> +                                                bool mapping,
> +                                                int64_t offset,
> +                                                int64_t bytes, int64_t *pnum,
> +                                                BlockDriverState **file)
>  {
> -    off_t start, data = 0, hole = 0;
> +    off_t data = 0, hole = 0;
>      int64_t total_size;
>      int ret;
> 
> @@ -2133,39 +2134,45 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
>          return ret;
>      }
> 
> -    start = sector_num * BDRV_SECTOR_SIZE;
>      total_size = bdrv_getlength(bs);
>      if (total_size < 0) {
>          return total_size;
> -    } else if (start >= total_size) {
> +    } else if (offset >= total_size) {
>          *pnum = 0;
>          return 0;
> -    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
> -        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
> +    } else if (offset + bytes > total_size) {
> +        bytes = total_size - offset;
>      }
> 
> -    ret = find_allocation(bs, start, &data, &hole);
> +    if (!mapping) {
> +        *pnum = bytes;
> +        *file = bs;
> +        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID |
> +            (offset & BDRV_BLOCK_OFFSET_MASK);
> +    }

I may be missing something, because the last time I tried to understand the
rationale behind "mapping" was already some time ago: shouldn't we still
distinguish hole and data? What will omitting BDRV_BLOCK_ZERO help?

Fam

> +
> +    ret = find_allocation(bs, offset, &data, &hole);
>      if (ret == -ENXIO) {
>          /* Trailing hole */
> -        *pnum = nb_sectors;
> +        *pnum = bytes;
>          ret = BDRV_BLOCK_ZERO;
>      } else if (ret < 0) {
>          /* No info available, so pretend there are no holes */
> -        *pnum = nb_sectors;
> +        *pnum = bytes;
>          ret = BDRV_BLOCK_DATA;
> -    } else if (data == start) {
> -        /* On a data extent, compute sectors to the end of the extent,
> +    } else if (data == offset) {
> +        /* On a data extent, compute bytes to the end of the extent,
>           * possibly including a partial sector at EOF. */
> -        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
> +        *pnum = MIN(bytes, hole - offset);
>          ret = BDRV_BLOCK_DATA;
>      } else {
> -        /* On a hole, compute sectors to the beginning of the next extent.  */
> -        assert(hole == start);
> -        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
> +        /* On a hole, compute bytes to the beginning of the next extent.  */
> +        assert(hole == offset);
> +        *pnum = MIN(bytes, data - offset);
>          ret = BDRV_BLOCK_ZERO;
>      }
>      *file = bs;
> -    return ret | BDRV_BLOCK_OFFSET_VALID | start;
> +    return ret | BDRV_BLOCK_OFFSET_VALID | (offset & BDRV_BLOCK_OFFSET_MASK);
>  }
> 
>  static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
> @@ -2259,7 +2266,7 @@ BlockDriver bdrv_file = {
>      .bdrv_close = raw_close,
>      .bdrv_create = raw_create,
>      .bdrv_has_zero_init = bdrv_has_zero_init_1,
> -    .bdrv_co_get_block_status = raw_co_get_block_status,
> +    .bdrv_co_block_status = raw_co_block_status,
>      .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
> 
>      .bdrv_co_preadv         = raw_co_preadv,
> -- 
> 2.13.5
>
Eric Blake Sept. 20, 2017, 1:47 p.m. UTC | #2
On 09/20/2017 04:57 AM, Fam Zheng wrote:
> On Thu, 09/14 09:40, Eric Blake wrote:
>> We are gradually moving away from sector-based interfaces, towards
>> byte-based.  Update the file protocol driver accordingly.  In mapping
>> mode, note that the entire file is reported as allocated, so we can
>> take a shortcut and skip lseek().
>>
>> Signed-off-by: Eric Blake <eblake@redhat.com>
>>

>>
>> -    ret = find_allocation(bs, start, &data, &hole);
>> +    if (!mapping) {
>> +        *pnum = bytes;
>> +        *file = bs;
>> +        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID |
>> +            (offset & BDRV_BLOCK_OFFSET_MASK);
>> +    }
> 
> I may be missing something, because the last time I tried to understand the
> rationale behind "mapping" was already some time ago: shouldn't we still
> distinguish hole and data? What will omitting BDRV_BLOCK_ZERO help?

Hmm, the commit message is slightly off (in part, because I switched the
sense of the bool flag between series revisions, but did not properly
update the commit text to match).  In mapping mode, we want to return as
much information as possible (the client is something like 'qemu-img
map'), including where the holes lie.  But when we are NOT in mapping
mode, we care more about learning which portions of the file are
described in the current layer of the backing chain, rather than
delegating to another layer, regardless of whether the read will see
data or zeroes.  By the time we are at the POSIX file protocol layer, we
know that every byte in the file system/block device has a 1:1 mapping
to the bytes that the guest will read (we do not delegate to any backing
file), so we can simply report the entire remainder of the file as
allocated without worrying about holes.

Here's where the mapping flag was added and semantics documented (in
series 3; whereas the current email is series 4):
https://lists.gnu.org/archive/html/qemu-devel/2017-09/msg03542.html

So what I really need to do is fix the commit message to read:

In mapping mode, we must report as much information as possible about
where holes can be found; but when we don't care about mapping, the user
is more interested in how much of the guest view will come from the
current layer rather than delegating to some other BDS.  At the file
protocol layer, all of the remainder of the file fits that description,
so we can take a shortcut and skip lseek(), which also gives a larger
*pnum result.

(the same comment probably applies to several other patches in the series)
Fam Zheng Sept. 22, 2017, 5:54 a.m. UTC | #3
On Wed, 09/20 08:47, Eric Blake wrote:
> On 09/20/2017 04:57 AM, Fam Zheng wrote:
> > On Thu, 09/14 09:40, Eric Blake wrote:
> >> We are gradually moving away from sector-based interfaces, towards
> >> byte-based.  Update the file protocol driver accordingly.  In mapping
> >> mode, note that the entire file is reported as allocated, so we can
> >> take a shortcut and skip lseek().
> >>
> >> Signed-off-by: Eric Blake <eblake@redhat.com>
> >>
> 
> >>
> >> -    ret = find_allocation(bs, start, &data, &hole);
> >> +    if (!mapping) {
> >> +        *pnum = bytes;
> >> +        *file = bs;
> >> +        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID |
> >> +            (offset & BDRV_BLOCK_OFFSET_MASK);
> >> +    }
> > 
> > I may be missing something, because the last time I tried to understand the
> > rationale behind "mapping" was already some time ago: shouldn't we still
> > distinguish hole and data? What will omitting BDRV_BLOCK_ZERO help?
> 
> Hmm, the commit message is slightly off (in part, because I switched the
> sense of the bool flag between series revisions, but did not properly
> update the commit text to match).  In mapping mode, we want to return as
> much information as possible (the client is something like 'qemu-img
> map'), including where the holes lie.  But when we are NOT in mapping
> mode, we care more about learning which portions of the file are
> described in the current layer of the backing chain, rather than
> delegating to another layer, regardless of whether the read will see
> data or zeroes.  By the time we are at the POSIX file protocol layer, we
> know that every byte in the file system/block device has a 1:1 mapping
> to the bytes that the guest will read (we do not delegate to any backing
> file), so we can simply report the entire remainder of the file as
> allocated without worrying about holes.

Thanks, it would be good if this explanation could be added to the comment for
the "mapping" parameter, so that the actual intention is easy to understand in
the future.

> 
> Here's where the mapping flag was added and semantics documented (in
> series 3; whereas the current email is series 4):
> https://lists.gnu.org/archive/html/qemu-devel/2017-09/msg03542.html
> 
> So what I really need to do is fix the commit message to read:
> 
> In mapping mode, we must report as much information as possible about
> where holes can be found; but when we don't care about mapping, the user
> is more interested in how much of the guest view will come from the
> current layer rather than delegating to some other BDS, and we can take
> the shortcut that all of the remainder of the file fits that
> description, and therefore take a shortcut and skip lseek() for a larger
> *pnum result.
> 
> (the same comment probably applies to several other patches in the series)
> 

Fam
diff mbox series

Patch

diff --git a/block/file-posix.c b/block/file-posix.c
index 72ecfbb0e0..6813059867 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2107,24 +2107,25 @@  static int find_allocation(BlockDriverState *bs, off_t start,
 }

 /*
- * Returns the allocation status of the specified sectors.
+ * Returns the allocation status of the specified offset.
  *
- * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * If 'offset' is beyond the end of the disk image the return value is 0
  * and 'pnum' is set to 0.
  *
- * 'pnum' is set to the number of sectors (including and immediately following
- * the specified sector) that are known to be in the same
+ * 'pnum' is set to the number of bytes (including and immediately following
+ * the specified offset) that are known to be in the same
  * allocated/unallocated state.
  *
- * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
+ * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
  * beyond the end of the disk image it will be clamped.
  */
-static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
-                                                    int64_t sector_num,
-                                                    int nb_sectors, int *pnum,
-                                                    BlockDriverState **file)
+static int64_t coroutine_fn raw_co_block_status(BlockDriverState *bs,
+                                                bool mapping,
+                                                int64_t offset,
+                                                int64_t bytes, int64_t *pnum,
+                                                BlockDriverState **file)
 {
-    off_t start, data = 0, hole = 0;
+    off_t data = 0, hole = 0;
     int64_t total_size;
     int ret;

@@ -2133,39 +2134,45 @@  static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
         return ret;
     }

-    start = sector_num * BDRV_SECTOR_SIZE;
     total_size = bdrv_getlength(bs);
     if (total_size < 0) {
         return total_size;
-    } else if (start >= total_size) {
+    } else if (offset >= total_size) {
         *pnum = 0;
         return 0;
-    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
-        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
+    } else if (offset + bytes > total_size) {
+        bytes = total_size - offset;
     }

-    ret = find_allocation(bs, start, &data, &hole);
+    if (!mapping) {
+        *pnum = bytes;
+        *file = bs;
+        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID |
+            (offset & BDRV_BLOCK_OFFSET_MASK);
+    }
+
+    ret = find_allocation(bs, offset, &data, &hole);
     if (ret == -ENXIO) {
         /* Trailing hole */
-        *pnum = nb_sectors;
+        *pnum = bytes;
         ret = BDRV_BLOCK_ZERO;
     } else if (ret < 0) {
         /* No info available, so pretend there are no holes */
-        *pnum = nb_sectors;
+        *pnum = bytes;
         ret = BDRV_BLOCK_DATA;
-    } else if (data == start) {
-        /* On a data extent, compute sectors to the end of the extent,
+    } else if (data == offset) {
+        /* On a data extent, compute bytes to the end of the extent,
          * possibly including a partial sector at EOF. */
-        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
+        *pnum = MIN(bytes, hole - offset);
         ret = BDRV_BLOCK_DATA;
     } else {
-        /* On a hole, compute sectors to the beginning of the next extent.  */
-        assert(hole == start);
-        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
+        /* On a hole, compute bytes to the beginning of the next extent.  */
+        assert(hole == offset);
+        *pnum = MIN(bytes, data - offset);
         ret = BDRV_BLOCK_ZERO;
     }
     *file = bs;
-    return ret | BDRV_BLOCK_OFFSET_VALID | start;
+    return ret | BDRV_BLOCK_OFFSET_VALID | (offset & BDRV_BLOCK_OFFSET_MASK);
 }

 static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
@@ -2259,7 +2266,7 @@  BlockDriver bdrv_file = {
     .bdrv_close = raw_close,
     .bdrv_create = raw_create,
     .bdrv_has_zero_init = bdrv_has_zero_init_1,
-    .bdrv_co_get_block_status = raw_co_get_block_status,
+    .bdrv_co_block_status = raw_co_block_status,
     .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,

     .bdrv_co_preadv         = raw_co_preadv,