diff mbox

[09/13] qed: Convert to bdrv_co_pwrite_zeroes()

Message ID 1464128732-12667-10-git-send-email-eblake@redhat.com
State New
Headers show

Commit Message

Eric Blake May 24, 2016, 10:25 p.m. UTC
Another step on our continuing quest to switch to byte-based
interfaces.

Kill an abuse of the comma operator while at it (fortunately,
the semantics were still right).

Signed-off-by: Eric Blake <eblake@redhat.com>
---
 block/qed.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

Comments

Kevin Wolf May 25, 2016, 2:07 p.m. UTC | #1
Am 25.05.2016 um 00:25 hat Eric Blake geschrieben:
> Another step on our continuing quest to switch to byte-based
> interfaces.
> 
> Kill an abuse of the comma operator while at it (fortunately,
> the semantics were still right).
> 
> Signed-off-by: Eric Blake <eblake@redhat.com>
> ---
>  block/qed.c | 25 +++++++++++++------------
>  1 file changed, 13 insertions(+), 12 deletions(-)
> 
> diff --git a/block/qed.c b/block/qed.c
> index 0ab5b40..a0be886 100644
> --- a/block/qed.c
> +++ b/block/qed.c
> @@ -1419,7 +1419,7 @@ typedef struct {
>      bool done;
>  } QEDWriteZeroesCB;
> 
> -static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
> +static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
>  {
>      QEDWriteZeroesCB *cb = opaque;
> 
> @@ -1430,10 +1430,10 @@ static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
>      }
>  }
> 
> -static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
> -                                                 int64_t sector_num,
> -                                                 int nb_sectors,
> -                                                 BdrvRequestFlags flags)
> +static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
> +                                                  int64_t offset,
> +                                                  int count,
> +                                                  BdrvRequestFlags flags)
>  {
>      BlockAIOCB *blockacb;
>      BDRVQEDState *s = bs->opaque;
> @@ -1443,10 +1443,10 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
> 
>      /* Refuse if there are untouched backing file sectors */
>      if (bs->backing) {
> -        if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
> +        if (qed_offset_into_cluster(s, offset) != 0) {
>              return -ENOTSUP;
>          }
> -        if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
> +        if (qed_offset_into_cluster(s, count) != 0) {
>              return -ENOTSUP;
>          }
>      }

Unaligned requests are only emulated if there is no backing file...

> @@ -1454,12 +1454,13 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
>      /* Zero writes start without an I/O buffer.  If a buffer becomes necessary
>       * then it will be allocated during request processing.
>       */
> -    iov.iov_base = NULL,
> -    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE,
> +    iov.iov_base = NULL;
> +    iov.iov_len = count;
> 
>      qemu_iovec_init_external(&qiov, &iov, 1);
> -    blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
> -                             qed_co_write_zeroes_cb, &cb,
> +    blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov,
> +                             count >> BDRV_SECTOR_BITS,

...so offset and count can still be unaligned here and we end up zeroing
out the wrong part of the sector. I guess we need to return -ENOTSUP for
all sub-sector requests, even without a backing file.

> +                             qed_co_pwrite_zeroes_cb, &cb,
>                               QED_AIOCB_WRITE | QED_AIOCB_ZERO);
>      if (!blockacb) {
>          return -EIO;

Kevin
Eric Blake May 25, 2016, 2:28 p.m. UTC | #2
On 05/25/2016 08:07 AM, Kevin Wolf wrote:
> Am 25.05.2016 um 00:25 hat Eric Blake geschrieben:
>> Another step on our continuing quest to switch to byte-based
>> interfaces.
>>
>> Kill an abuse of the comma operator while at it (fortunately,
>> the semantics were still right).
>>
>> Signed-off-by: Eric Blake <eblake@redhat.com>
>> ---
>>  block/qed.c | 25 +++++++++++++------------
>>  1 file changed, 13 insertions(+), 12 deletions(-)
>>

>> @@ -1443,10 +1443,10 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
>>
>>      /* Refuse if there are untouched backing file sectors */

The comment wasn't very helpful, so I may rewort it, too
(s/untouched/unaligned/, or something like that)

>>      if (bs->backing) {
>> -        if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
>> +        if (qed_offset_into_cluster(s, offset) != 0) {
>>              return -ENOTSUP;
>>          }
>> -        if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
>> +        if (qed_offset_into_cluster(s, count) != 0) {
>>              return -ENOTSUP;
>>          }
>>      }
> 
> Unaligned requests are only emulated if there is no backing file...
> 
>> @@ -1454,12 +1454,13 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
>>      /* Zero writes start without an I/O buffer.  If a buffer becomes necessary
>>       * then it will be allocated during request processing.
>>       */
>> -    iov.iov_base = NULL,
>> -    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE,
>> +    iov.iov_base = NULL;
>> +    iov.iov_len = count;
>>
>>      qemu_iovec_init_external(&qiov, &iov, 1);
>> -    blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
>> -                             qed_co_write_zeroes_cb, &cb,
>> +    blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov,
>> +                             count >> BDRV_SECTOR_BITS,
> 
> ...so offset and count can still be unaligned here and we end up zeroing
> out the wrong part of the sector. I guess we need to return -ENOTSUP for
> all sub-sector requests, even without a backing file.

Hmm. Wouldn't it be nicer if we could guarantee that blk_pwrite_zeroes()
will never call bdrv_co_pwrite_zeroes() with less than
request_alignment?  That is, if the block layer takes care of
read-modify-write for any unaligned byte offset less than
request_alignment, then the driver only has to worry about sector
alignment.  Except qed.c doesn't seem to set request_alignment, but is
just relying on io.c currently setting it to MAX(BDRV_SECTOR_SIZE,
bs->bl.request_alignment) everywhere. (And the fact that
request_alignment is a sibling rather than a member to BlockLimits bl is
awkward.)

Maybe we want three limits in BlockLimits, rather than two: the current
max_pwrite_zeroes does a good job at saying how small blk_pwrite_zeroes
must fragment large requests, and pwrite_zeroes_alignment does a good
job at saying how large a request must be to potentially punch a hole,
but at least in the case of qcow2, where we want to optimize a partial
write to potentially zeroing an entire cluster, we still want to limit
things to sector boundaries when checking for whether the rest of the
cluster already reads as zeroes, whether or not we also want to support
request_alignment of 1 instead of 512.

There are other drivers that I touched in this series that were relying
on the fact that the block layer currently guarantees sector alignment,
and maybe they should be setting request_alignment, or maybe we want to
add yet another BlockLimit member.  So even if we want normal read/write
to allow request_alignment of 1 in the case where we don't need the
block layer to do a read-modify-write, I'm still wondering whether we
want the write_zeroes engine to have a different minimum alignment and
ALWAYS hand off to normal read-modify-write for anything smaller.
Kevin Wolf May 25, 2016, 3:06 p.m. UTC | #3
Am 25.05.2016 um 16:28 hat Eric Blake geschrieben:
> On 05/25/2016 08:07 AM, Kevin Wolf wrote:
> > Am 25.05.2016 um 00:25 hat Eric Blake geschrieben:
> >> Another step on our continuing quest to switch to byte-based
> >> interfaces.
> >>
> >> Kill an abuse of the comma operator while at it (fortunately,
> >> the semantics were still right).
> >>
> >> Signed-off-by: Eric Blake <eblake@redhat.com>
> >> ---
> >>  block/qed.c | 25 +++++++++++++------------
> >>  1 file changed, 13 insertions(+), 12 deletions(-)
> >>
> 
> >> @@ -1443,10 +1443,10 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
> >>
> >>      /* Refuse if there are untouched backing file sectors */
> 
> The comment wasn't very helpful, so I may rewort it, too
> (s/untouched/unaligned/, or something like that)

I think "unaligned" is not what the comment means. What would an
unaligned sector be anyway?

The idea is probably like in qcow2 that if there is no backing file, the
rest of the cluster already reads as zeros, so we can overwrite it.

> >>      if (bs->backing) {
> >> -        if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
> >> +        if (qed_offset_into_cluster(s, offset) != 0) {
> >>              return -ENOTSUP;
> >>          }
> >> -        if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
> >> +        if (qed_offset_into_cluster(s, count) != 0) {
> >>              return -ENOTSUP;
> >>          }
> >>      }
> > 
> > Unaligned requests are only emulated if there is no backing file...
> > 
> >> @@ -1454,12 +1454,13 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
> >>      /* Zero writes start without an I/O buffer.  If a buffer becomes necessary
> >>       * then it will be allocated during request processing.
> >>       */
> >> -    iov.iov_base = NULL,
> >> -    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE,
> >> +    iov.iov_base = NULL;
> >> +    iov.iov_len = count;
> >>
> >>      qemu_iovec_init_external(&qiov, &iov, 1);
> >> -    blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
> >> -                             qed_co_write_zeroes_cb, &cb,
> >> +    blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov,
> >> +                             count >> BDRV_SECTOR_BITS,
> > 
> > ...so offset and count can still be unaligned here and we end up zeroing
> > out the wrong part of the sector. I guess we need to return -ENOTSUP for
> > all sub-sector requests, even without a backing file.
> 
> Hmm. Wouldn't it be nicer if we could guarantee that blk_pwrite_zeroes()
> will never call bdrv_co_pwrite_zeroes() with less than
> request_alignment?

If we want this restriction, the right place is to implement it is in
bdrv_co_pwrite_zeroes() before calling into the driver, not on the
BlockBackend level.

> That is, if the block layer takes care of
> read-modify-write for any unaligned byte offset less than
> request_alignment, then the driver only has to worry about sector
> alignment.  Except qed.c doesn't seem to set request_alignment, but is
> just relying on io.c currently setting it to MAX(BDRV_SECTOR_SIZE,
> bs->bl.request_alignment) everywhere. (And the fact that
> request_alignment is a sibling rather than a member to BlockLimits bl is
> awkward.)

As explained on IRC, the block driver shouldn't care about sector
alignment. 512 is just an arbitrary number that some interfaces
misguidedly happen to use as their unit for parameters. The QED request
struct is one of them, but there is no real reason for it. There is no
metadata that we have at a sector granularity.

So I would avoid baking assumptions of bad drivers into new interfaces.

> Maybe we want three limits in BlockLimits, rather than two: the current
> max_pwrite_zeroes does a good job at saying how small blk_pwrite_zeroes
> must fragment large requests, and pwrite_zeroes_alignment does a good
> job at saying how large a request must be to potentially punch a hole,
> but at least in the case of qcow2, where we want to optimize a partial
> write to potentially zeroing an entire cluster, we still want to limit
> things to sector boundaries when checking for whether the rest of the
> cluster already reads as zeroes, whether or not we also want to support
> request_alignment of 1 instead of 512.

For qcow2, the only reason that sectors are involved in the optimisation
is that that's the granularity of bdrv_get_block_status(). Once that is
fixed, qcow2 can use bytes for its optimisation.

Until then, if we decided that we don't want to check the full cluster
any more (clusters are always aligned to sectors) but only the area that
is not overwritten, it would have to round start and end of the request
to the sector bounary.

> There are other drivers that I touched in this series that were relying
> on the fact that the block layer currently guarantees sector alignment,
> and maybe they should be setting request_alignment, or maybe we want to
> add yet another BlockLimit member.  So even if we want normal read/write
> to allow request_alignment of 1 in the case where we don't need the
> block layer to do a read-modify-write, I'm still wondering whether we
> want the write_zeroes engine to have a different minimum alignment and
> ALWAYS hand off to normal read-modify-write for anything smaller.

Do you have an example where the restriction to full sectors is
fundamental and not just a shortcoming of the implementation? As long as
it's only the latter, I think using -ENOTSUP to deal with it until we
fix it for good is fine.

Kevin
diff mbox

Patch

diff --git a/block/qed.c b/block/qed.c
index 0ab5b40..a0be886 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1419,7 +1419,7 @@  typedef struct {
     bool done;
 } QEDWriteZeroesCB;

-static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
+static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
 {
     QEDWriteZeroesCB *cb = opaque;

@@ -1430,10 +1430,10 @@  static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret)
     }
 }

-static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
-                                                 int64_t sector_num,
-                                                 int nb_sectors,
-                                                 BdrvRequestFlags flags)
+static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
+                                                  int64_t offset,
+                                                  int count,
+                                                  BdrvRequestFlags flags)
 {
     BlockAIOCB *blockacb;
     BDRVQEDState *s = bs->opaque;
@@ -1443,10 +1443,10 @@  static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,

     /* Refuse if there are untouched backing file sectors */
     if (bs->backing) {
-        if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) {
+        if (qed_offset_into_cluster(s, offset) != 0) {
             return -ENOTSUP;
         }
-        if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) {
+        if (qed_offset_into_cluster(s, count) != 0) {
             return -ENOTSUP;
         }
     }
@@ -1454,12 +1454,13 @@  static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs,
     /* Zero writes start without an I/O buffer.  If a buffer becomes necessary
      * then it will be allocated during request processing.
      */
-    iov.iov_base = NULL,
-    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE,
+    iov.iov_base = NULL;
+    iov.iov_len = count;

     qemu_iovec_init_external(&qiov, &iov, 1);
-    blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors,
-                             qed_co_write_zeroes_cb, &cb,
+    blockacb = qed_aio_setup(bs, offset >> BDRV_SECTOR_BITS, &qiov,
+                             count >> BDRV_SECTOR_BITS,
+                             qed_co_pwrite_zeroes_cb, &cb,
                              QED_AIOCB_WRITE | QED_AIOCB_ZERO);
     if (!blockacb) {
         return -EIO;
@@ -1664,7 +1665,7 @@  static BlockDriver bdrv_qed = {
     .bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
     .bdrv_aio_readv           = bdrv_qed_aio_readv,
     .bdrv_aio_writev          = bdrv_qed_aio_writev,
-    .bdrv_co_write_zeroes     = bdrv_qed_co_write_zeroes,
+    .bdrv_co_pwrite_zeroes    = bdrv_qed_co_pwrite_zeroes,
     .bdrv_truncate            = bdrv_qed_truncate,
     .bdrv_getlength           = bdrv_qed_getlength,
     .bdrv_get_info            = bdrv_qed_get_info,