diff mbox

[v5,14/14] nbd: Implement NBD_CMD_WRITE_ZEROES on client

Message ID 1468901281-22858-15-git-send-email-eblake@redhat.com
State New
Headers show

Commit Message

Eric Blake July 19, 2016, 4:08 a.m. UTC
Upstream NBD protocol recently added the ability to efficiently
write zeroes without having to send the zeroes over the wire,
along with a flag to control whether the client wants a hole.

The generic block code takes care of falling back to the obvious
write of lots of zeroes if we return -ENOTSUP because the server
does not have WRITE_ZEROES.

Ideally, since NBD_CMD_WRITE_ZEROES does not involve any data
over the wire, we want to support transactions that are much
larger than the normal 32M limit imposed on NBD_CMD_WRITE.  But
the server may still have a limit smaller than UINT_MAX, so
until experimental NBD protocol additions for advertising various
command sizes is finalized (see [1], [2]), for now we just stick to
the same limits as normal writes.

[1] https://github.com/yoe/nbd/blob/extension-info/doc/proto.md
[2] https://sourceforge.net/p/nbd/mailman/message/35081223/

Signed-off-by: Eric Blake <eblake@redhat.com>

---
v5: enhance commit message
v4: rebase to byte-based limits
v3: rebase, tell block layer about our support
---
 block/nbd-client.h |  2 ++
 block/nbd-client.c | 35 +++++++++++++++++++++++++++++++++++
 block/nbd.c        |  4 ++++
 3 files changed, 41 insertions(+)

Comments

Fam Zheng July 19, 2016, 6:24 a.m. UTC | #1
On Mon, 07/18 22:08, Eric Blake wrote:
> Upstream NBD protocol recently added the ability to efficiently
> write zeroes without having to send the zeroes over the wire,
> along with a flag to control whether the client wants a hole.
> 
> The generic block code takes care of falling back to the obvious
> write of lots of zeroes if we return -ENOTSUP because the server
> does not have WRITE_ZEROES.
> 
> Ideally, since NBD_CMD_WRITE_ZEROES does not involve any data
> over the wire, we want to support transactions that are much
> larger than the normal 32M limit imposed on NBD_CMD_WRITE.  But
> the server may still have a limit smaller than UINT_MAX, so
> until experimental NBD protocol additions for advertising various
> command sizes is finalized (see [1], [2]), for now we just stick to
> the same limits as normal writes.
> 
> [1] https://github.com/yoe/nbd/blob/extension-info/doc/proto.md
> [2] https://sourceforge.net/p/nbd/mailman/message/35081223/
> 
> Signed-off-by: Eric Blake <eblake@redhat.com>
> 
> ---
> v5: enhance commit message
> v4: rebase to byte-based limits
> v3: rebase, tell block layer about our support
> ---
>  block/nbd-client.h |  2 ++
>  block/nbd-client.c | 35 +++++++++++++++++++++++++++++++++++
>  block/nbd.c        |  4 ++++
>  3 files changed, 41 insertions(+)
> 
> diff --git a/block/nbd-client.h b/block/nbd-client.h
> index 044aca4..2cfe377 100644
> --- a/block/nbd-client.h
> +++ b/block/nbd-client.h
> @@ -48,6 +48,8 @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count);
>  int nbd_client_co_flush(BlockDriverState *bs);
>  int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
>                            uint64_t bytes, QEMUIOVector *qiov, int flags);
> +int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
> +                                int count, BdrvRequestFlags flags);
>  int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
>                           uint64_t bytes, QEMUIOVector *qiov, int flags);
> 
> diff --git a/block/nbd-client.c b/block/nbd-client.c
> index 7e9c3ec..104ba2f 100644
> --- a/block/nbd-client.c
> +++ b/block/nbd-client.c
> @@ -275,6 +275,41 @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
>      return -reply.error;
>  }
> 
> +int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
> +                                int count, BdrvRequestFlags flags)
> +{
> +    ssize_t ret;
> +    NbdClientSession *client = nbd_get_client_session(bs);
> +    struct nbd_request request = {
> +        .type = NBD_CMD_WRITE_ZEROES,
> +        .from = offset,
> +        .len = count,
> +    };
> +    struct nbd_reply reply;
> +
> +    if (!(client->nbdflags & NBD_FLAG_SEND_WRITE_ZEROES)) {
> +        return -ENOTSUP;
> +    }
> +
> +    if (flags & BDRV_REQ_FUA) {
> +        assert(client->nbdflags & NBD_FLAG_SEND_FUA);
> +        request.flags |= NBD_CMD_FLAG_FUA;
> +    }
> +    if (!(flags & BDRV_REQ_MAY_UNMAP)) {

Correct me if I'm wrong, I don't think we care about BDRV_REQ_MAY_UNMAP here,
the NBD protocol can never issue an unmap request. In other words I think
NO_HOLE and MAY_UNMAP are two different things.

Fam

> +        request.flags |= NBD_CMD_FLAG_NO_HOLE;
> +    }
> +
> +    nbd_coroutine_start(client, &request);
> +    ret = nbd_co_send_request(bs, &request, NULL);
> +    if (ret < 0) {
> +        reply.error = -ret;
> +    } else {
> +        nbd_co_receive_reply(client, &request, &reply, NULL);
> +    }
> +    nbd_coroutine_end(client, &request);
> +    return -reply.error;
> +}
> +
>  int nbd_client_co_flush(BlockDriverState *bs)
>  {
>      NbdClientSession *client = nbd_get_client_session(bs);
> diff --git a/block/nbd.c b/block/nbd.c
> index 8d57220..049d1bd 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -357,6 +357,7 @@ static int nbd_co_flush(BlockDriverState *bs)
>  static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>      bs->bl.max_pdiscard = NBD_MAX_BUFFER_SIZE;
> +    bs->bl.max_pwrite_zeroes = NBD_MAX_BUFFER_SIZE;
>      bs->bl.max_transfer = NBD_MAX_BUFFER_SIZE;
>  }
> 
> @@ -440,6 +441,7 @@ static BlockDriver bdrv_nbd = {
>      .bdrv_file_open             = nbd_open,
>      .bdrv_co_preadv             = nbd_client_co_preadv,
>      .bdrv_co_pwritev            = nbd_client_co_pwritev,
> +    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
>      .bdrv_close                 = nbd_close,
>      .bdrv_co_flush_to_os        = nbd_co_flush,
>      .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
> @@ -458,6 +460,7 @@ static BlockDriver bdrv_nbd_tcp = {
>      .bdrv_file_open             = nbd_open,
>      .bdrv_co_preadv             = nbd_client_co_preadv,
>      .bdrv_co_pwritev            = nbd_client_co_pwritev,
> +    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
>      .bdrv_close                 = nbd_close,
>      .bdrv_co_flush_to_os        = nbd_co_flush,
>      .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
> @@ -476,6 +479,7 @@ static BlockDriver bdrv_nbd_unix = {
>      .bdrv_file_open             = nbd_open,
>      .bdrv_co_preadv             = nbd_client_co_preadv,
>      .bdrv_co_pwritev            = nbd_client_co_pwritev,
> +    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
>      .bdrv_close                 = nbd_close,
>      .bdrv_co_flush_to_os        = nbd_co_flush,
>      .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
> -- 
> 2.5.5
> 
>
Eric Blake July 19, 2016, 3:31 p.m. UTC | #2
On 07/19/2016 12:24 AM, Fam Zheng wrote:
> On Mon, 07/18 22:08, Eric Blake wrote:
>> Upstream NBD protocol recently added the ability to efficiently
>> write zeroes without having to send the zeroes over the wire,
>> along with a flag to control whether the client wants a hole.
>>

>> +int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
>> +                                int count, BdrvRequestFlags flags)
>> +{
>> +    ssize_t ret;
>> +    NbdClientSession *client = nbd_get_client_session(bs);
>> +    struct nbd_request request = {
>> +        .type = NBD_CMD_WRITE_ZEROES,
>> +        .from = offset,
>> +        .len = count,
>> +    };
>> +    struct nbd_reply reply;
>> +
>> +    if (!(client->nbdflags & NBD_FLAG_SEND_WRITE_ZEROES)) {
>> +        return -ENOTSUP;
>> +    }
>> +
>> +    if (flags & BDRV_REQ_FUA) {
>> +        assert(client->nbdflags & NBD_FLAG_SEND_FUA);
>> +        request.flags |= NBD_CMD_FLAG_FUA;
>> +    }
>> +    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
> 
> Correct me if I'm wrong, I don't think we care about BDRV_REQ_MAY_UNMAP here,
> the NBD protocol can never issue an unmap request. In other words I think
> NO_HOLE and MAY_UNMAP are two different things.

No. The server is (and should be) allowed to manage storage as
efficiently as it wants, and should only be required to fully allocate
storage if the client has requested that.  The NBD protocol CAN issue an
unmap request (NBD_CMD_TRIM), but we also document in the NBD protocol
that the server SHOULD be able to unmap instead of writing zeroes so
long as the result still reads as zeroes.  So we WANT to issue MAY_UNMAP
as an optimization in all cases except where the client specifically
asked for full allocation.  NO_HOLE and MAY_UNMAP are (supposed to be)
the same thing, except for being negated in sense based on what the
default value of 0 represents.
diff mbox

Patch

diff --git a/block/nbd-client.h b/block/nbd-client.h
index 044aca4..2cfe377 100644
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -48,6 +48,8 @@  int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int nbd_client_co_flush(BlockDriverState *bs);
 int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
                           uint64_t bytes, QEMUIOVector *qiov, int flags);
+int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+                                int count, BdrvRequestFlags flags);
 int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
                          uint64_t bytes, QEMUIOVector *qiov, int flags);

diff --git a/block/nbd-client.c b/block/nbd-client.c
index 7e9c3ec..104ba2f 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -275,6 +275,41 @@  int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
     return -reply.error;
 }

+int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+                                int count, BdrvRequestFlags flags)
+{
+    ssize_t ret;
+    NbdClientSession *client = nbd_get_client_session(bs);
+    struct nbd_request request = {
+        .type = NBD_CMD_WRITE_ZEROES,
+        .from = offset,
+        .len = count,
+    };
+    struct nbd_reply reply;
+
+    if (!(client->nbdflags & NBD_FLAG_SEND_WRITE_ZEROES)) {
+        return -ENOTSUP;
+    }
+
+    if (flags & BDRV_REQ_FUA) {
+        assert(client->nbdflags & NBD_FLAG_SEND_FUA);
+        request.flags |= NBD_CMD_FLAG_FUA;
+    }
+    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
+        request.flags |= NBD_CMD_FLAG_NO_HOLE;
+    }
+
+    nbd_coroutine_start(client, &request);
+    ret = nbd_co_send_request(bs, &request, NULL);
+    if (ret < 0) {
+        reply.error = -ret;
+    } else {
+        nbd_co_receive_reply(client, &request, &reply, NULL);
+    }
+    nbd_coroutine_end(client, &request);
+    return -reply.error;
+}
+
 int nbd_client_co_flush(BlockDriverState *bs)
 {
     NbdClientSession *client = nbd_get_client_session(bs);
diff --git a/block/nbd.c b/block/nbd.c
index 8d57220..049d1bd 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -357,6 +357,7 @@  static int nbd_co_flush(BlockDriverState *bs)
 static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     bs->bl.max_pdiscard = NBD_MAX_BUFFER_SIZE;
+    bs->bl.max_pwrite_zeroes = NBD_MAX_BUFFER_SIZE;
     bs->bl.max_transfer = NBD_MAX_BUFFER_SIZE;
 }

@@ -440,6 +441,7 @@  static BlockDriver bdrv_nbd = {
     .bdrv_file_open             = nbd_open,
     .bdrv_co_preadv             = nbd_client_co_preadv,
     .bdrv_co_pwritev            = nbd_client_co_pwritev,
+    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
     .bdrv_close                 = nbd_close,
     .bdrv_co_flush_to_os        = nbd_co_flush,
     .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
@@ -458,6 +460,7 @@  static BlockDriver bdrv_nbd_tcp = {
     .bdrv_file_open             = nbd_open,
     .bdrv_co_preadv             = nbd_client_co_preadv,
     .bdrv_co_pwritev            = nbd_client_co_pwritev,
+    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
     .bdrv_close                 = nbd_close,
     .bdrv_co_flush_to_os        = nbd_co_flush,
     .bdrv_co_pdiscard           = nbd_client_co_pdiscard,
@@ -476,6 +479,7 @@  static BlockDriver bdrv_nbd_unix = {
     .bdrv_file_open             = nbd_open,
     .bdrv_co_preadv             = nbd_client_co_preadv,
     .bdrv_co_pwritev            = nbd_client_co_pwritev,
+    .bdrv_co_pwrite_zeroes      = nbd_client_co_pwrite_zeroes,
     .bdrv_close                 = nbd_close,
     .bdrv_co_flush_to_os        = nbd_co_flush,
     .bdrv_co_pdiscard           = nbd_client_co_pdiscard,