diff mbox series

[3/9] nbd: BLOCK_STATUS for standard get_block_status function: server part

Message ID 1518702707-7077-4-git-send-email-vsementsov@virtuozzo.com
State New
Headers show
Series [1/9] nbd/server: add nbd_opt_invalid helper | expand

Commit Message

Vladimir Sementsov-Ogievskiy Feb. 15, 2018, 1:51 p.m. UTC
Minimal realization: only one extent in server answer is supported.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 include/block/nbd.h |  33 ++++++
 nbd/common.c        |  10 ++
 nbd/server.c        | 310 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 352 insertions(+), 1 deletion(-)

Comments

Eric Blake Feb. 15, 2018, 11:02 p.m. UTC | #1
On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
> Minimal realization: only one extent in server answer is supported.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
>   include/block/nbd.h |  33 ++++++
>   nbd/common.c        |  10 ++
>   nbd/server.c        | 310 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>   3 files changed, 352 insertions(+), 1 deletion(-)
> 

> @@ -200,9 +227,15 @@ enum {
>   #define NBD_REPLY_TYPE_NONE          0
>   #define NBD_REPLY_TYPE_OFFSET_DATA   1
>   #define NBD_REPLY_TYPE_OFFSET_HOLE   2
> +#define NBD_REPLY_TYPE_BLOCK_STATUS  5

Stale; see nbd.git commit 56c77720 which changed this to 3.

> +++ b/nbd/server.c
> @@ -82,6 +82,15 @@ struct NBDExport {
>   
>   static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
>   
> +/* NBDExportMetaContexts represents list of selected by
> + * NBD_OPT_SET_META_CONTEXT contexts to be exported. */

represents a list of contexts to be exported, as selected by 
NBD_OPT_SET_META_CONTEXT.

> +typedef struct NBDExportMetaContexts {
> +    char export_name[NBD_MAX_NAME_SIZE + 1];

Would this work as const char * pointing at some other storage, instead 
of having to copy into this storage?

> +    bool valid; /* means that negotiation of the option finished without
> +                   errors */
> +    bool base_allocation; /* export base:allocation context (block status) */
> +} NBDExportMetaContexts;
> +
>   struct NBDClient {

> @@ -636,6 +646,201 @@ static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
>       return QIO_CHANNEL(tioc);
>   }
>   
> +/* nbd_alloc_read_size_string
> + *
> + * Read string in format
> + *   uint32_t len
> + *   len bytes string (not 0-terminated)
> + * String is allocated and pointer returned as @buf
> + *
> + * Return -errno on I/O error, 0 if option was completely handled by
> + * sending a reply about inconsistent lengths, or 1 on success. */
> +static int nbd_alloc_read_size_string(NBDClient *client, char **buf,
> +                                      Error **errp)
> +{
> +    int ret;
> +    uint32_t len;
> +
> +    ret = nbd_opt_read(client, &len, sizeof(len), errp);
> +    if (ret <= 0) {
> +        return ret;
> +    }
> +    cpu_to_be32s(&len);
> +
> +    *buf = g_try_malloc(len + 1);

I'd rather check that len is sane prior to trying to malloc.  Otherwise, 
a malicious client can convince us to waste time/space doing a large 
malloc before we finally realize that we can't read that many bytes 
after all.  And in fact, if you check len in advance, you can probably 
just use g_malloc() instead of g_try_malloc() (g_try_malloc() makes 
sense on a 1M allocation, where we can still allocate smaller stuff in 
reporting the error; but since NBD limits strings to 4k, if we fail at 
allocating 4k, we are probably already so hosed that our attempts to 
report the failure will also run out of memory and abort, so why not 
just abort now).

> +    if (*buf == NULL) {
> +        error_setg(errp, "No memory");
> +        return -ENOMEM;
> +    }
> +    (*buf)[len] = '\0';
> +
> +    ret = nbd_opt_read(client, *buf, len, errp);
> +    if (ret <= 0) {
> +        g_free(*buf);
> +        *buf = NULL;
> +    }
> +
> +    return ret;
> +}
> +
> +/* nbd_read_size_string

Will resume review here...
Vladimir Sementsov-Ogievskiy Feb. 16, 2018, 11:09 a.m. UTC | #2
16.02.2018 02:02, Eric Blake wrote:
> On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
>> Minimal realization: only one extent in server answer is supported.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>   include/block/nbd.h |  33 ++++++
>>   nbd/common.c        |  10 ++
>>   nbd/server.c        | 310 
>> +++++++++++++++++++++++++++++++++++++++++++++++++++-
>>   3 files changed, 352 insertions(+), 1 deletion(-)
>>
>
>> @@ -200,9 +227,15 @@ enum {
>>   #define NBD_REPLY_TYPE_NONE          0
>>   #define NBD_REPLY_TYPE_OFFSET_DATA   1
>>   #define NBD_REPLY_TYPE_OFFSET_HOLE   2
>> +#define NBD_REPLY_TYPE_BLOCK_STATUS  5
>
> Stale; see nbd.git commit 56c77720 which changed this to 3.

Very unpleasant surprise. I understand that this is still experimental 
extension, but actually we use it =5 in production about one year. Can 
we revert it to 5?

>
>> +++ b/nbd/server.c
>> @@ -82,6 +82,15 @@ struct NBDExport {
>>     static QTAILQ_HEAD(, NBDExport) exports = 
>> QTAILQ_HEAD_INITIALIZER(exports);
>>   +/* NBDExportMetaContexts represents list of selected by
>> + * NBD_OPT_SET_META_CONTEXT contexts to be exported. */
>
> represents a list of contexts to be exported, as selected by 
> NBD_OPT_SET_META_CONTEXT.
>
>> +typedef struct NBDExportMetaContexts {
>> +    char export_name[NBD_MAX_NAME_SIZE + 1];
>
> Would this work as const char * pointing at some other storage, 
> instead of having to copy into this storage?

I'll think about

>
>> +    bool valid; /* means that negotiation of the option finished 
>> without
>> +                   errors */
>> +    bool base_allocation; /* export base:allocation context (block 
>> status) */
>> +} NBDExportMetaContexts;
>> +
>>   struct NBDClient {
>
>> @@ -636,6 +646,201 @@ static QIOChannel 
>> *nbd_negotiate_handle_starttls(NBDClient *client,
>>       return QIO_CHANNEL(tioc);
>>   }
>>   +/* nbd_alloc_read_size_string
>> + *
>> + * Read string in format
>> + *   uint32_t len
>> + *   len bytes string (not 0-terminated)
>> + * String is allocated and pointer returned as @buf
>> + *
>> + * Return -errno on I/O error, 0 if option was completely handled by
>> + * sending a reply about inconsistent lengths, or 1 on success. */
>> +static int nbd_alloc_read_size_string(NBDClient *client, char **buf,
>> +                                      Error **errp)
>> +{
>> +    int ret;
>> +    uint32_t len;
>> +
>> +    ret = nbd_opt_read(client, &len, sizeof(len), errp);
>> +    if (ret <= 0) {
>> +        return ret;
>> +    }
>> +    cpu_to_be32s(&len);
>> +
>> +    *buf = g_try_malloc(len + 1);
>
> I'd rather check that len is sane prior to trying to malloc. 
> Otherwise, a malicious client can convince us to waste time/space 
> doing a large malloc before we finally realize that we can't read that 
> many bytes after all.  And in fact, if you check len in advance, you 
> can probably just use g_malloc() instead of g_try_malloc() 
> (g_try_malloc() makes sense on a 1M allocation, where we can still 
> allocate smaller stuff in reporting the error; but since NBD limits 
> strings to 4k, if we fail at allocating 4k, we are probably already so 
> hosed that our attempts to report the failure will also run out of 
> memory and abort, so why not just abort now).

reasonable, will do

>
>> +    if (*buf == NULL) {
>> +        error_setg(errp, "No memory");
>> +        return -ENOMEM;
>> +    }
>> +    (*buf)[len] = '\0';
>> +
>> +    ret = nbd_opt_read(client, *buf, len, errp);
>> +    if (ret <= 0) {
>> +        g_free(*buf);
>> +        *buf = NULL;
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +/* nbd_read_size_string
>
> Will resume review here...
>
Eric Blake Feb. 16, 2018, 11:45 a.m. UTC | #3
On 02/16/2018 05:09 AM, Vladimir Sementsov-Ogievskiy wrote:
> 16.02.2018 02:02, Eric Blake wrote:
>> On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> Minimal realization: only one extent in server answer is supported.
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>> ---
>>>   include/block/nbd.h |  33 ++++++
>>>   nbd/common.c        |  10 ++
>>>   nbd/server.c        | 310 
>>> +++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>   3 files changed, 352 insertions(+), 1 deletion(-)
>>>
>>
>>> @@ -200,9 +227,15 @@ enum {
>>>   #define NBD_REPLY_TYPE_NONE          0
>>>   #define NBD_REPLY_TYPE_OFFSET_DATA   1
>>>   #define NBD_REPLY_TYPE_OFFSET_HOLE   2
>>> +#define NBD_REPLY_TYPE_BLOCK_STATUS  5
>>
>> Stale; see nbd.git commit 56c77720 which changed this to 3.
> 
> Very unpleasant surprise. I understand that this is still experimental 
> extension, but actually we use it =5 in production about one year. Can 
> we revert it to 5?

For that, you'll have to ask on the upstream NBD list.  There may be 
other things that turn out to need tweaking as I get through the rest of 
your initial implementation, vs. what works well for interoperability, 
so having a distinct number for experimental vs. actual may be 
worthwhile for other reasons as well, even if it requires some glue code 
on your side to handle an older server sending 5 instead of 3 to a newer 
client.
Eric Blake Feb. 16, 2018, 1:21 p.m. UTC | #4
On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
> Minimal realization: only one extent in server answer is supported.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---

continuing where I left off,


> +++ b/nbd/common.c
> @@ -75,6 +75,10 @@ const char *nbd_opt_lookup(uint32_t opt)
>           return "go";
>       case NBD_OPT_STRUCTURED_REPLY:
>           return "structured reply";
> +    case NBD_OPT_LIST_META_CONTEXT:
> +        return "list meta context";
> +    case NBD_OPT_SET_META_CONTEXT:
> +        return "set meta context";
>       default:

Should the changes to the header for new macros and to common.c for 
mapping bits to names be split into a separate patch, so that someone 
could backport just the new constants and then the client-side 
implementation, rather than being forced to backport the rest of the 
server implementation at the same time?

> +
> +/* nbd_read_size_string
> + *
> + * Read string in format
> + *   uint32_t len
> + *   len bytes string (not 0-terminated)
> + *
> + * @buf should be enough to store @max_len+1
> + *
> + * Return -errno on I/O error, 0 if option was completely handled by
> + * sending a reply about inconsistent lengths, or 1 on success. */
> +static int nbd_read_size_string(NBDClient *client, char *buf,
> +                                uint32_t max_len, Error **errp)

Would existing code benefit from using this helper?  If so, splitting it 
into a separate patch, plus converting initial clients to use it, would 
be worthwhile.


> +
> +static int nbd_negotiate_send_meta_context(NBDClient *client,
> +                                           const char *context,
> +                                           uint32_t context_id,
> +                                           Error **errp)

No comment documenting this function?

> +{
> +    NBDOptionReplyMetaContext opt;
> +    struct iovec iov[] = {
> +        {.iov_base = &opt, .iov_len = sizeof(opt)},
> +        {.iov_base = (void *)context, .iov_len = strlen(context)}

Casting to void* looks suspicious, but I see that it is casting away 
const.  Okay.

> +    };
> +
> +    set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
> +                      sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
> +    stl_be_p(&opt.context_id, context_id);
> +
> +    return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? -EIO : 0;
> +}
> +
> +static void nbd_meta_base_query(NBDExportMetaContexts *meta, const char *query)
> +{

Again, function comments are useful.

> +    if (query[0] == '\0' || strcmp(query, "allocation") == 0) {
> +        /* Note: empty query should select all contexts within base
> +         * namespace. */
> +        meta->base_allocation = true;

 From the client perspective, this handling of the empty leaf-name works 
well for NBD_OPT_LIST_META_CONTEXT (I want to see what leaf nodes the 
server supports), but not so well for NBD_OPT_SET_META_CONTEXT (asking 
the server to send ALL base allocations, even when I don't necessarily 
know how to interpret anything other than base:allocation, is a waste). 
So this function needs a bool parameter that says whether it is being 
invoked from _LIST (empty string okay, to advertise ALL base leaf names 
back to client, which for now is just base:allocation) or from _SET 
(empty string is ignored as invalid; client has to specifically ask for 
base:allocation by name).

> +    }
> +}
> +
> +/* nbd_negotiate_meta_query
> + * Return -errno on I/O error, 0 if option was completely handled by
> + * sending a reply about inconsistent lengths, or 1 on success. */
> +static int nbd_negotiate_meta_query(NBDClient *client,
> +                                    NBDExportMetaContexts *meta, Error **errp)
> +{
> +    int ret;
> +    char *query, *colon, *namespace, *subquery;

Is it worth stack-allocating query here, so you don't have to g_free() 
it later?  NBD limits the maximum string to 4k, which is a little bit 
big for stack allocation (on an operating system with 4k pages, 
allocating more than 4k on the stack in one function risks missing the 
guard page on stack overflow), but we also have the benefit that we KNOW 
that the set of meta-context namespaces that we support have a much 
smaller maximum limit of what we even care about.

> +
> +    ret = nbd_alloc_read_size_string(client, &query, errp);
> +    if (ret <= 0) {
> +        return ret;
> +    }
> +
> +    colon = strchr(query, ':');
> +    if (colon == NULL) {
> +        ret = nbd_opt_invalid(client, errp, "no colon in query");
> +        goto out;

Hmm, that puts a slight wrinkle into my proposal, or else maybe it is 
something I should bring up on the NBD list.  If we only read 5 
characters (because the max namespace WE support is "base:"), but a 
client asks for namespace "X-longname:", we should gracefully ignore the 
client's request; while we still want to reply with an error to a client 
that asks for "garbage" with no colon at all.  The question for the NBD 
spec, then, is whether detecting bad client requests that didn't use 
colon is mandatory for the server (meaning we MUST read the entire 
namespace request, and search for the colon) or merely best effort (we 
only have to read 5 characters, and if we silently ignore instead of 
diagnose a bad namespace request that was longer than that, oh well). 
Worded from the client, it switches to a question of whether the client 
should expect the server to diagnose all requests, or must be prepared 
for the server to ignore requests even where those requests are bogus. 
Or, the NBD spec may change slightly to pass namespace and leafname as 
separate fields, both with lengths, rather than a colon, to make it 
easier for the server to skip over an unknown namespace/leaf pair 
without having to parse whether a colon was present.  I'll send that in 
a separate email (the upstream NBD list doesn't need to see all my 
review comments on this thread).

> +    }
> +    *colon = '\0';
> +    namespace = query;
> +    subquery = colon + 1;
> +
> +    if (strcmp(namespace, "base") == 0) {
> +        nbd_meta_base_query(meta, subquery);
> +    }
> +
> +out:
> +    g_free(query);
> +    return ret;
> +}
> +
> +/* nbd_negotiate_meta_queries
> + * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
> + *
> + * Return -errno on I/O error, 0 if option was completely handled by
> + * sending a reply about inconsistent lengths, or 1 on success. */
> +static int nbd_negotiate_meta_queries(NBDClient *client,
> +                                      NBDExportMetaContexts *meta, Error **errp)
> +{

The two options can mostly share code, but see my earlier comments about 
how I think you need to distinguish between list ('base:' is a valid 
query) and set ('base:' is an invalid request).

> +    int ret;
> +    NBDExport *exp;
> +    NBDExportMetaContexts local_meta;
> +    uint32_t nb_queries;
> +    int i;
> +
> +    assert(client->structured_reply);

Hmm.  Asserting here is bad if the client can get here without 
negotiating structured reply (I'll have to see if you checked that in 
the caller... [1])

> +
> +    if (meta == NULL) {

Bikeshedding - I like writing 'if (!meta) {' as it is fewer characters.

The function documentation should mention that 'meta' may be NULL from 
the caller.  But (without seeing the callers yet), why?  Are you arguing 
that _LIST uses local_meta (because it only needs something for the 
duration of the reply) while _SET supplies a pre-existing meta (that is 
associated with the server state, and used when the client finally calls 
NBD_OPT_GO)?

> +        meta = &local_meta;
> +    }
> +
> +    memset(meta, 0, sizeof(*meta));
> +
> +    ret = nbd_read_size_string(client, meta->export_name,
> +                               NBD_MAX_NAME_SIZE, errp);

Revisiting a question I raised in my first half review - you saved the 
name as part of the struct because we have to later compare that the 
final OPT_SET export name matches the request during OPT_GO (if they 
don't match, then we have no contexts to serve after all).  So a 'const 
char *' won't work, but maybe the struct could use a 'char *' pointing 
to malloc'd storage rather than char[MAX_NAME] that reserves array space 
that is mostly unused for the typical name that is much shorter than the 
maximum name length.

> +    if (ret <= 0) {
> +        return ret;
> +    }
> +
> +    exp = nbd_export_find(meta->export_name);
> +    if (exp == NULL) {
> +        return nbd_opt_invalid(client, errp,
> +                               "export '%s' not present", meta->export_name);

Wrong error; I think NBD_REP_ERR_UNKNOWN is better here.

> +    }
> +
> +    ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), errp);
> +    if (ret <= 0) {
> +        return ret;
> +    }
> +    cpu_to_be32s(&nb_queries);
> +
> +    for (i = 0; i < nb_queries; ++i) {
> +        ret = nbd_negotiate_meta_query(client, meta, errp);
> +        if (ret <= 0) {
> +            return ret;
> +        }
> +    }

Interesting choice - you parse all queries prior to preparing any 
response.  I guess you could also reply as you read, but it doesn't 
change the fact that you have to remember what the final _SET selected 
for use later on.  So this works.

> +
> +    if (meta->base_allocation) {
> +        ret = nbd_negotiate_send_meta_context(client, "base:allocation",
> +                                              NBD_META_ID_BASE_ALLOCATION,
> +                                              errp);

Since earlier you have #define NBD_META_ID_BASE_ALLOCATION 0, that means 
you use the context ID of 0 in response to both _LIST (where the spec 
says the server SHOULD send 0, but the client SHOULD ignore the id 
field) and for _SET (where the client needs a unique ID for every 
separate context that actually got selected - but since we only select a 
single context, we get away with it).  I suspect later patches will add 
additional contexts where you'll have to actually worry about context 
ids (and always sending 0 in response to _LIST), but for now, this works :)

> +        if (ret < 0) {
> +            return ret;
> +        }
> +    }
> +
> +    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
> +    if (ret == 0) {
> +        meta->valid = true;
> +    }

Interesting - meta->valid can be true even if 0 contexts were selected; 
I presume we use this later on to decide whether to reply with EINVAL 
vs. an empty list if a client uses NBD_CMD_BLOCK_STATUS without a valid 
_SET call.  Then again, the NBD spec currently states that 
NBD_CMD_BLOCK_STATUS should not be used even if _SET succeeded, if the 
_SET didn't return at least one context.

> +
> +    return ret;
> +}
> +
>   /* nbd_negotiate_options
>    * Process all NBD_OPT_* client option commands, during fixed newstyle
>    * negotiation.
> @@ -826,6 +1031,22 @@ static int nbd_negotiate_options(NBDClient *client, uint16_t myflags,
>                   }
>                   break;
>   
> +            case NBD_OPT_LIST_META_CONTEXT:
> +            case NBD_OPT_SET_META_CONTEXT:
> +                if (!client->structured_reply) {
> +                    ret = nbd_opt_invalid(
> +                            client, errp,
> +                            "request option '%s' when structured reply "
> +                            "is not negotiated", nbd_opt_lookup(option));

[1] okay, you did filter out clients that request out of order.

> +                } else if (option == NBD_OPT_LIST_META_CONTEXT) {
> +                    ret = nbd_negotiate_meta_queries(client, NULL, errp);
> +                } else {
> +                    ret = nbd_negotiate_meta_queries(client,
> +                                                     &client->export_meta,
> +                                                     errp);

Okay, so you DO have a difference on whether you pass in 'meta' based on 
whether it is a local response vs. setting things for later.

> +                }
> +                break;
> +
>               default:
>                   ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
>                                      "Unsupported option 0x%" PRIx32 " (%s)",
> @@ -1446,6 +1667,78 @@ static int coroutine_fn nbd_co_send_structured_error(NBDClient *client,
>       return nbd_co_send_iov(client, iov, 1 + !!iov[1].iov_len, errp);
>   }
>   
> +static int blockstatus_to_extent_be(BlockDriverState *bs, uint64_t offset,
> +                                    uint64_t bytes, NBDExtent *extent)
> +{
> +    uint64_t tail_bytes = bytes;
> +
> +    while (tail_bytes) {

I might have named this 'remaining'

> +        uint32_t flags;
> +        int64_t num;
> +        int ret = bdrv_block_status_above(bs, NULL, offset, tail_bytes, &num,
> +                                          NULL, NULL);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        flags = (ret & BDRV_BLOCK_ALLOCATED ? 0 : NBD_STATE_HOLE) |
> +                (ret & BDRV_BLOCK_ZERO      ? NBD_STATE_ZERO : 0);

Looks like the correct bit mapping to me.

> +
> +        if (tail_bytes == bytes) {
> +            extent->flags = flags;
> +        }
> +
> +        if (flags != extent->flags) {
> +            break;
> +        }
> +
> +        offset += num;
> +        tail_bytes -= num;
> +    }
> +
> +    cpu_to_be32s(&extent->flags);
> +    extent->length = cpu_to_be32(bytes - tail_bytes);

This only computes one extent, but tries to find the longest extent 
possible (if two consecutive bdrv_block_status calls return the same 
status, perhaps because of fragmentation).  Presumably this is called in 
a loop to find a full list of extents covering the client's entire 
request? [2]

> +
> +    return 0;
> +}
> +
> +/* nbd_co_send_extents
> + * @extents should be in big-endian */
> +static int nbd_co_send_extents(NBDClient *client, uint64_t handle,
> +                               NBDExtent *extents, unsigned nb_extents,
> +                               uint32_t context_id, Error **errp)
> +{
> +    NBDStructuredMeta chunk;
> +
> +    struct iovec iov[] = {
> +        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
> +        {.iov_base = extents, .iov_len = nb_extents * sizeof(extents[0])}
> +    };
> +
> +    set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_BLOCK_STATUS,
> +                 handle, sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);

This says you can only send a single chunk, which matches the fact that 
right now you support exactly one context type.  Presumably later 
patches tweak this.

> +    stl_be_p(&chunk.context_id, context_id);
> +
> +    return nbd_co_send_iov(client, iov, 2, errp);
> +}
> +
> +static int nbd_co_send_block_status(NBDClient *client, uint64_t handle,
> +                                    BlockDriverState *bs, uint64_t offset,
> +                                    uint64_t length, uint32_t context_id,
> +                                    Error **errp)

No function comment?

> +{
> +    int ret;
> +    NBDExtent extent;
> +
> +    ret = blockstatus_to_extent_be(bs, offset, length, &extent);
> +    if (ret < 0) {
> +        return nbd_co_send_structured_error(
> +                client, handle, -ret, "can't get block status", errp);
> +    }
> +
> +    return nbd_co_send_extents(client, handle, &extent, 1, context_id, errp);

[2] Okay, no looping to answer the client's entire request, so that may 
be something you add in a later patch, to keep this initial patch 
minimal.  But at least it matches your commit message.

> +}
> +
>   /* nbd_co_receive_request
>    * Collect a client request. Return 0 if request looks valid, -EIO to drop
>    * connection right away, and any other negative value to report an error to
> @@ -1523,6 +1816,8 @@ static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
>           valid_flags |= NBD_CMD_FLAG_DF;
>       } else if (request->type == NBD_CMD_WRITE_ZEROES) {
>           valid_flags |= NBD_CMD_FLAG_NO_HOLE;
> +    } else if (request->type == NBD_CMD_BLOCK_STATUS) {
> +        valid_flags |= NBD_CMD_FLAG_REQ_ONE;
>       }
>       if (request->flags & ~valid_flags) {
>           error_setg(errp, "unsupported flags for command %s (got 0x%x)",
> @@ -1650,6 +1945,19 @@ static coroutine_fn void nbd_trip(void *opaque)
>           }
>   
>           break;
> +    case NBD_CMD_BLOCK_STATUS:
> +        if (client->export_meta.base_allocation) {
> +            ret = nbd_co_send_block_status(req->client, request.handle,
> +                                           blk_bs(exp->blk), request.from,
> +                                           request.len,
> +                                           NBD_META_ID_BASE_ALLOCATION,
> +                                           &local_err);

No check of client->export_meta.valid?

> +        } else {
> +            ret = -EINVAL;
> +            error_setg(&local_err, "CMD_BLOCK_STATUS not negotiated");
> +        }

At this point, we've either sent the final structured chunk, or we've 
sent nothing and have ret < 0;...

> +
> +        break;
>       default:
>           error_setg(&local_err, "invalid request type (%" PRIu32 ") received",
>                      request.type);
> @@ -1680,7 +1988,7 @@ reply:

     if (client->structured_reply &&
         (ret < 0 || request.type == NBD_CMD_READ)) {

>               ret = nbd_co_send_structured_done(req->client, request.handle,
>                                                 &local_err);
>           }

...iIf the client negotiated structured reply, then we've sent the error 
message.  But if the client did NOT negotiate structured reply, then...

> -    } else {
> +    } else if (request.type != NBD_CMD_BLOCK_STATUS) {

...we fail to reply at all.  Oops.  That's not good for 
interoperability.  You'll need to make sure that we reply with EINVAL 
even if the client (mistakenly) calls NBD_CMD_BLOCK_STATUS without 
negotiating structured replies.

>           ret = nbd_co_send_simple_reply(req->client, request.handle,
>                                          ret < 0 ? -ret : 0,
>                                          req->data, reply_data_len, &local_err);
> 

Also missing from the patch - where do you validate that the export name 
in client->export_meta matches the export name in NBD_OPT_GO?  If they 
don't match, then export_meta should be discarded.
Vladimir Sementsov-Ogievskiy Feb. 16, 2018, 2:43 p.m. UTC | #5
16.02.2018 16:21, Eric Blake wrote:
> On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
>> Minimal realization: only one extent in server answer is supported.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>
> continuing where I left off,
>
>
>> +++ b/nbd/common.c
>> @@ -75,6 +75,10 @@ const char *nbd_opt_lookup(uint32_t opt)
>>           return "go";
>>       case NBD_OPT_STRUCTURED_REPLY:
>>           return "structured reply";
>> +    case NBD_OPT_LIST_META_CONTEXT:
>> +        return "list meta context";
>> +    case NBD_OPT_SET_META_CONTEXT:
>> +        return "set meta context";
>>       default:
>
> Should the changes to the header for new macros and to common.c for 
> mapping bits to names be split into a separate patch, so that someone 
> could backport just the new constants and then the client-side 
> implementation, rather than being forced to backport the rest of the 
> server implementation at the same time?

Not a problem, I can split. I've thought about it, but decided for first 
roll send it as one patch.

>
>> +
>> +/* nbd_read_size_string
>> + *
>> + * Read string in format
>> + *   uint32_t len
>> + *   len bytes string (not 0-terminated)
>> + *
>> + * @buf should be enough to store @max_len+1
>> + *
>> + * Return -errno on I/O error, 0 if option was completely handled by
>> + * sending a reply about inconsistent lengths, or 1 on success. */
>> +static int nbd_read_size_string(NBDClient *client, char *buf,
>> +                                uint32_t max_len, Error **errp)
>
> Would existing code benefit from using this helper?  If so, splitting 
> it into a separate patch, plus converting initial clients to use it, 
> would be worthwhile.
>
>
>> +
>> +static int nbd_negotiate_send_meta_context(NBDClient *client,
>> +                                           const char *context,
>> +                                           uint32_t context_id,
>> +                                           Error **errp)
>
> No comment documenting this function?
>
>> +{
>> +    NBDOptionReplyMetaContext opt;
>> +    struct iovec iov[] = {
>> +        {.iov_base = &opt, .iov_len = sizeof(opt)},
>> +        {.iov_base = (void *)context, .iov_len = strlen(context)}
>
> Casting to void* looks suspicious, but I see that it is casting away 
> const.  Okay.
>
>> +    };
>> +
>> +    set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
>> +                      sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
>> +    stl_be_p(&opt.context_id, context_id);
>> +
>> +    return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? 
>> -EIO : 0;
>> +}
>> +
>> +static void nbd_meta_base_query(NBDExportMetaContexts *meta, const 
>> char *query)
>> +{
>
> Again, function comments are useful.
>
>> +    if (query[0] == '\0' || strcmp(query, "allocation") == 0) {
>> +        /* Note: empty query should select all contexts within base
>> +         * namespace. */
>> +        meta->base_allocation = true;
>
> From the client perspective, this handling of the empty leaf-name 
> works well for NBD_OPT_LIST_META_CONTEXT (I want to see what leaf 
> nodes the server supports), but not so well for 
> NBD_OPT_SET_META_CONTEXT (asking the server to send ALL base 
> allocations, even when I don't necessarily know how to interpret 
> anything other than base:allocation, is a waste). So this function 
> needs a bool parameter that says whether it is being invoked from 
> _LIST (empty string okay, to advertise ALL base leaf names back to 
> client, which for now is just base:allocation) or from _SET (empty 
> string is ignored as invalid; client has to specifically ask for 
> base:allocation by name).

"empty string is ignored as invalid", hm, do we have this in spec? I 
think SET and LIST must select exactly same sets of contexts. It is 
strange behavior of client to set "base:", but it is its decision. And I 
don't thing that it is invalid.
Formally we may answer with NBD_REP_ERR_TOO_BIG, but it will look weird, 
as client see that both base: and base:allocation returns _one_  
context, but in one case it is too big. But if we will have several 
base: contextes, server may fairly answer with NBD_REP_ERR_TOO_BIG.

So, I think for now the code is ok.

Also, I don't see NBD_REP_ERR_TOO_BIG possible reply in 
NBD_OPT_LIST_META_CONTEXT description. Should it be here?

>
>> +    }
>> +}
>> +
>> +/* nbd_negotiate_meta_query
>> + * Return -errno on I/O error, 0 if option was completely handled by
>> + * sending a reply about inconsistent lengths, or 1 on success. */
>> +static int nbd_negotiate_meta_query(NBDClient *client,
>> +                                    NBDExportMetaContexts *meta, 
>> Error **errp)
>> +{
>> +    int ret;
>> +    char *query, *colon, *namespace, *subquery;
>
> Is it worth stack-allocating query here, so you don't have to g_free() 
> it later?  NBD limits the maximum string to 4k, which is a little bit 
> big for stack allocation (on an operating system with 4k pages, 
> allocating more than 4k on the stack in one function risks missing the 
> guard page on stack overflow), but we also have the benefit that we 
> KNOW that the set of meta-context namespaces that we support have a 
> much smaller maximum limit of what we even care about.

it is not stack allocated, nbd_alloc_read_size_string calls g_malloc.

>
>> +
>> +    ret = nbd_alloc_read_size_string(client, &query, errp);
>> +    if (ret <= 0) {
>> +        return ret;
>> +    }
>> +
>> +    colon = strchr(query, ':');
>> +    if (colon == NULL) {
>> +        ret = nbd_opt_invalid(client, errp, "no colon in query");
>> +        goto out;
>
> Hmm, that puts a slight wrinkle into my proposal, or else maybe it is 
> something I should bring up on the NBD list.  If we only read 5 
> characters (because the max namespace WE support is "base:"), but a 
> client asks for namespace "X-longname:", we should gracefully ignore 
> the client's request; while we still want to reply with an error to a 
> client that asks for "garbage" with no colon at all. The question for 
> the NBD spec, then, is whether detecting bad client requests that 
> didn't use colon is mandatory for the server (meaning we MUST read the 
> entire namespace request, and search for the colon) or merely best 
> effort (we only have to read 5 characters, and if we silently ignore 
> instead of diagnose a bad namespace request that was longer than that, 
> oh well). Worded from the client, it switches to a question of whether 
> the client should expect the server to diagnose all requests, or must 
> be prepared for the server to ignore requests even where those 
> requests are bogus. Or, the NBD spec may change slightly to pass 
> namespace and leafname as separate fields, both with lengths, rather 
> than a colon, to make it easier for the server to skip over an unknown 
> namespace/leaf pair without having to parse whether a colon was 
> present.  I'll send that in a separate email (the upstream NBD list 
> doesn't need to see all my review comments on this thread).
>
>> +    }
>> +    *colon = '\0';
>> +    namespace = query;
>> +    subquery = colon + 1;
>> +
>> +    if (strcmp(namespace, "base") == 0) {
>> +        nbd_meta_base_query(meta, subquery);
>> +    }
>> +
>> +out:
>> +    g_free(query);
>> +    return ret;
>> +}
>> +
>> +/* nbd_negotiate_meta_queries
>> + * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
>> + *
>> + * Return -errno on I/O error, 0 if option was completely handled by
>> + * sending a reply about inconsistent lengths, or 1 on success. */
>> +static int nbd_negotiate_meta_queries(NBDClient *client,
>> +                                      NBDExportMetaContexts *meta, 
>> Error **errp)
>> +{
>
> The two options can mostly share code, but see my earlier comments 
> about how I think you need to distinguish between list ('base:' is a 
> valid query) and set ('base:' is an invalid request).
>
>> +    int ret;
>> +    NBDExport *exp;
>> +    NBDExportMetaContexts local_meta;
>> +    uint32_t nb_queries;
>> +    int i;
>> +
>> +    assert(client->structured_reply);
>
> Hmm.  Asserting here is bad if the client can get here without 
> negotiating structured reply (I'll have to see if you checked that in 
> the caller... [1])
>
>> +
>> +    if (meta == NULL) {
>
> Bikeshedding - I like writing 'if (!meta) {' as it is fewer characters.
>
> The function documentation should mention that 'meta' may be NULL from 
> the caller.  But (without seeing the callers yet), why?  Are you 
> arguing that _LIST uses local_meta (because it only needs something 
> for the duration of the reply) while _SET supplies a pre-existing meta 
> (that is associated with the server state, and used when the client 
> finally calls NBD_OPT_GO)?

yes.

>
>> +        meta = &local_meta;
>> +    }
>> +
>> +    memset(meta, 0, sizeof(*meta));
>> +
>> +    ret = nbd_read_size_string(client, meta->export_name,
>> +                               NBD_MAX_NAME_SIZE, errp);
>
> Revisiting a question I raised in my first half review - you saved the 
> name as part of the struct because we have to later compare that the 
> final OPT_SET export name matches the request during OPT_GO (if they 
> don't match, then we have no contexts to serve after all).  So a 
> 'const char *' won't work, but maybe the struct could use a 'char *' 
> pointing to malloc'd storage rather than char[MAX_NAME] that reserves 
> array space that is mostly unused for the typical name that is much 
> shorter than the maximum name length.
>
>> +    if (ret <= 0) {
>> +        return ret;
>> +    }
>> +
>> +    exp = nbd_export_find(meta->export_name);
>> +    if (exp == NULL) {
>> +        return nbd_opt_invalid(client, errp,
>> +                               "export '%s' not present", 
>> meta->export_name);
>
> Wrong error; I think NBD_REP_ERR_UNKNOWN is better here.
>
>> +    }
>> +
>> +    ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), errp);
>> +    if (ret <= 0) {
>> +        return ret;
>> +    }
>> +    cpu_to_be32s(&nb_queries);
>> +
>> +    for (i = 0; i < nb_queries; ++i) {
>> +        ret = nbd_negotiate_meta_query(client, meta, errp);
>> +        if (ret <= 0) {
>> +            return ret;
>> +        }
>> +    }
>
> Interesting choice - you parse all queries prior to preparing any 
> response.  I guess you could also reply as you read, but it doesn't 
> change the fact that you have to remember what the final _SET selected 
> for use later on.  So this works.
>
>> +
>> +    if (meta->base_allocation) {
>> +        ret = nbd_negotiate_send_meta_context(client, 
>> "base:allocation",
>> + NBD_META_ID_BASE_ALLOCATION,
>> +                                              errp);
>
> Since earlier you have #define NBD_META_ID_BASE_ALLOCATION 0, that 
> means you use the context ID of 0 in response to both _LIST (where the 
> spec says the server SHOULD send 0, but the client SHOULD ignore the 
> id field) and for _SET (where the client needs a unique ID for every 
> separate context that actually got selected - but since we only select 
> a single context, we get away with it).  I suspect later patches will 
> add additional contexts where you'll have to actually worry about 
> context ids (and always sending 0 in response to _LIST), but for now, 
> this works :)
>
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +    }
>> +
>> +    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
>> +    if (ret == 0) {
>> +        meta->valid = true;
>> +    }
>
> Interesting - meta->valid can be true even if 0 contexts were 
> selected; I presume we use this later on to decide whether to reply 
> with EINVAL vs. an empty list if a client uses NBD_CMD_BLOCK_STATUS 
> without a valid _SET call.  Then again, the NBD spec currently states 
> that NBD_CMD_BLOCK_STATUS should not be used even if _SET succeeded, 
> if the _SET didn't return at least one context.
>
>> +
>> +    return ret;
>> +}
>> +
>>   /* nbd_negotiate_options
>>    * Process all NBD_OPT_* client option commands, during fixed newstyle
>>    * negotiation.
>> @@ -826,6 +1031,22 @@ static int nbd_negotiate_options(NBDClient 
>> *client, uint16_t myflags,
>>                   }
>>                   break;
>>   +            case NBD_OPT_LIST_META_CONTEXT:
>> +            case NBD_OPT_SET_META_CONTEXT:
>> +                if (!client->structured_reply) {
>> +                    ret = nbd_opt_invalid(
>> +                            client, errp,
>> +                            "request option '%s' when structured 
>> reply "
>> +                            "is not negotiated", 
>> nbd_opt_lookup(option));
>
> [1] okay, you did filter out clients that request out of order.
>
>> +                } else if (option == NBD_OPT_LIST_META_CONTEXT) {
>> +                    ret = nbd_negotiate_meta_queries(client, NULL, 
>> errp);
>> +                } else {
>> +                    ret = nbd_negotiate_meta_queries(client,
>> + &client->export_meta,
>> +                                                     errp);
>
> Okay, so you DO have a difference on whether you pass in 'meta' based 
> on whether it is a local response vs. setting things for later.
>
>> +                }
>> +                break;
>> +
>>               default:
>>                   ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
>>                                      "Unsupported option 0x%" PRIx32 
>> " (%s)",
>> @@ -1446,6 +1667,78 @@ static int coroutine_fn 
>> nbd_co_send_structured_error(NBDClient *client,
>>       return nbd_co_send_iov(client, iov, 1 + !!iov[1].iov_len, errp);
>>   }
>>   +static int blockstatus_to_extent_be(BlockDriverState *bs, uint64_t 
>> offset,
>> +                                    uint64_t bytes, NBDExtent *extent)
>> +{
>> +    uint64_t tail_bytes = bytes;
>> +
>> +    while (tail_bytes) {
>
> I might have named this 'remaining'
>
>> +        uint32_t flags;
>> +        int64_t num;
>> +        int ret = bdrv_block_status_above(bs, NULL, offset, 
>> tail_bytes, &num,
>> +                                          NULL, NULL);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +
>> +        flags = (ret & BDRV_BLOCK_ALLOCATED ? 0 : NBD_STATE_HOLE) |
>> +                (ret & BDRV_BLOCK_ZERO      ? NBD_STATE_ZERO : 0);
>
> Looks like the correct bit mapping to me.
>
>> +
>> +        if (tail_bytes == bytes) {
>> +            extent->flags = flags;
>> +        }
>> +
>> +        if (flags != extent->flags) {
>> +            break;
>> +        }
>> +
>> +        offset += num;
>> +        tail_bytes -= num;
>> +    }
>> +
>> +    cpu_to_be32s(&extent->flags);
>> +    extent->length = cpu_to_be32(bytes - tail_bytes);
>
> This only computes one extent, but tries to find the longest extent 
> possible (if two consecutive bdrv_block_status calls return the same 
> status, perhaps because of fragmentation).  Presumably this is called 
> in a loop to find a full list of extents covering the client's entire 
> request? [2]

yes. And tiny overhead is one extra found extent of other type, so this 
function can't be efficiently reused to implement several extents. 
However, I don't see real usage for this now, as we don't have multi 
extent blockstatus support in block layer.. And code for dirty bitmap 
would be separate anyway.

>
>> +
>> +    return 0;
>> +}
>> +
>> +/* nbd_co_send_extents
>> + * @extents should be in big-endian */
>> +static int nbd_co_send_extents(NBDClient *client, uint64_t handle,
>> +                               NBDExtent *extents, unsigned nb_extents,
>> +                               uint32_t context_id, Error **errp)
>> +{
>> +    NBDStructuredMeta chunk;
>> +
>> +    struct iovec iov[] = {
>> +        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
>> +        {.iov_base = extents, .iov_len = nb_extents * 
>> sizeof(extents[0])}
>> +    };
>> +
>> +    set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE, 
>> NBD_REPLY_TYPE_BLOCK_STATUS,
>> +                 handle, sizeof(chunk) - sizeof(chunk.h) + 
>> iov[1].iov_len);
>
> This says you can only send a single chunk, which matches the fact 
> that right now you support exactly one context type.  Presumably later 
> patches tweak this.
>
>> +    stl_be_p(&chunk.context_id, context_id);
>> +
>> +    return nbd_co_send_iov(client, iov, 2, errp);
>> +}
>> +
>> +static int nbd_co_send_block_status(NBDClient *client, uint64_t handle,
>> +                                    BlockDriverState *bs, uint64_t 
>> offset,
>> +                                    uint64_t length, uint32_t 
>> context_id,
>> +                                    Error **errp)
>
> No function comment?
>
>> +{
>> +    int ret;
>> +    NBDExtent extent;
>> +
>> +    ret = blockstatus_to_extent_be(bs, offset, length, &extent);
>> +    if (ret < 0) {
>> +        return nbd_co_send_structured_error(
>> +                client, handle, -ret, "can't get block status", errp);
>> +    }
>> +
>> +    return nbd_co_send_extents(client, handle, &extent, 1, 
>> context_id, errp);
>
> [2] Okay, no looping to answer the client's entire request, so that 
> may be something you add in a later patch, to keep this initial patch 
> minimal.  But at least it matches your commit message.
>
>> +}
>> +
>>   /* nbd_co_receive_request
>>    * Collect a client request. Return 0 if request looks valid, -EIO 
>> to drop
>>    * connection right away, and any other negative value to report an 
>> error to
>> @@ -1523,6 +1816,8 @@ static int 
>> nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
>>           valid_flags |= NBD_CMD_FLAG_DF;
>>       } else if (request->type == NBD_CMD_WRITE_ZEROES) {
>>           valid_flags |= NBD_CMD_FLAG_NO_HOLE;
>> +    } else if (request->type == NBD_CMD_BLOCK_STATUS) {
>> +        valid_flags |= NBD_CMD_FLAG_REQ_ONE;
>>       }
>>       if (request->flags & ~valid_flags) {
>>           error_setg(errp, "unsupported flags for command %s (got 
>> 0x%x)",
>> @@ -1650,6 +1945,19 @@ static coroutine_fn void nbd_trip(void *opaque)
>>           }
>>             break;
>> +    case NBD_CMD_BLOCK_STATUS:
>> +        if (client->export_meta.base_allocation) {
>> +            ret = nbd_co_send_block_status(req->client, request.handle,
>> +                                           blk_bs(exp->blk), 
>> request.from,
>> +                                           request.len,
>> + NBD_META_ID_BASE_ALLOCATION,
>> +                                           &local_err);
>
> No check of client->export_meta.valid?

hmm, strange, looks like I don't use .valid at all. worth add.

>
>> +        } else {
>> +            ret = -EINVAL;
>> +            error_setg(&local_err, "CMD_BLOCK_STATUS not negotiated");
>> +        }
>
> At this point, we've either sent the final structured chunk, or we've 
> sent nothing and have ret < 0;...
>
>> +
>> +        break;
>>       default:
>>           error_setg(&local_err, "invalid request type (%" PRIu32 ") 
>> received",
>>                      request.type);
>> @@ -1680,7 +1988,7 @@ reply:
>
>     if (client->structured_reply &&
>         (ret < 0 || request.type == NBD_CMD_READ)) {
>
>>               ret = nbd_co_send_structured_done(req->client, 
>> request.handle,
>>                                                 &local_err);
>>           }
>
> ...iIf the client negotiated structured reply, then we've sent the 
> error message.  But if the client did NOT negotiate structured reply, 
> then...
>
>> -    } else {
>> +    } else if (request.type != NBD_CMD_BLOCK_STATUS) {
>
> ...we fail to reply at all.  Oops.  That's not good for 
> interoperability.  You'll need to make sure that we reply with EINVAL 
> even if the client (mistakenly) calls NBD_CMD_BLOCK_STATUS without 
> negotiating structured replies.

good caught! I felt that I'm doing something ugly, when I was writing 
this if. I'll think about how to refactor this a bit.

>
>>           ret = nbd_co_send_simple_reply(req->client, request.handle,
>>                                          ret < 0 ? -ret : 0,
>>                                          req->data, reply_data_len, 
>> &local_err);
>>
>
> Also missing from the patch - where do you validate that the export 
> name in client->export_meta matches the export name in NBD_OPT_GO?  If 
> they don't match, then export_meta should be discarded.
>
hm, me too.


Thank you for careful review!
Eric Blake Feb. 16, 2018, 5:01 p.m. UTC | #6
On 02/16/2018 08:43 AM, Vladimir Sementsov-Ogievskiy wrote:
> 16.02.2018 16:21, Eric Blake wrote:
>> On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> Minimal realization: only one extent in server answer is supported.
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>> ---

>> Again, function comments are useful.
>>
>>> +    if (query[0] == '\0' || strcmp(query, "allocation") == 0) {
>>> +        /* Note: empty query should select all contexts within base
>>> +         * namespace. */
>>> +        meta->base_allocation = true;
>>
>> From the client perspective, this handling of the empty leaf-name 
>> works well for NBD_OPT_LIST_META_CONTEXT (I want to see what leaf 
>> nodes the server supports), but not so well for 
>> NBD_OPT_SET_META_CONTEXT (asking the server to send ALL base 
>> allocations, even when I don't necessarily know how to interpret 
>> anything other than base:allocation, is a waste). So this function 
>> needs a bool parameter that says whether it is being invoked from 
>> _LIST (empty string okay, to advertise ALL base leaf names back to 
>> client, which for now is just base:allocation) or from _SET (empty 
>> string is ignored as invalid; client has to specifically ask for 
>> base:allocation by name).
> 
> "empty string is ignored as invalid", hm, do we have this in spec? I 
> think SET and LIST must select exactly same sets of contexts.

No, it is specifically NOT intended that SET and LIST have to produce 
the same set of contexts; although I do see your point that as currently 
written, it does not appear to require SET to ignore "base:" or to treat 
it as an error.  At any rate, the spec gives the example of:

> In either case, however, for any given namespace the server MAY, instead of exhaustively listing every matching context available to select (or every context available to select where no query is given), send sufficient context records back to allow a client with knowledge of the namespace to select any context. This may be helpful where a client can construct algorithmic queries. For instance, a client might reply simply with the namespace with no leaf-name (e.g. 'X-FooBar:') or with a range of values (e.g. 'X-ModifiedDate:20160310-20161214'). The semantics of such a reply are a matter for the definition of the namespace. However each namespace returned MUST begin with the relevant namespace, followed by a colon, and then other UTF-8 characters, with the entire string following the restrictions for strings set out earlier in this document.

with the intent being that for some namespaces, it may be easy to 
perform an algorithmic query via _LIST to see what ranges are supported, 
but that you cannot select ALL elements in the range simultaneously. 
The empty query for _LIST exists to enumerate what is supported, but 
does not have to equate to an empty query for _SET selecting everything 
possible.  I could even see it being possible to have some round-trips, 
depending on the namespace (of course, any namespace other than "base:" 
will be tightly coordinated between both client and server, so they 
understand each other - the point was that the NBD spec didn't want to 
constrain what a client and server could do as long as they stay within 
the generic framework):

C> LIST ""
S> REPLY "base:allocation" id 0
S> REPLY "X-FooBar:" id 0
S> ACK
C> LIST "X-FooBar:"
S> REPLY "X-FooBar:A_Required", id 0
S> REPLY "X-FooBar:A_Min=100", id 0
S> REPLY "X-FooBar:A_Max=200", id 0
S> REPLY "X-FooBar:B_Default=300", id 0
S> REPLY "X-FooBar:B_Min=300", id 0
S> REPLY "X-FooBar:B_Max=400", id 0
S> ACK
C> SET "X-FooBar:A=150" "base:allocation"
S> REPLY "X-FooBar:A=150,B=300", id 1
S> REPLY "base:allocation", id 2
S> ACK

where the global query of all available contexts merely lists that 
X-FooBar: is understood, but that a specific query is needed for more 
details (to avoid the client having to parse those specifics if it 
doesn't care about X-FooBar:), and the specific query sets up the 
algorithmic description (parameter A is required, between 100 and 200; 
parameter B is optional, between 300 and 400, defaulting to 300), and 
the final SET gives the actual request (A given a value, B left to its 
default; but the reply names the entire context rather than repeating 
the shorter request).  So the spec is written to permit something like 
that for third-party namespaces, while also trying to be very specific 
about the "base:" context as that is the one that needs the most 
interoperability.

> It is 
> strange behavior of client to set "base:", but it is its decision. And I 
> don't thing that it is invalid.

For LIST, querying "base:" makes total sense (for the sake of example, 
we may add "base:frob" down the road that does something new.  Being 
able to LIST whether "base:" turns into just "base:allocation" or into 
"base:allocation"+"base:frob" may be useful to a client that understands 
both formats and wants to probe if the server is new; and even for a 
client right now, the client can gracefully note that it doesn't want to 
select "base:frob").  But for SET, it does not (if "base:" turns into 
"base:allocation" + "base:frob" down the road, then the server is 
wasting time preparing the response to "base:frob" for every 
NBD_CMD_BLOCK_STATUS, and the client is wasting time unpacking from the 
wire and ignoring it), so having the empty query work on LIST but not on 
SET makes sense.

> Formally we may answer with NBD_REP_ERR_TOO_BIG, but it will look weird, 
> as client see that both base: and base:allocation returns _one_ context, 
> but in one case it is too big. But if we will have several base: 
> contextes, server may fairly answer with NBD_REP_ERR_TOO_BIG.

Hmm, you have a point that while a client can ask for "namespace:", the 
server should always respond with "namespace:leaf" for the actual 
contexts that it supports/selects, so that the client knows which leaves 
it actually got, if it does not fail with ERR_TOO_BIG.  You are also 
right that failing with ERR_TOO_BIG for "base:" seems odd, but it may 
make more sense for other namespaces.

> 
> So, I think for now the code is ok.

Then this is probably worth something to bring up on the NBD list, if we 
need to tweak wording to be more explicit (whether we should 
allow/forbid wildcards during SET, or if wildcard queries are intended 
only for LIST).  Sounds like I have more spec emails to write to the NBD 
list.

> 
> Also, I don't see NBD_REP_ERR_TOO_BIG possible reply in 
> NBD_OPT_LIST_META_CONTEXT description. Should it be here?

Yeah, that's probably also worth adding to the upstream spec, even 
though it already encourages LIST results to send compressed information 
back that allows a client to contruct valid specific queries, rather 
than an exhaustive list of selecting everything possible.

>>> +/* nbd_negotiate_meta_query
>>> + * Return -errno on I/O error, 0 if option was completely handled by
>>> + * sending a reply about inconsistent lengths, or 1 on success. */
>>> +static int nbd_negotiate_meta_query(NBDClient *client,
>>> +                                    NBDExportMetaContexts *meta, 
>>> Error **errp)
>>> +{
>>> +    int ret;
>>> +    char *query, *colon, *namespace, *subquery;
>>
>> Is it worth stack-allocating query here, so you don't have to g_free() 
>> it later?  NBD limits the maximum string to 4k, which is a little bit 
>> big for stack allocation (on an operating system with 4k pages, 
>> allocating more than 4k on the stack in one function risks missing the 
>> guard page on stack overflow), but we also have the benefit that we 
>> KNOW that the set of meta-context namespaces that we support have a 
>> much smaller maximum limit of what we even care about.
> 
> it is not stack allocated, nbd_alloc_read_size_string calls g_malloc.

Hence my question - do we NEED the malloc'd version, or can we get away 
with a stack-allocated space?  Although I then revised my question...

> 
>>
>>> +
>>> +    ret = nbd_alloc_read_size_string(client, &query, errp);
>>> +    if (ret <= 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    colon = strchr(query, ':');
>>> +    if (colon == NULL) {
>>> +        ret = nbd_opt_invalid(client, errp, "no colon in query");
>>> +        goto out;
>>
>> Hmm, that puts a slight wrinkle into my proposal, or else maybe it is 
>> something I should bring up on the NBD list.  If we only read 5 
>> characters (because the max namespace WE support is "base:"), but a 
>> client asks for namespace "X-longname:", we should gracefully ignore 
>> the client's request; while we still want to reply with an error to a 
>> client that asks for "garbage" with no colon at all. The question for 
>> the NBD spec, then, is whether detecting bad client requests that 
>> didn't use colon is mandatory for the server (meaning we MUST read the 
>> entire namespace request, and search for the colon) or merely best 
>> effort (we only have to read 5 characters, and if we silently ignore 
>> instead of diagnose a bad namespace request that was longer than that, 
>> oh well). Worded from the client, it switches to a question of whether 
>> the client should expect the server to diagnose all requests, or must 
>> be prepared for the server to ignore requests even where those 
>> requests are bogus. Or, the NBD spec may change slightly to pass 
>> namespace and leafname as separate fields, both with lengths, rather 
>> than a colon, to make it easier for the server to skip over an unknown 
>> namespace/leaf pair without having to parse whether a colon was 
>> present.  I'll send that in a separate email (the upstream NBD list 
>> doesn't need to see all my review comments on this thread).

... in light of this thread now on the NBD list.

> 
> Thank you for careful review!

No problem. We still have some things to sort out on the NBD list as 
well, but I want to make sure we get something that is likely to work 
well with other implementations (I'm also trying, on the side, to get 
nbdkit to support structured reads so I have something available for 
testing cross-implementation support, but it is slow going).
Vladimir Sementsov-Ogievskiy March 1, 2018, 11:36 a.m. UTC | #7
16.02.2018 20:01, Eric Blake wrote:
> On 02/16/2018 08:43 AM, Vladimir Sementsov-Ogievskiy wrote:
>> 16.02.2018 16:21, Eric Blake wrote:
>>> On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>> Minimal realization: only one extent in server answer is supported.
>>>>
>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>> ---
>
>>> Again, function comments are useful.
>>>
>>>> +    if (query[0] == '\0' || strcmp(query, "allocation") == 0) {
>>>> +        /* Note: empty query should select all contexts within base
>>>> +         * namespace. */
>>>> +        meta->base_allocation = true;
>>>
>>> From the client perspective, this handling of the empty leaf-name 
>>> works well for NBD_OPT_LIST_META_CONTEXT (I want to see what leaf 
>>> nodes the server supports), but not so well for 
>>> NBD_OPT_SET_META_CONTEXT (asking the server to send ALL base 
>>> allocations, even when I don't necessarily know how to interpret 
>>> anything other than base:allocation, is a waste). So this function 
>>> needs a bool parameter that says whether it is being invoked from 
>>> _LIST (empty string okay, to advertise ALL base leaf names back to 
>>> client, which for now is just base:allocation) or from _SET (empty 
>>> string is ignored as invalid; client has to specifically ask for 
>>> base:allocation by name).
>>
>> "empty string is ignored as invalid", hm, do we have this in spec? I 
>> think SET and LIST must select exactly same sets of contexts.
>
> No, it is specifically NOT intended that SET and LIST have to produce 
> the same set of contexts; although I do see your point that as 
> currently written, it does not appear to require SET to ignore "base:" 
> or to treat it as an error.  At any rate, the spec gives the example of:
>
>> In either case, however, for any given namespace the server MAY, 
>> instead of exhaustively listing every matching context available to 
>> select (or every context available to select where no query is 
>> given), send sufficient context records back to allow a client with 
>> knowledge of the namespace to select any context. This may be helpful 
>> where a client can construct algorithmic queries. For instance, a 
>> client might reply simply with the namespace with no leaf-name (e.g. 
>> 'X-FooBar:') or with a range of values (e.g. 
>> 'X-ModifiedDate:20160310-20161214'). The semantics of such a reply 
>> are a matter for the definition of the namespace. However each 
>> namespace returned MUST begin with the relevant namespace, followed 
>> by a colon, and then other UTF-8 characters, with the entire string 
>> following the restrictions for strings set out earlier in this document.
>
> with the intent being that for some namespaces, it may be easy to 
> perform an algorithmic query via _LIST to see what ranges are 
> supported, but that you cannot select ALL elements in the range 
> simultaneously. The empty query for _LIST exists to enumerate what is 
> supported, but does not have to equate to an empty query for _SET 
> selecting everything possible.  I could even see it being possible to 
> have some round-trips, depending on the namespace (of course, any 
> namespace other than "base:" will be tightly coordinated between both 
> client and server, so they understand each other - the point was that 
> the NBD spec didn't want to constrain what a client and server could 
> do as long as they stay within the generic framework):
>
> C> LIST ""
> S> REPLY "base:allocation" id 0
> S> REPLY "X-FooBar:" id 0
> S> ACK
> C> LIST "X-FooBar:"
> S> REPLY "X-FooBar:A_Required", id 0
> S> REPLY "X-FooBar:A_Min=100", id 0
> S> REPLY "X-FooBar:A_Max=200", id 0
> S> REPLY "X-FooBar:B_Default=300", id 0
> S> REPLY "X-FooBar:B_Min=300", id 0
> S> REPLY "X-FooBar:B_Max=400", id 0
> S> ACK
> C> SET "X-FooBar:A=150" "base:allocation"
> S> REPLY "X-FooBar:A=150,B=300", id 1
> S> REPLY "base:allocation", id 2
> S> ACK
>
> where the global query of all available contexts merely lists that 
> X-FooBar: is understood, but that a specific query is needed for more 
> details (to avoid the client having to parse those specifics if it 
> doesn't care about X-FooBar:), and the specific query sets up the 
> algorithmic description (parameter A is required, between 100 and 200; 
> parameter B is optional, between 300 and 400, defaulting to 300), and 
> the final SET gives the actual request (A given a value, B left to its 
> default; but the reply names the entire context rather than repeating 
> the shorter request).  So the spec is written to permit something like 
> that for third-party namespaces, while also trying to be very specific 
> about the "base:" context as that is the one that needs the most 
> interoperability.
>
>> It is strange behavior of client to set "base:", but it is its 
>> decision. And I don't thing that it is invalid.
>
> For LIST, querying "base:" makes total sense (for the sake of example, 
> we may add "base:frob" down the road that does something new.  Being 
> able to LIST whether "base:" turns into just "base:allocation" or into 
> "base:allocation"+"base:frob" may be useful to a client that 
> understands both formats and wants to probe if the server is new; and 
> even for a client right now, the client can gracefully note that it 
> doesn't want to select "base:frob").  But for SET, it does not (if 
> "base:" turns into "base:allocation" + "base:frob" down the road, then 
> the server is wasting time preparing the response to "base:frob" for 
> every NBD_CMD_BLOCK_STATUS, and the client is wasting time unpacking 
> from the wire and ignoring it), so having the empty query work on LIST 
> but not on SET makes sense.

Ok, understand, you are right, and, correspondingly,

>
>> Formally we may answer with NBD_REP_ERR_TOO_BIG, but it will look 
>> weird, as client see that both base: and base:allocation returns 
>> _one_ context, but in one case it is too big. But if we will have 
>> several base: contextes, server may fairly answer with 
>> NBD_REP_ERR_TOO_BIG.
>
> Hmm, you have a point that while a client can ask for "namespace:", 
> the server should always respond with "namespace:leaf" for the actual 
> contexts that it supports/selects, so that the client knows which 
> leaves it actually got, if it does not fail with ERR_TOO_BIG.  You are 
> also right that failing with ERR_TOO_BIG for "base:" seems odd, but it 
> may make more sense for other namespaces.

_INVALID is ok here.

>
>>
>> So, I think for now the code is ok.
>
> Then this is probably worth something to bring up on the NBD list, if 
> we need to tweak wording to be more explicit (whether we should 
> allow/forbid wildcards during SET, or if wildcard queries are intended 
> only for LIST).  Sounds like I have more spec emails to write to the 
> NBD list.

So, code is not ok, I'll fix)

>
>>
>> Also, I don't see NBD_REP_ERR_TOO_BIG possible reply in 
>> NBD_OPT_LIST_META_CONTEXT description. Should it be here?
>
> Yeah, that's probably also worth adding to the upstream spec, even 
> though it already encourages LIST results to send compressed 
> information back that allows a client to contruct valid specific 
> queries, rather than an exhaustive list of selecting everything possible.
>
>>>> +/* nbd_negotiate_meta_query
>>>> + * Return -errno on I/O error, 0 if option was completely handled by
>>>> + * sending a reply about inconsistent lengths, or 1 on success. */
>>>> +static int nbd_negotiate_meta_query(NBDClient *client,
>>>> +                                    NBDExportMetaContexts *meta, 
>>>> Error **errp)
>>>> +{
>>>> +    int ret;
>>>> +    char *query, *colon, *namespace, *subquery;
>>>
>>> Is it worth stack-allocating query here, so you don't have to 
>>> g_free() it later?  NBD limits the maximum string to 4k, which is a 
>>> little bit big for stack allocation (on an operating system with 4k 
>>> pages, allocating more than 4k on the stack in one function risks 
>>> missing the guard page on stack overflow), but we also have the 
>>> benefit that we KNOW that the set of meta-context namespaces that we 
>>> support have a much smaller maximum limit of what we even care about.
>>
>> it is not stack allocated, nbd_alloc_read_size_string calls g_malloc.
>
> Hence my question - do we NEED the malloc'd version, or can we get 
> away with a stack-allocated space?  Although I then revised my 
> question...

ohm, just misread, will try.

>
>>
>>>
>>>> +
>>>> +    ret = nbd_alloc_read_size_string(client, &query, errp);
>>>> +    if (ret <= 0) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    colon = strchr(query, ':');
>>>> +    if (colon == NULL) {
>>>> +        ret = nbd_opt_invalid(client, errp, "no colon in query");
>>>> +        goto out;
>>>
>>> Hmm, that puts a slight wrinkle into my proposal, or else maybe it 
>>> is something I should bring up on the NBD list.  If we only read 5 
>>> characters (because the max namespace WE support is "base:"), but a 
>>> client asks for namespace "X-longname:", we should gracefully ignore 
>>> the client's request; while we still want to reply with an error to 
>>> a client that asks for "garbage" with no colon at all. The question 
>>> for the NBD spec, then, is whether detecting bad client requests 
>>> that didn't use colon is mandatory for the server (meaning we MUST 
>>> read the entire namespace request, and search for the colon) or 
>>> merely best effort (we only have to read 5 characters, and if we 
>>> silently ignore instead of diagnose a bad namespace request that was 
>>> longer than that, oh well). Worded from the client, it switches to a 
>>> question of whether the client should expect the server to diagnose 
>>> all requests, or must be prepared for the server to ignore requests 
>>> even where those requests are bogus. Or, the NBD spec may change 
>>> slightly to pass namespace and leafname as separate fields, both 
>>> with lengths, rather than a colon, to make it easier for the server 
>>> to skip over an unknown namespace/leaf pair without having to parse 
>>> whether a colon was present. I'll send that in a separate email (the 
>>> upstream NBD list doesn't need to see all my review comments on this 
>>> thread).
>
> ... in light of this thread now on the NBD list.
>
>>
>> Thank you for careful review!
>
> No problem. We still have some things to sort out on the NBD list as 
> well, but I want to make sure we get something that is likely to work 
> well with other implementations (I'm also trying, on the side, to get 
> nbdkit to support structured reads so I have something available for 
> testing cross-implementation support, but it is slow going).
>
Vladimir Sementsov-Ogievskiy March 2, 2018, 3:07 p.m. UTC | #8
16.02.2018 16:21, Eric Blake wrote:
> On 02/15/2018 07:51 AM, Vladimir Sementsov-Ogievskiy wrote:
>> Minimal realization: only one extent in server answer is supported.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>
>

[...]

>> +        meta = &local_meta;
>> +    }
>> +
>> +    memset(meta, 0, sizeof(*meta));
>> +
>> +    ret = nbd_read_size_string(client, meta->export_name,
>> +                               NBD_MAX_NAME_SIZE, errp);
>
> Revisiting a question I raised in my first half review - you saved the 
> name as part of the struct because we have to later compare that the 
> final OPT_SET export name matches the request during OPT_GO (if they 
> don't match, then we have no contexts to serve after all).  So a 
> 'const char *' won't work, but maybe the struct could use a 'char *' 
> pointing to malloc'd storage rather than char[MAX_NAME] that reserves 
> array space that is mostly unused for the typical name that is much 
> shorter than the maximum name length.

Do these 256 bytes worth creating nbd_clear_meta function, to call it 
from client_put and on NBD_OPT_SET_META_CONTEXT, to clear previous meta? 
If yes, I'd prefer to postpone it to continuation about BLOCKSTATUS for 
dirty bitmaps, when I'll have to upgrade meta structure, to maintain 
dirty bitmaps.

>
>> +    if (ret <= 0) {
>> +        return ret;
>> +    }
>> +
>> +    exp = nbd_export_find(meta->export_name);
>> +    if (exp == NULL) {
>> +        return nbd_opt_invalid(client, errp,
>> +                               "export '%s' not present", 
>> meta->export_name);
>
diff mbox series

Patch

diff --git a/include/block/nbd.h b/include/block/nbd.h
index ef1698914b..b16215d17a 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -41,6 +41,12 @@  struct NBDOptionReply {
 } QEMU_PACKED;
 typedef struct NBDOptionReply NBDOptionReply;
 
+typedef struct NBDOptionReplyMetaContext {
+    NBDOptionReply h; /* h.type = NBD_REP_META_CONTEXT, h.length > 4 */
+    uint32_t context_id;
+    /* meta context name follows */
+} QEMU_PACKED NBDOptionReplyMetaContext;
+
 /* Transmission phase structs
  *
  * Note: these are _NOT_ the same as the network representation of an NBD
@@ -105,6 +111,19 @@  typedef struct NBDStructuredError {
     uint16_t message_length;
 } QEMU_PACKED NBDStructuredError;
 
+/* Header of NBD_REPLY_TYPE_BLOCK_STATUS */
+typedef struct NBDStructuredMeta {
+    NBDStructuredReplyChunk h; /* h.length >= 12 (at least one extent) */
+    uint32_t context_id;
+    /* extents follows */
+} QEMU_PACKED NBDStructuredMeta;
+
+/* Extent chunk for NBD_REPLY_TYPE_BLOCK_STATUS */
+typedef struct NBDExtent {
+    uint32_t length;
+    uint32_t flags; /* NBD_STATE_* */
+} QEMU_PACKED NBDExtent;
+
 /* Transmission (export) flags: sent from server to client during handshake,
    but describe what will happen during transmission */
 #define NBD_FLAG_HAS_FLAGS         (1 << 0) /* Flags are there */
@@ -136,6 +155,8 @@  typedef struct NBDStructuredError {
 #define NBD_OPT_INFO              (6)
 #define NBD_OPT_GO                (7)
 #define NBD_OPT_STRUCTURED_REPLY  (8)
+#define NBD_OPT_LIST_META_CONTEXT (9)
+#define NBD_OPT_SET_META_CONTEXT  (10)
 
 /* Option reply types. */
 #define NBD_REP_ERR(value) ((UINT32_C(1) << 31) | (value))
@@ -143,6 +164,7 @@  typedef struct NBDStructuredError {
 #define NBD_REP_ACK             (1)    /* Data sending finished. */
 #define NBD_REP_SERVER          (2)    /* Export description. */
 #define NBD_REP_INFO            (3)    /* NBD_OPT_INFO/GO. */
+#define NBD_REP_META_CONTEXT    (4)    /* NBD_OPT_{LIST,SET}_META_CONTEXT */
 
 #define NBD_REP_ERR_UNSUP           NBD_REP_ERR(1)  /* Unknown option */
 #define NBD_REP_ERR_POLICY          NBD_REP_ERR(2)  /* Server denied */
@@ -163,6 +185,10 @@  typedef struct NBDStructuredError {
 #define NBD_CMD_FLAG_FUA        (1 << 0) /* 'force unit access' during write */
 #define NBD_CMD_FLAG_NO_HOLE    (1 << 1) /* don't punch hole on zero run */
 #define NBD_CMD_FLAG_DF         (1 << 2) /* don't fragment structured read */
+#define NBD_CMD_FLAG_REQ_ONE    (1 << 3) /* only one extent in BLOCK_STATUS
+                                          * reply chunk */
+
+#define NBD_META_ID_BASE_ALLOCATION 0
 
 /* Supported request types */
 enum {
@@ -173,6 +199,7 @@  enum {
     NBD_CMD_TRIM = 4,
     /* 5 reserved for failed experiment NBD_CMD_CACHE */
     NBD_CMD_WRITE_ZEROES = 6,
+    NBD_CMD_BLOCK_STATUS = 7,
 };
 
 #define NBD_DEFAULT_PORT	10809
@@ -200,9 +227,15 @@  enum {
 #define NBD_REPLY_TYPE_NONE          0
 #define NBD_REPLY_TYPE_OFFSET_DATA   1
 #define NBD_REPLY_TYPE_OFFSET_HOLE   2
+#define NBD_REPLY_TYPE_BLOCK_STATUS  5
 #define NBD_REPLY_TYPE_ERROR         NBD_REPLY_ERR(1)
 #define NBD_REPLY_TYPE_ERROR_OFFSET  NBD_REPLY_ERR(2)
 
+/* Flags for extents (NBDExtent.flags) of NBD_REPLY_TYPE_BLOCK_STATUS,
+ * for base:allocation meta context */
+#define NBD_STATE_HOLE (1 << 0)
+#define NBD_STATE_ZERO (1 << 1)
+
 static inline bool nbd_reply_type_is_error(int type)
 {
     return type & (1 << 15);
diff --git a/nbd/common.c b/nbd/common.c
index 6295526dd1..8c95c1d606 100644
--- a/nbd/common.c
+++ b/nbd/common.c
@@ -75,6 +75,10 @@  const char *nbd_opt_lookup(uint32_t opt)
         return "go";
     case NBD_OPT_STRUCTURED_REPLY:
         return "structured reply";
+    case NBD_OPT_LIST_META_CONTEXT:
+        return "list meta context";
+    case NBD_OPT_SET_META_CONTEXT:
+        return "set meta context";
     default:
         return "<unknown>";
     }
@@ -90,6 +94,8 @@  const char *nbd_rep_lookup(uint32_t rep)
         return "server";
     case NBD_REP_INFO:
         return "info";
+    case NBD_REP_META_CONTEXT:
+        return "meta context";
     case NBD_REP_ERR_UNSUP:
         return "unsupported";
     case NBD_REP_ERR_POLICY:
@@ -144,6 +150,8 @@  const char *nbd_cmd_lookup(uint16_t cmd)
         return "trim";
     case NBD_CMD_WRITE_ZEROES:
         return "write zeroes";
+    case NBD_CMD_BLOCK_STATUS:
+        return "block status";
     default:
         return "<unknown>";
     }
@@ -159,6 +167,8 @@  const char *nbd_reply_type_lookup(uint16_t type)
         return "data";
     case NBD_REPLY_TYPE_OFFSET_HOLE:
         return "hole";
+    case NBD_REPLY_TYPE_BLOCK_STATUS:
+        return "block status";
     case NBD_REPLY_TYPE_ERROR:
         return "generic error";
     case NBD_REPLY_TYPE_ERROR_OFFSET:
diff --git a/nbd/server.c b/nbd/server.c
index b9860a6dcf..f2a750eb97 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -82,6 +82,15 @@  struct NBDExport {
 
 static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
 
+/* NBDExportMetaContexts represents list of selected by
+ * NBD_OPT_SET_META_CONTEXT contexts to be exported. */
+typedef struct NBDExportMetaContexts {
+    char export_name[NBD_MAX_NAME_SIZE + 1];
+    bool valid; /* means that negotiation of the option finished without
+                   errors */
+    bool base_allocation; /* export base:allocation context (block status) */
+} NBDExportMetaContexts;
+
 struct NBDClient {
     int refcount;
     void (*close_fn)(NBDClient *client, bool negotiated);
@@ -102,6 +111,7 @@  struct NBDClient {
     bool closing;
 
     bool structured_reply;
+    NBDExportMetaContexts export_meta;
 
     uint32_t opt; /* Current option being negotiated */
     uint32_t optlen; /* remaining length of data in ioc for the option being
@@ -636,6 +646,201 @@  static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
     return QIO_CHANNEL(tioc);
 }
 
+/* nbd_alloc_read_size_string
+ *
+ * Read string in format
+ *   uint32_t len
+ *   len bytes string (not 0-terminated)
+ * String is allocated and pointer returned as @buf
+ *
+ * Return -errno on I/O error, 0 if option was completely handled by
+ * sending a reply about inconsistent lengths, or 1 on success. */
+static int nbd_alloc_read_size_string(NBDClient *client, char **buf,
+                                      Error **errp)
+{
+    int ret;
+    uint32_t len;
+
+    ret = nbd_opt_read(client, &len, sizeof(len), errp);
+    if (ret <= 0) {
+        return ret;
+    }
+    cpu_to_be32s(&len);
+
+    *buf = g_try_malloc(len + 1);
+    if (*buf == NULL) {
+        error_setg(errp, "No memory");
+        return -ENOMEM;
+    }
+    (*buf)[len] = '\0';
+
+    ret = nbd_opt_read(client, *buf, len, errp);
+    if (ret <= 0) {
+        g_free(*buf);
+        *buf = NULL;
+    }
+
+    return ret;
+}
+
+/* nbd_read_size_string
+ *
+ * Read string in format
+ *   uint32_t len
+ *   len bytes string (not 0-terminated)
+ *
+ * @buf should be enough to store @max_len+1
+ *
+ * Return -errno on I/O error, 0 if option was completely handled by
+ * sending a reply about inconsistent lengths, or 1 on success. */
+static int nbd_read_size_string(NBDClient *client, char *buf,
+                                uint32_t max_len, Error **errp)
+{
+    int ret;
+    uint32_t len;
+
+    ret = nbd_opt_read(client, &len, sizeof(len), errp);
+    if (ret <= 0) {
+        return ret;
+    }
+    cpu_to_be32s(&len);
+
+    if (len > max_len) {
+        return nbd_opt_invalid(client, errp, "Invalid string length: %u", len);
+    }
+
+    ret = nbd_opt_read(client, buf, len, errp);
+    if (ret <= 0) {
+        return ret;
+    }
+    buf[len] = '\0';
+
+    return 1;
+}
+
+static int nbd_negotiate_send_meta_context(NBDClient *client,
+                                           const char *context,
+                                           uint32_t context_id,
+                                           Error **errp)
+{
+    NBDOptionReplyMetaContext opt;
+    struct iovec iov[] = {
+        {.iov_base = &opt, .iov_len = sizeof(opt)},
+        {.iov_base = (void *)context, .iov_len = strlen(context)}
+    };
+
+    set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
+                      sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
+    stl_be_p(&opt.context_id, context_id);
+
+    return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? -EIO : 0;
+}
+
+static void nbd_meta_base_query(NBDExportMetaContexts *meta, const char *query)
+{
+    if (query[0] == '\0' || strcmp(query, "allocation") == 0) {
+        /* Note: empty query should select all contexts within base
+         * namespace. */
+        meta->base_allocation = true;
+    }
+}
+
+/* nbd_negotiate_meta_query
+ * Return -errno on I/O error, 0 if option was completely handled by
+ * sending a reply about inconsistent lengths, or 1 on success. */
+static int nbd_negotiate_meta_query(NBDClient *client,
+                                    NBDExportMetaContexts *meta, Error **errp)
+{
+    int ret;
+    char *query, *colon, *namespace, *subquery;
+
+    ret = nbd_alloc_read_size_string(client, &query, errp);
+    if (ret <= 0) {
+        return ret;
+    }
+
+    colon = strchr(query, ':');
+    if (colon == NULL) {
+        ret = nbd_opt_invalid(client, errp, "no colon in query");
+        goto out;
+    }
+    *colon = '\0';
+    namespace = query;
+    subquery = colon + 1;
+
+    if (strcmp(namespace, "base") == 0) {
+        nbd_meta_base_query(meta, subquery);
+    }
+
+out:
+    g_free(query);
+    return ret;
+}
+
+/* nbd_negotiate_meta_queries
+ * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
+ *
+ * Return -errno on I/O error, 0 if option was completely handled by
+ * sending a reply about inconsistent lengths, or 1 on success. */
+static int nbd_negotiate_meta_queries(NBDClient *client,
+                                      NBDExportMetaContexts *meta, Error **errp)
+{
+    int ret;
+    NBDExport *exp;
+    NBDExportMetaContexts local_meta;
+    uint32_t nb_queries;
+    int i;
+
+    assert(client->structured_reply);
+
+    if (meta == NULL) {
+        meta = &local_meta;
+    }
+
+    memset(meta, 0, sizeof(*meta));
+
+    ret = nbd_read_size_string(client, meta->export_name,
+                               NBD_MAX_NAME_SIZE, errp);
+    if (ret <= 0) {
+        return ret;
+    }
+
+    exp = nbd_export_find(meta->export_name);
+    if (exp == NULL) {
+        return nbd_opt_invalid(client, errp,
+                               "export '%s' not present", meta->export_name);
+    }
+
+    ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), errp);
+    if (ret <= 0) {
+        return ret;
+    }
+    cpu_to_be32s(&nb_queries);
+
+    for (i = 0; i < nb_queries; ++i) {
+        ret = nbd_negotiate_meta_query(client, meta, errp);
+        if (ret <= 0) {
+            return ret;
+        }
+    }
+
+    if (meta->base_allocation) {
+        ret = nbd_negotiate_send_meta_context(client, "base:allocation",
+                                              NBD_META_ID_BASE_ALLOCATION,
+                                              errp);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
+    if (ret == 0) {
+        meta->valid = true;
+    }
+
+    return ret;
+}
+
 /* nbd_negotiate_options
  * Process all NBD_OPT_* client option commands, during fixed newstyle
  * negotiation.
@@ -826,6 +1031,22 @@  static int nbd_negotiate_options(NBDClient *client, uint16_t myflags,
                 }
                 break;
 
+            case NBD_OPT_LIST_META_CONTEXT:
+            case NBD_OPT_SET_META_CONTEXT:
+                if (!client->structured_reply) {
+                    ret = nbd_opt_invalid(
+                            client, errp,
+                            "request option '%s' when structured reply "
+                            "is not negotiated", nbd_opt_lookup(option));
+                } else if (option == NBD_OPT_LIST_META_CONTEXT) {
+                    ret = nbd_negotiate_meta_queries(client, NULL, errp);
+                } else {
+                    ret = nbd_negotiate_meta_queries(client,
+                                                     &client->export_meta,
+                                                     errp);
+                }
+                break;
+
             default:
                 ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
                                    "Unsupported option 0x%" PRIx32 " (%s)",
@@ -1446,6 +1667,78 @@  static int coroutine_fn nbd_co_send_structured_error(NBDClient *client,
     return nbd_co_send_iov(client, iov, 1 + !!iov[1].iov_len, errp);
 }
 
+static int blockstatus_to_extent_be(BlockDriverState *bs, uint64_t offset,
+                                    uint64_t bytes, NBDExtent *extent)
+{
+    uint64_t tail_bytes = bytes;
+
+    while (tail_bytes) {
+        uint32_t flags;
+        int64_t num;
+        int ret = bdrv_block_status_above(bs, NULL, offset, tail_bytes, &num,
+                                          NULL, NULL);
+        if (ret < 0) {
+            return ret;
+        }
+
+        flags = (ret & BDRV_BLOCK_ALLOCATED ? 0 : NBD_STATE_HOLE) |
+                (ret & BDRV_BLOCK_ZERO      ? NBD_STATE_ZERO : 0);
+
+        if (tail_bytes == bytes) {
+            extent->flags = flags;
+        }
+
+        if (flags != extent->flags) {
+            break;
+        }
+
+        offset += num;
+        tail_bytes -= num;
+    }
+
+    cpu_to_be32s(&extent->flags);
+    extent->length = cpu_to_be32(bytes - tail_bytes);
+
+    return 0;
+}
+
+/* nbd_co_send_extents
+ * @extents should be in big-endian */
+static int nbd_co_send_extents(NBDClient *client, uint64_t handle,
+                               NBDExtent *extents, unsigned nb_extents,
+                               uint32_t context_id, Error **errp)
+{
+    NBDStructuredMeta chunk;
+
+    struct iovec iov[] = {
+        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
+        {.iov_base = extents, .iov_len = nb_extents * sizeof(extents[0])}
+    };
+
+    set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_BLOCK_STATUS,
+                 handle, sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);
+    stl_be_p(&chunk.context_id, context_id);
+
+    return nbd_co_send_iov(client, iov, 2, errp);
+}
+
+static int nbd_co_send_block_status(NBDClient *client, uint64_t handle,
+                                    BlockDriverState *bs, uint64_t offset,
+                                    uint64_t length, uint32_t context_id,
+                                    Error **errp)
+{
+    int ret;
+    NBDExtent extent;
+
+    ret = blockstatus_to_extent_be(bs, offset, length, &extent);
+    if (ret < 0) {
+        return nbd_co_send_structured_error(
+                client, handle, -ret, "can't get block status", errp);
+    }
+
+    return nbd_co_send_extents(client, handle, &extent, 1, context_id, errp);
+}
+
 /* nbd_co_receive_request
  * Collect a client request. Return 0 if request looks valid, -EIO to drop
  * connection right away, and any other negative value to report an error to
@@ -1523,6 +1816,8 @@  static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
         valid_flags |= NBD_CMD_FLAG_DF;
     } else if (request->type == NBD_CMD_WRITE_ZEROES) {
         valid_flags |= NBD_CMD_FLAG_NO_HOLE;
+    } else if (request->type == NBD_CMD_BLOCK_STATUS) {
+        valid_flags |= NBD_CMD_FLAG_REQ_ONE;
     }
     if (request->flags & ~valid_flags) {
         error_setg(errp, "unsupported flags for command %s (got 0x%x)",
@@ -1650,6 +1945,19 @@  static coroutine_fn void nbd_trip(void *opaque)
         }
 
         break;
+    case NBD_CMD_BLOCK_STATUS:
+        if (client->export_meta.base_allocation) {
+            ret = nbd_co_send_block_status(req->client, request.handle,
+                                           blk_bs(exp->blk), request.from,
+                                           request.len,
+                                           NBD_META_ID_BASE_ALLOCATION,
+                                           &local_err);
+        } else {
+            ret = -EINVAL;
+            error_setg(&local_err, "CMD_BLOCK_STATUS not negotiated");
+        }
+
+        break;
     default:
         error_setg(&local_err, "invalid request type (%" PRIu32 ") received",
                    request.type);
@@ -1680,7 +1988,7 @@  reply:
             ret = nbd_co_send_structured_done(req->client, request.handle,
                                               &local_err);
         }
-    } else {
+    } else if (request.type != NBD_CMD_BLOCK_STATUS) {
         ret = nbd_co_send_simple_reply(req->client, request.handle,
                                        ret < 0 ? -ret : 0,
                                        req->data, reply_data_len, &local_err);