diff mbox series

[8/8] nbd: Minimal structured read for client

Message ID 20170925135801.144261-9-vsementsov@virtuozzo.com
State New
Headers show
Series nbd minimal structured read | expand

Commit Message

Vladimir Sementsov-Ogievskiy Sept. 25, 2017, 1:58 p.m. UTC
Minimal implementation: drop most of additional error information.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 block/nbd-client.h  |   2 +
 include/block/nbd.h |  15 ++++-
 block/nbd-client.c  |  97 +++++++++++++++++++++++++-----
 nbd/client.c        | 169 +++++++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 249 insertions(+), 34 deletions(-)

Comments

Eric Blake Sept. 25, 2017, 10:19 p.m. UTC | #1
On 09/25/2017 08:58 AM, Vladimir Sementsov-Ogievskiy wrote:
> Minimal implementation: drop most of additional error information.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
>  block/nbd-client.h  |   2 +
>  include/block/nbd.h |  15 ++++-
>  block/nbd-client.c  |  97 +++++++++++++++++++++++++-----
>  nbd/client.c        | 169 +++++++++++++++++++++++++++++++++++++++++++++++-----
>  4 files changed, 249 insertions(+), 34 deletions(-)
> 

> +++ b/include/block/nbd.h
> @@ -57,11 +57,17 @@ struct NBDRequest {
>  };
>  typedef struct NBDRequest NBDRequest;
>  
> -struct NBDReply {
> +typedef struct NBDReply {
> +    bool simple;
>      uint64_t handle;
>      uint32_t error;
> -};
> -typedef struct NBDReply NBDReply;
> +
> +    uint16_t flags;
> +    uint16_t type;
> +    uint32_t tail_length;
> +    uint64_t offset;
> +    uint32_t hole_size;
> +} NBDReply;

This feels like it should be a discriminated union, rather than a struct
containing fields that are only sometimes valid...

>  
>  #define NBD_SREP_TYPE_NONE          0
>  #define NBD_SREP_TYPE_OFFSET_DATA   1
> +#define NBD_SREP_TYPE_OFFSET_HOLE   2
>  #define NBD_SREP_TYPE_ERROR         NBD_SREP_ERR(1)
> +#define NBD_SREP_TYPE_ERROR_OFFSET  NBD_SREP_ERR(2)

...especially since there is more than one type of SREP packet layout.

I also wonder why you are defining constants in a piecemeal fashion,
rather than all at once (even if your minimal server implementation
doesn't send a particular constant, there's no harm in defining them all
up front).

> +++ b/block/nbd-client.c
> @@ -179,9 +179,10 @@ err:
>      return rc;
>  }
>  
> -static int nbd_co_receive_reply(NBDClientSession *s,
> -                                uint64_t handle,
> -                                QEMUIOVector *qiov)
> +static int nbd_co_receive_1_reply_or_chunk(NBDClientSession *s,

Long name, and unusual to mix in "1" instead of "one".  Would it be
better to name it nbd_co_receive_chunk, where we declare that a simple
reply is (roughly) the same amount of effort as a chunk in a structured
reply?

> +                                           uint64_t handle,
> +                                           bool *cont,
> +                                           QEMUIOVector *qiov)
>  {

No documentation of the function?

>      int ret;
>      int i = HANDLE_TO_INDEX(s, handle);
> @@ -191,29 +192,95 @@ static int nbd_co_receive_reply(NBDClientSession *s,
>      qemu_coroutine_yield();
>      s->requests[i].receiving = false;
>      if (!s->ioc || s->quit) {
> -        ret = -EIO;
> -    } else {
> -        assert(s->reply.handle == handle);
> -        ret = -s->reply.error;
> -        if (qiov && s->reply.error == 0) {
> +        *cont = false;
> +        return -EIO;
> +    }
> +
> +    assert(s->reply.handle == handle);
> +    *cont = !(s->reply.simple || (s->reply.flags & NBD_SREP_FLAG_DONE));

We need to validate that the server is not sending us SREP chunks unless
we negotiated them.  I'm thinking it might be better to do it here
(maybe you did it somewhere else, but I haven't seen it yet; I'm
reviewing the patch in textual order rather than the order in which
things are called).

> +    ret = -s->reply.error;
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    if (s->reply.simple) {
> +        if (qiov) {
>              if (qio_channel_readv_all(s->ioc, qiov->iov, qiov->niov,
> -                                      NULL) < 0) {
> -                ret = -EIO;
> -                s->quit = true;
> +                                      NULL) < 0)
> +            {
> +                goto fatal;
>              }
>          }
> +        goto out;
> +    }
>  
> -        /* Tell the read handler to read another header.  */
> -        s->reply.handle = 0;
> +    /* here we deal with successful structured reply */
> +    switch (s->reply.type) {
> +        QEMUIOVector sub_qiov;
> +    case NBD_SREP_TYPE_OFFSET_DATA:

This is putting a LOT of smarts directly into the receive routine.
Here's where I was previously wondering (and I think Paolo as well)
whether it might be better to split the efforts: the generic function
reads off the chunk information and any payload, but a per-command
callback function then parses the chunks.  Especially since the
definition of the chunks differs on a per-command basis (yes, the NBD
spec will try to not reuse an SREP chunk type across multiple commands
unless the semantics are similar, but that's a bit more fragile).  This
particularly matters given my statement above that you want a
discriminated union, rather than a struct that contains unused fields,
for handling different SREP chunk types.

My review has to pause here for now...
Vladimir Sementsov-Ogievskiy Sept. 27, 2017, 10:05 a.m. UTC | #2
26.09.2017 01:19, Eric Blake wrote:
> On 09/25/2017 08:58 AM, Vladimir Sementsov-Ogievskiy wrote:
>> Minimal implementation: drop most of additional error information.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>   block/nbd-client.h  |   2 +
>>   include/block/nbd.h |  15 ++++-
>>   block/nbd-client.c  |  97 +++++++++++++++++++++++++-----
>>   nbd/client.c        | 169 +++++++++++++++++++++++++++++++++++++++++++++++-----
>>   4 files changed, 249 insertions(+), 34 deletions(-)
>>
>> +++ b/include/block/nbd.h
>> @@ -57,11 +57,17 @@ struct NBDRequest {
>>   };
>>   typedef struct NBDRequest NBDRequest;
>>   
>> -struct NBDReply {
>> +typedef struct NBDReply {
>> +    bool simple;
>>       uint64_t handle;
>>       uint32_t error;
>> -};
>> -typedef struct NBDReply NBDReply;
>> +
>> +    uint16_t flags;
>> +    uint16_t type;
>> +    uint32_t tail_length;
>> +    uint64_t offset;
>> +    uint32_t hole_size;
>> +} NBDReply;
> This feels like it should be a discriminated union, rather than a struct
> containing fields that are only sometimes valid...

No:

simple, handle and error are always valid
flags, type, tail_length are valid for all structured replies
offset and hole_size are valid for structured hole reply

so, we have one case when all fields are valid.. how do you see it with 
union, what is the real difference? I think it would be just a complication.

>
>>   
>>   #define NBD_SREP_TYPE_NONE          0
>>   #define NBD_SREP_TYPE_OFFSET_DATA   1
>> +#define NBD_SREP_TYPE_OFFSET_HOLE   2
>>   #define NBD_SREP_TYPE_ERROR         NBD_SREP_ERR(1)
>> +#define NBD_SREP_TYPE_ERROR_OFFSET  NBD_SREP_ERR(2)
> ...especially since there is more than one type of SREP packet layout.
>
> I also wonder why you are defining constants in a piecemeal fashion,
> rather than all at once (even if your minimal server implementation
> doesn't send a particular constant, there's no harm in defining them all
> up front).

hmm. just to not define unused constants. It doesn't matter, I can 
define them all if you prefer.

>
>> +++ b/block/nbd-client.c
>> @@ -179,9 +179,10 @@ err:
>>       return rc;
>>   }
>>   
>> -static int nbd_co_receive_reply(NBDClientSession *s,
>> -                                uint64_t handle,
>> -                                QEMUIOVector *qiov)
>> +static int nbd_co_receive_1_reply_or_chunk(NBDClientSession *s,
> Long name, and unusual to mix in "1" instead of "one".  Would it be
> better to name it nbd_co_receive_chunk, where we declare that a simple
> reply is (roughly) the same amount of effort as a chunk in a structured
> reply?
>
>> +                                           uint64_t handle,
>> +                                           bool *cont,
>> +                                           QEMUIOVector *qiov)
>>   {
> No documentation of the function?
>
>>       int ret;
>>       int i = HANDLE_TO_INDEX(s, handle);
>> @@ -191,29 +192,95 @@ static int nbd_co_receive_reply(NBDClientSession *s,
>>       qemu_coroutine_yield();
>>       s->requests[i].receiving = false;
>>       if (!s->ioc || s->quit) {
>> -        ret = -EIO;
>> -    } else {
>> -        assert(s->reply.handle == handle);
>> -        ret = -s->reply.error;
>> -        if (qiov && s->reply.error == 0) {
>> +        *cont = false;
>> +        return -EIO;
>> +    }
>> +
>> +    assert(s->reply.handle == handle);
>> +    *cont = !(s->reply.simple || (s->reply.flags & NBD_SREP_FLAG_DONE));
> We need to validate that the server is not sending us SREP chunks unless
> we negotiated them.  I'm thinking it might be better to do it here
> (maybe you did it somewhere else, but I haven't seen it yet; I'm
> reviewing the patch in textual order rather than the order in which
> things are called).

No, I didn't. Will add (may be early, in reply_entry).

>
>> +    ret = -s->reply.error;
>> +    if (ret < 0) {
>> +        goto out;
>> +    }
>> +
>> +    if (s->reply.simple) {
>> +        if (qiov) {
>>               if (qio_channel_readv_all(s->ioc, qiov->iov, qiov->niov,
>> -                                      NULL) < 0) {
>> -                ret = -EIO;
>> -                s->quit = true;
>> +                                      NULL) < 0)
>> +            {
>> +                goto fatal;
>>               }
>>           }
>> +        goto out;
>> +    }
>>   
>> -        /* Tell the read handler to read another header.  */
>> -        s->reply.handle = 0;
>> +    /* here we deal with successful structured reply */
>> +    switch (s->reply.type) {
>> +        QEMUIOVector sub_qiov;
>> +    case NBD_SREP_TYPE_OFFSET_DATA:
> This is putting a LOT of smarts directly into the receive routine.
> Here's where I was previously wondering (and I think Paolo as well)
> whether it might be better to split the efforts: the generic function
> reads off the chunk information and any payload, but a per-command

Hmm. it was my idea to move all reading into one coroutine (in my 
refactoring series, but Paolo was against).

Or you mean to read a payload as raw? It would lead to double copying it 
to destination qiov, which I dislike..

> callback function then parses the chunks.

per-command? Then callback for CMD_READ would have all of these 
"smarts", so the whole code would not be simpler.. (However it will 
simplify adding block-status later).

>    Especially since the
> definition of the chunks differs on a per-command basis (yes, the NBD
> spec will try to not reuse an SREP chunk type across multiple commands
> unless the semantics are similar, but that's a bit more fragile).  This
> particularly matters given my statement above that you want a
> discriminated union, rather than a struct that contains unused fields,
> for handling different SREP chunk types.
>
> My review has to pause here for now...
>
>
Vladimir Sementsov-Ogievskiy Sept. 27, 2017, 12:32 p.m. UTC | #3
27.09.2017 13:05, Vladimir Sementsov-Ogievskiy wrote:
> 26.09.2017 01:19, Eric Blake wrote:

>> On 09/25/2017 08:58 AM, Vladimir Sementsov-Ogievskiy wrote:

>>> Minimal implementation: drop most of additional error information.

>>>

>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>

>>> ---

>>>   block/nbd-client.h  |   2 +

>>>   include/block/nbd.h |  15 ++++-

>>>   block/nbd-client.c  |  97 +++++++++++++++++++++++++-----

>>>   nbd/client.c        | 169 

>>> +++++++++++++++++++++++++++++++++++++++++++++++-----

>>>   4 files changed, 249 insertions(+), 34 deletions(-)

>>>

>>> +++ b/include/block/nbd.h

>>> @@ -57,11 +57,17 @@ struct NBDRequest {

>>>   };

>>>   typedef struct NBDRequest NBDRequest;

>>>   -struct NBDReply {

>>> +typedef struct NBDReply {

>>> +    bool simple;

>>>       uint64_t handle;

>>>       uint32_t error;

>>> -};

>>> -typedef struct NBDReply NBDReply;

>>> +

>>> +    uint16_t flags;

>>> +    uint16_t type;

>>> +    uint32_t tail_length;

>>> +    uint64_t offset;

>>> +    uint32_t hole_size;

>>> +} NBDReply;

>> This feels like it should be a discriminated union, rather than a struct

>> containing fields that are only sometimes valid...

>

> No:

>

> simple, handle and error are always valid

> flags, type, tail_length are valid for all structured replies

> offset and hole_size are valid for structured hole reply

>

> so, we have one case when all fields are valid.. how do you see it 

> with union, what is the real difference? I think it would be just a 

> complication.

>

>>

>>>     #define NBD_SREP_TYPE_NONE 0

>>>   #define NBD_SREP_TYPE_OFFSET_DATA   1

>>> +#define NBD_SREP_TYPE_OFFSET_HOLE   2

>>>   #define NBD_SREP_TYPE_ERROR         NBD_SREP_ERR(1)

>>> +#define NBD_SREP_TYPE_ERROR_OFFSET  NBD_SREP_ERR(2)

>> ...especially since there is more than one type of SREP packet layout.

>>

>> I also wonder why you are defining constants in a piecemeal fashion,

>> rather than all at once (even if your minimal server implementation

>> doesn't send a particular constant, there's no harm in defining them all

>> up front).

>

> hmm. just to not define unused constants. It doesn't matter, I can 

> define them all if you prefer.

>

>>

>>> +++ b/block/nbd-client.c

>>> @@ -179,9 +179,10 @@ err:

>>>       return rc;

>>>   }

>>>   -static int nbd_co_receive_reply(NBDClientSession *s,

>>> -                                uint64_t handle,

>>> -                                QEMUIOVector *qiov)

>>> +static int nbd_co_receive_1_reply_or_chunk(NBDClientSession *s,

>> Long name, and unusual to mix in "1" instead of "one".  Would it be

>> better to name it nbd_co_receive_chunk, where we declare that a simple

>> reply is (roughly) the same amount of effort as a chunk in a structured

>> reply?

>>

>>> + uint64_t handle,

>>> +                                           bool *cont,

>>> +                                           QEMUIOVector *qiov)

>>>   {

>> No documentation of the function?

>>

>>>       int ret;

>>>       int i = HANDLE_TO_INDEX(s, handle);

>>> @@ -191,29 +192,95 @@ static int 

>>> nbd_co_receive_reply(NBDClientSession *s,

>>>       qemu_coroutine_yield();

>>>       s->requests[i].receiving = false;

>>>       if (!s->ioc || s->quit) {

>>> -        ret = -EIO;

>>> -    } else {

>>> -        assert(s->reply.handle == handle);

>>> -        ret = -s->reply.error;

>>> -        if (qiov && s->reply.error == 0) {

>>> +        *cont = false;

>>> +        return -EIO;

>>> +    }

>>> +

>>> +    assert(s->reply.handle == handle);

>>> +    *cont = !(s->reply.simple || (s->reply.flags & 

>>> NBD_SREP_FLAG_DONE));

>> We need to validate that the server is not sending us SREP chunks unless

>> we negotiated them.  I'm thinking it might be better to do it here

>> (maybe you did it somewhere else, but I haven't seen it yet; I'm

>> reviewing the patch in textual order rather than the order in which

>> things are called).

>

> No, I didn't. Will add (may be early, in reply_entry).

>

>>

>>> +    ret = -s->reply.error;

>>> +    if (ret < 0) {

>>> +        goto out;

>>> +    }

>>> +

>>> +    if (s->reply.simple) {

>>> +        if (qiov) {

>>>               if (qio_channel_readv_all(s->ioc, qiov->iov, qiov->niov,

>>> -                                      NULL) < 0) {

>>> -                ret = -EIO;

>>> -                s->quit = true;

>>> +                                      NULL) < 0)

>>> +            {

>>> +                goto fatal;

>>>               }

>>>           }

>>> +        goto out;

>>> +    }

>>>   -        /* Tell the read handler to read another header. */

>>> -        s->reply.handle = 0;

>>> +    /* here we deal with successful structured reply */

>>> +    switch (s->reply.type) {

>>> +        QEMUIOVector sub_qiov;

>>> +    case NBD_SREP_TYPE_OFFSET_DATA:

>> This is putting a LOT of smarts directly into the receive routine.

>> Here's where I was previously wondering (and I think Paolo as well)

>> whether it might be better to split the efforts: the generic function

>> reads off the chunk information and any payload, but a per-command

>

> Hmm. it was my idea to move all reading into one coroutine (in my 

> refactoring series, but Paolo was against).

>

> Or you mean to read a payload as raw? It would lead to double copying 

> it to destination qiov, which I dislike..

>

>> callback function then parses the chunks.

>

> per-command? Then callback for CMD_READ would have all of these 

> "smarts", so the whole code would not be simpler.. (However it will 

> simplify adding block-status later).

>

>>    Especially since the

>> definition of the chunks differs on a per-command basis (yes, the NBD

>> spec will try to not reuse an SREP chunk type across multiple commands

>> unless the semantics are similar, but that's a bit more fragile).  This

>> particularly matters given my statement above that you want a

>> discriminated union, rather than a struct that contains unused fields,

>> for handling different SREP chunk types.

>>

>> My review has to pause here for now...

>>

>>

>

>


I'll post a proposal of extendable architecture for this today to have a 
more concrete discussion.


-- 
Best regards,
Vladimir
Paolo Bonzini Oct. 3, 2017, 10:07 a.m. UTC | #4
On 26/09/2017 00:19, Eric Blake wrote:
>> +    /* here we deal with successful structured reply */
>> +    switch (s->reply.type) {
>> +        QEMUIOVector sub_qiov;
>> +    case NBD_SREP_TYPE_OFFSET_DATA:
> This is putting a LOT of smarts directly into the receive routine.
> Here's where I was previously wondering (and I think Paolo as well)
> whether it might be better to split the efforts: the generic function
> reads off the chunk information and any payload, but a per-command
> callback function then parses the chunks.  Especially since the
> definition of the chunks differs on a per-command basis (yes, the NBD
> spec will try to not reuse an SREP chunk type across multiple commands
> unless the semantics are similar, but that's a bit more fragile).  This
> particularly matters given my statement above that you want a
> discriminated union, rather than a struct that contains unused fields,
> for handling different SREP chunk types.

I think there should be two kinds of replies: 1) read directly into a
QEMUIOVector, using structured replies only as an encapsulation of the
payload; 2) read a chunk at a time into malloc-ed memory, yielding back
to the calling coroutine after receiving one complete chunk.

In the end this probably means that you have a read_chunk_header
function and a read_chunk function.  READ has a loop that calls
read_chunk_header followed by direct reading into the QEMUIOVector,
while everyone else calls read_chunk.

Maybe qio_channel_readv/writev_full could have "offset" and "bytes"
arguments.  Most code in iov_send_recv could be cut-and-pasted.  (When
sheepdog is converted to QIOChannel, iov_send_recv can go away).

Paolo
Vladimir Sementsov-Ogievskiy Oct. 3, 2017, 12:26 p.m. UTC | #5
03.10.2017 13:07, Paolo Bonzini wrote:
> On 26/09/2017 00:19, Eric Blake wrote:
>>> +    /* here we deal with successful structured reply */
>>> +    switch (s->reply.type) {
>>> +        QEMUIOVector sub_qiov;
>>> +    case NBD_SREP_TYPE_OFFSET_DATA:
>> This is putting a LOT of smarts directly into the receive routine.
>> Here's where I was previously wondering (and I think Paolo as well)
>> whether it might be better to split the efforts: the generic function
>> reads off the chunk information and any payload, but a per-command
>> callback function then parses the chunks.  Especially since the
>> definition of the chunks differs on a per-command basis (yes, the NBD
>> spec will try to not reuse an SREP chunk type across multiple commands
>> unless the semantics are similar, but that's a bit more fragile).  This
>> particularly matters given my statement above that you want a
>> discriminated union, rather than a struct that contains unused fields,
>> for handling different SREP chunk types.
> I think there should be two kinds of replies: 1) read directly into a
> QEMUIOVector, using structured replies only as an encapsulation of the

who should read to qiov? reply_entry, or calling coroutine?.. 
reply_entry should anyway
parse reply, to understand should it read it all or read it to qiov (or 
yield back, and then
calling coroutine will read to qiov)..

> payload; 2) read a chunk at a time into malloc-ed memory, yielding back
> to the calling coroutine after receiving one complete chunk.
>
> In the end this probably means that you have a read_chunk_header
> function and a read_chunk function.  READ has a loop that calls
> read_chunk_header followed by direct reading into the QEMUIOVector,
> while everyone else calls read_chunk.
>
> Maybe qio_channel_readv/writev_full could have "offset" and "bytes"
> arguments.  Most code in iov_send_recv could be cut-and-pasted.  (When
> sheepdog is converted to QIOChannel, iov_send_recv can go away).
>
> Paolo
Vladimir Sementsov-Ogievskiy Oct. 3, 2017, 12:58 p.m. UTC | #6
03.10.2017 13:07, Paolo Bonzini wrote:
> On 26/09/2017 00:19, Eric Blake wrote:
>>> +    /* here we deal with successful structured reply */
>>> +    switch (s->reply.type) {
>>> +        QEMUIOVector sub_qiov;
>>> +    case NBD_SREP_TYPE_OFFSET_DATA:
>> This is putting a LOT of smarts directly into the receive routine.
>> Here's where I was previously wondering (and I think Paolo as well)
>> whether it might be better to split the efforts: the generic function
>> reads off the chunk information and any payload, but a per-command
>> callback function then parses the chunks.  Especially since the
>> definition of the chunks differs on a per-command basis (yes, the NBD
>> spec will try to not reuse an SREP chunk type across multiple commands
>> unless the semantics are similar, but that's a bit more fragile).  This
>> particularly matters given my statement above that you want a
>> discriminated union, rather than a struct that contains unused fields,
>> for handling different SREP chunk types.
> I think there should be two kinds of replies: 1) read directly into a
> QEMUIOVector, using structured replies only as an encapsulation of the
> payload; 2) read a chunk at a time into malloc-ed memory, yielding back
> to the calling coroutine after receiving one complete chunk.
>
> In the end this probably means that you have a read_chunk_header
> function and a read_chunk function.  READ has a loop that calls
> read_chunk_header followed by direct reading into the QEMUIOVector,
> while everyone else calls read_chunk.

accordingly to spec, we can receive several error reply chunks to any 
request,
so loop, receiving them should be common for all requests I think

>
> Maybe qio_channel_readv/writev_full could have "offset" and "bytes"
> arguments.  Most code in iov_send_recv could be cut-and-pasted.  (When
> sheepdog is converted to QIOChannel, iov_send_recv can go away).
>
> Paolo
Vladimir Sementsov-Ogievskiy Oct. 3, 2017, 1:35 p.m. UTC | #7
03.10.2017 15:58, Vladimir Sementsov-Ogievskiy wrote:
> 03.10.2017 13:07, Paolo Bonzini wrote:
>> On 26/09/2017 00:19, Eric Blake wrote:
>>>> +    /* here we deal with successful structured reply */
>>>> +    switch (s->reply.type) {
>>>> +        QEMUIOVector sub_qiov;
>>>> +    case NBD_SREP_TYPE_OFFSET_DATA:
>>> This is putting a LOT of smarts directly into the receive routine.
>>> Here's where I was previously wondering (and I think Paolo as well)
>>> whether it might be better to split the efforts: the generic function
>>> reads off the chunk information and any payload, but a per-command
>>> callback function then parses the chunks.  Especially since the
>>> definition of the chunks differs on a per-command basis (yes, the NBD
>>> spec will try to not reuse an SREP chunk type across multiple commands
>>> unless the semantics are similar, but that's a bit more fragile).  This
>>> particularly matters given my statement above that you want a
>>> discriminated union, rather than a struct that contains unused fields,
>>> for handling different SREP chunk types.
>> I think there should be two kinds of replies: 1) read directly into a
>> QEMUIOVector, using structured replies only as an encapsulation of the
>> payload; 2) read a chunk at a time into malloc-ed memory, yielding back
>> to the calling coroutine after receiving one complete chunk.
>>
>> In the end this probably means that you have a read_chunk_header
>> function and a read_chunk function.  READ has a loop that calls
>> read_chunk_header followed by direct reading into the QEMUIOVector,
>> while everyone else calls read_chunk.
>
> accordingly to spec, we can receive several error reply chunks to any 
> request,
> so loop, receiving them should be common for all requests I think

as well as handling error chunks should be common.. What do you think 
about my DRAFT proposal?


>
>>
>> Maybe qio_channel_readv/writev_full could have "offset" and "bytes"
>> arguments.  Most code in iov_send_recv could be cut-and-pasted. (When
>> sheepdog is converted to QIOChannel, iov_send_recv can go away).
>>
>> Paolo
>
>
Paolo Bonzini Oct. 3, 2017, 2:05 p.m. UTC | #8
On 03/10/2017 14:26, Vladimir Sementsov-Ogievskiy wrote:
> 03.10.2017 13:07, Paolo Bonzini wrote:
>> On 26/09/2017 00:19, Eric Blake wrote:
>>>> +    /* here we deal with successful structured reply */
>>>> +    switch (s->reply.type) {
>>>> +        QEMUIOVector sub_qiov;
>>>> +    case NBD_SREP_TYPE_OFFSET_DATA:
>>> This is putting a LOT of smarts directly into the receive routine.
>>> Here's where I was previously wondering (and I think Paolo as well)
>>> whether it might be better to split the efforts: the generic function
>>> reads off the chunk information and any payload, but a per-command
>>> callback function then parses the chunks.  Especially since the
>>> definition of the chunks differs on a per-command basis (yes, the NBD
>>> spec will try to not reuse an SREP chunk type across multiple commands
>>> unless the semantics are similar, but that's a bit more fragile).  This
>>> particularly matters given my statement above that you want a
>>> discriminated union, rather than a struct that contains unused fields,
>>> for handling different SREP chunk types.
>> I think there should be two kinds of replies: 1) read directly into a
>> QEMUIOVector, using structured replies only as an encapsulation of the
> 
> who should read to qiov? reply_entry, or calling coroutine?..
> reply_entry should anyway
> parse reply, to understand should it read it all or read it to qiov (or
> yield back, and then
> calling coroutine will read to qiov)..

The CMD_READ coroutine should---either directly or, in case you have a
structured reply, after reading each chunk header.

Paolo

>> payload; 2) read a chunk at a time into malloc-ed memory, yielding back
>> to the calling coroutine after receiving one complete chunk.
>>
>> In the end this probably means that you have a read_chunk_header
>> function and a read_chunk function.  READ has a loop that calls
>> read_chunk_header followed by direct reading into the QEMUIOVector,
>> while everyone else calls read_chunk.
>>
>> Maybe qio_channel_readv/writev_full could have "offset" and "bytes"
>> arguments.  Most code in iov_send_recv could be cut-and-pasted.  (When
>> sheepdog is converted to QIOChannel, iov_send_recv can go away).
>>
>> Paolo
> 
>
Paolo Bonzini Oct. 3, 2017, 2:06 p.m. UTC | #9
On 03/10/2017 15:35, Vladimir Sementsov-Ogievskiy wrote:
>>>
>>> In the end this probably means that you have a read_chunk_header
>>> function and a read_chunk function.  READ has a loop that calls
>>> read_chunk_header followed by direct reading into the QEMUIOVector,
>>> while everyone else calls read_chunk.
>>
>> accordingly to spec, we can receive several error reply chunks to any
>> request,
>> so loop, receiving them should be common for all requests I think
> 
> as well as handling error chunks should be common..

Yes, reading error chunks should be part of read_chunk_header.

Paolo
Vladimir Sementsov-Ogievskiy Oct. 5, 2017, 9:59 a.m. UTC | #10
03.10.2017 17:06, Paolo Bonzini wrote:
> On 03/10/2017 15:35, Vladimir Sementsov-Ogievskiy wrote:
>>>> In the end this probably means that you have a read_chunk_header
>>>> function and a read_chunk function.  READ has a loop that calls
>>>> read_chunk_header followed by direct reading into the QEMUIOVector,
>>>> while everyone else calls read_chunk.
>>> accordingly to spec, we can receive several error reply chunks to any
>>> request,
>>> so loop, receiving them should be common for all requests I think
>> as well as handling error chunks should be common..
> Yes, reading error chunks should be part of read_chunk_header.
>
> Paolo
Vladimir Sementsov-Ogievskiy Oct. 5, 2017, 10:02 a.m. UTC | #11
03.10.2017 17:06, Paolo Bonzini wrote:
> On 03/10/2017 15:35, Vladimir Sementsov-Ogievskiy wrote:
>>>> In the end this probably means that you have a read_chunk_header
>>>> function and a read_chunk function.  READ has a loop that calls
>>>> read_chunk_header followed by direct reading into the QEMUIOVector,
>>>> while everyone else calls read_chunk.
>>> accordingly to spec, we can receive several error reply chunks to any
>>> request,
>>> so loop, receiving them should be common for all requests I think
>> as well as handling error chunks should be common..
> Yes, reading error chunks should be part of read_chunk_header.
>
> Paolo

So, you want a loop in READ, and separate loop for other commands? Then 
we will have separate loop for BLOCK_STATUS and for all future commands 
with specific replies?
Paolo Bonzini Oct. 5, 2017, 10:36 a.m. UTC | #12
On 05/10/2017 12:02, Vladimir Sementsov-Ogievskiy wrote:
> 03.10.2017 17:06, Paolo Bonzini wrote:
>> On 03/10/2017 15:35, Vladimir Sementsov-Ogievskiy wrote:
>>>>> In the end this probably means that you have a read_chunk_header
>>>>> function and a read_chunk function.  READ has a loop that calls
>>>>> read_chunk_header followed by direct reading into the QEMUIOVector,
>>>>> while everyone else calls read_chunk.
>>>> accordingly to spec, we can receive several error reply chunks to any
>>>> request,
>>>> so loop, receiving them should be common for all requests I think
>>> as well as handling error chunks should be common..
>> Yes, reading error chunks should be part of read_chunk_header.
>>
>> Paolo
> 
> So, you want a loop in READ, and separate loop for other commands? Then
> we will have separate loop for BLOCK_STATUS and for all future commands
> with specific replies?

There should be a separate loop for each command.

The only difference between READ and other commands is that READ
receives directly in QEMUIOVector, while other commands can use a common
function to to receive each structured reply chunk into malloc-ed memory.

Paolo
Eric Blake Oct. 5, 2017, 10:12 p.m. UTC | #13
On 10/05/2017 05:36 AM, Paolo Bonzini wrote:
> On 05/10/2017 12:02, Vladimir Sementsov-Ogievskiy wrote:
>> 03.10.2017 17:06, Paolo Bonzini wrote:
>>> On 03/10/2017 15:35, Vladimir Sementsov-Ogievskiy wrote:
>>>>>> In the end this probably means that you have a read_chunk_header
>>>>>> function and a read_chunk function.  READ has a loop that calls
>>>>>> read_chunk_header followed by direct reading into the QEMUIOVector,
>>>>>> while everyone else calls read_chunk.
>>>>> accordingly to spec, we can receive several error reply chunks to any
>>>>> request,
>>>>> so loop, receiving them should be common for all requests I think
>>>> as well as handling error chunks should be common..
>>> Yes, reading error chunks should be part of read_chunk_header.
>>>
>>> Paolo
>>
>> So, you want a loop in READ, and separate loop for other commands? Then
>> we will have separate loop for BLOCK_STATUS and for all future commands
>> with specific replies?
> 
> There should be a separate loop for each command.
> 
> The only difference between READ and other commands is that READ
> receives directly in QEMUIOVector, while other commands can use a common
> function to to receive each structured reply chunk into malloc-ed memory.

To make sure we're on the same page, here's how I see it.  At a high
level, we have:

Each command calls nbd_co_send_request once, then calls
nbd_co_receive_reply in a loop until there is an indication of the last
packet.  nbd_co_receive_reply waits for data to come from the server,
and parses the header.

If the packet is unrecognized, report failure and request a quit
(negative return value)

If it is old-style:
- if the command is read, and we did not negotiate structured read, then
we also read the payload into qiov
- if the command is read, but we negotiated structured read, the server
is buggy, so report the bug and request a quit
- for all other commands, there is no payload, return success or failure
to the caller based on the header payload
- at any rate, the reply to the caller is that this is the final packet,
and there is no payload returned (so we return negative or 1, but never 0)

Otherwise, it is new-style:
- if we did not negotiate structured reply, the server is buggy, so
report the bug and request a quit (negative return)
- if the chunk is an error, we process the entire packet and report the
error; if we have any commands that care about the error details, we
could return the error in a malloc'd discriminated union, but we can
probably get by without that. If callers don't care about details, but
the error chunk is not final, it may be easier to just store the fact
that an error occurred and return 0 to tell the caller to keep looping,
and return the negative value later when the final chunk is finally received
- if the chunk is NBD_REPLY_TYPE_NONE, there is no payload, and this
should be the final chunk, so the return to the caller can be the same
as for old-style (return 1 if we had no earlier error packets, or the
saved negative value corresponding to the first error received)
- if the command is read, we can read the payload into qiov (saves
malloc'ing space for the reply only to copy it into the qiov), so we
don't have to return any data
- for any other command, we malloc space for the non-error payload, and
then it is up to the command's loop to process the payload

so the signature can be something like:

int nbd_co_receive_reply(NBDClientSession *s, QEMUIOVector *qiov,
                         void **payload)

where it returns -errno on failure, 0 if the command is not complete,
and 1 if the command is done.  READ passes qiov, which is fully
populated when the function returns 1; all other commands pass NULL.
Commands pass NULL for payload if they don't expect a payload return
(this includes READ); but a command that expects a payload
(BLOCK_STATUS) passes a pointer in payload and gets malloc'd space
stored there if return is 0 or 1.

Does that sound like we're on the right design track?
Vladimir Sementsov-Ogievskiy Oct. 6, 2017, 7:09 a.m. UTC | #14
06.10.2017 01:12, Eric Blake wrote:
> On 10/05/2017 05:36 AM, Paolo Bonzini wrote:
>> On 05/10/2017 12:02, Vladimir Sementsov-Ogievskiy wrote:
>>> 03.10.2017 17:06, Paolo Bonzini wrote:
>>>> On 03/10/2017 15:35, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>> In the end this probably means that you have a read_chunk_header
>>>>>>> function and a read_chunk function.  READ has a loop that calls
>>>>>>> read_chunk_header followed by direct reading into the QEMUIOVector,
>>>>>>> while everyone else calls read_chunk.
>>>>>> accordingly to spec, we can receive several error reply chunks to any
>>>>>> request,
>>>>>> so loop, receiving them should be common for all requests I think
>>>>> as well as handling error chunks should be common..
>>>> Yes, reading error chunks should be part of read_chunk_header.
>>>>
>>>> Paolo
>>> So, you want a loop in READ, and separate loop for other commands? Then
>>> we will have separate loop for BLOCK_STATUS and for all future commands
>>> with specific replies?
>> There should be a separate loop for each command.
>>
>> The only difference between READ and other commands is that READ
>> receives directly in QEMUIOVector, while other commands can use a common
>> function to to receive each structured reply chunk into malloc-ed memory.
> To make sure we're on the same page, here's how I see it.  At a high
> level, we have:
>
> Each command calls nbd_co_send_request once, then calls
> nbd_co_receive_reply in a loop until there is an indication of the last
> packet.  nbd_co_receive_reply waits for data to come from the server,
> and parses the header.
>
> If the packet is unrecognized, report failure and request a quit
> (negative return value)
>
> If it is old-style:
> - if the command is read, and we did not negotiate structured read, then
> we also read the payload into qiov
> - if the command is read, but we negotiated structured read, the server
> is buggy, so report the bug and request a quit
> - for all other commands, there is no payload, return success or failure
> to the caller based on the header payload
> - at any rate, the reply to the caller is that this is the final packet,
> and there is no payload returned (so we return negative or 1, but never 0)
>
> Otherwise, it is new-style:
> - if we did not negotiate structured reply, the server is buggy, so
> report the bug and request a quit (negative return)
> - if the chunk is an error, we process the entire packet and report the
> error; if we have any commands that care about the error details, we
> could return the error in a malloc'd discriminated union, but we can
> probably get by without that. If callers don't care about details, but
> the error chunk is not final, it may be easier to just store the fact
> that an error occurred and return 0 to tell the caller to keep looping,
> and return the negative value later when the final chunk is finally received
> - if the chunk is NBD_REPLY_TYPE_NONE, there is no payload, and this
> should be the final chunk, so the return to the caller can be the same
> as for old-style (return 1 if we had no earlier error packets, or the
> saved negative value corresponding to the first error received)
> - if the command is read, we can read the payload into qiov (saves
> malloc'ing space for the reply only to copy it into the qiov), so we
> don't have to return any data
> - for any other command, we malloc space for the non-error payload, and
> then it is up to the command's loop to process the payload
>
> so the signature can be something like:
>
> int nbd_co_receive_reply(NBDClientSession *s, QEMUIOVector *qiov,
>                           void **payload)
>
> where it returns -errno on failure, 0 if the command is not complete,
> and 1 if the command is done.  READ passes qiov, which is fully
> populated when the function returns 1; all other commands pass NULL.
> Commands pass NULL for payload if they don't expect a payload return
> (this includes READ); but a command that expects a payload
> (BLOCK_STATUS) passes a pointer in payload and gets malloc'd space
> stored there if return is 0 or 1.
>
> Does that sound like we're on the right design track?
>


Hmm. and what is the difference with my patch? Except payload? The only 
command with payload
now is READ (except errors), but you (and me in my patch) suggest to 
fill this qiov in nbd_co_receive_reply
(nbd_co_receive_1_reply_or_chunk in my patch), so payload will be unused 
for now, we can add it
later if it will be needed for BLOCK_STATUS.
Vladimir Sementsov-Ogievskiy Oct. 6, 2017, 7:23 a.m. UTC | #15
06.10.2017 10:09, Vladimir Sementsov-Ogievskiy wrote:
> 06.10.2017 01:12, Eric Blake wrote:
>> On 10/05/2017 05:36 AM, Paolo Bonzini wrote:
>>> On 05/10/2017 12:02, Vladimir Sementsov-Ogievskiy wrote:
>>>> 03.10.2017 17:06, Paolo Bonzini wrote:
>>>>> On 03/10/2017 15:35, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>>> In the end this probably means that you have a read_chunk_header
>>>>>>>> function and a read_chunk function.  READ has a loop that calls
>>>>>>>> read_chunk_header followed by direct reading into the 
>>>>>>>> QEMUIOVector,
>>>>>>>> while everyone else calls read_chunk.
>>>>>>> accordingly to spec, we can receive several error reply chunks 
>>>>>>> to any
>>>>>>> request,
>>>>>>> so loop, receiving them should be common for all requests I think
>>>>>> as well as handling error chunks should be common..
>>>>> Yes, reading error chunks should be part of read_chunk_header.
>>>>>
>>>>> Paolo
>>>> So, you want a loop in READ, and separate loop for other commands? 
>>>> Then
>>>> we will have separate loop for BLOCK_STATUS and for all future 
>>>> commands
>>>> with specific replies?
>>> There should be a separate loop for each command.
>>>
>>> The only difference between READ and other commands is that READ
>>> receives directly in QEMUIOVector, while other commands can use a 
>>> common
>>> function to to receive each structured reply chunk into malloc-ed 
>>> memory.
>> To make sure we're on the same page, here's how I see it.  At a high
>> level, we have:
>>
>> Each command calls nbd_co_send_request once, then calls
>> nbd_co_receive_reply in a loop until there is an indication of the last
>> packet.  nbd_co_receive_reply waits for data to come from the server,
>> and parses the header.
>>
>> If the packet is unrecognized, report failure and request a quit
>> (negative return value)
>>
>> If it is old-style:
>> - if the command is read, and we did not negotiate structured read, then
>> we also read the payload into qiov
>> - if the command is read, but we negotiated structured read, the server
>> is buggy, so report the bug and request a quit
>> - for all other commands, there is no payload, return success or failure
>> to the caller based on the header payload
>> - at any rate, the reply to the caller is that this is the final packet,
>> and there is no payload returned (so we return negative or 1, but 
>> never 0)
>>
>> Otherwise, it is new-style:
>> - if we did not negotiate structured reply, the server is buggy, so
>> report the bug and request a quit (negative return)
>> - if the chunk is an error, we process the entire packet and report the
>> error; if we have any commands that care about the error details, we
>> could return the error in a malloc'd discriminated union, but we can
>> probably get by without that. If callers don't care about details, but
>> the error chunk is not final, it may be easier to just store the fact
>> that an error occurred and return 0 to tell the caller to keep looping,
>> and return the negative value later when the final chunk is finally 
>> received
>> - if the chunk is NBD_REPLY_TYPE_NONE, there is no payload, and this
>> should be the final chunk, so the return to the caller can be the same
>> as for old-style (return 1 if we had no earlier error packets, or the
>> saved negative value corresponding to the first error received)
>> - if the command is read, we can read the payload into qiov (saves
>> malloc'ing space for the reply only to copy it into the qiov), so we
>> don't have to return any data
>> - for any other command, we malloc space for the non-error payload, and
>> then it is up to the command's loop to process the payload
>>
>> so the signature can be something like:
>>
>> int nbd_co_receive_reply(NBDClientSession *s, QEMUIOVector *qiov,
>>                           void **payload)
>>
>> where it returns -errno on failure, 0 if the command is not complete,
>> and 1 if the command is done.  READ passes qiov, which is fully
>> populated when the function returns 1; all other commands pass NULL.
>> Commands pass NULL for payload if they don't expect a payload return
>> (this includes READ); but a command that expects a payload
>> (BLOCK_STATUS) passes a pointer in payload and gets malloc'd space
>> stored there if return is 0 or 1.
>>
>> Does that sound like we're on the right design track?
>>
>
>
> Hmm. and what is the difference with my patch? Except payload? The 
> only command with payload
> now is READ (except errors), but you (and me in my patch) suggest to 
> fill this qiov in nbd_co_receive_reply
> (nbd_co_receive_1_reply_or_chunk in my patch), so payload will be 
> unused for now, we can add it
> later if it will be needed for BLOCK_STATUS.
>

Ahm, sorry, I've already forget my original patch, which reads most of 
payload in nbd/client.c code called from reply_entry.. So, ok for me, I 
think I'll post a new version today.
Vladimir Sementsov-Ogievskiy Oct. 6, 2017, 7:34 a.m. UTC | #16
06.10.2017 01:12, Eric Blake wrote:
> On 10/05/2017 05:36 AM, Paolo Bonzini wrote:
>> On 05/10/2017 12:02, Vladimir Sementsov-Ogievskiy wrote:
>>> 03.10.2017 17:06, Paolo Bonzini wrote:
>>>> On 03/10/2017 15:35, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>> In the end this probably means that you have a read_chunk_header
>>>>>>> function and a read_chunk function.  READ has a loop that calls
>>>>>>> read_chunk_header followed by direct reading into the QEMUIOVector,
>>>>>>> while everyone else calls read_chunk.
>>>>>> accordingly to spec, we can receive several error reply chunks to any
>>>>>> request,
>>>>>> so loop, receiving them should be common for all requests I think
>>>>> as well as handling error chunks should be common..
>>>> Yes, reading error chunks should be part of read_chunk_header.
>>>>
>>>> Paolo
>>> So, you want a loop in READ, and separate loop for other commands? Then
>>> we will have separate loop for BLOCK_STATUS and for all future commands
>>> with specific replies?
>> There should be a separate loop for each command.
>>
>> The only difference between READ and other commands is that READ
>> receives directly in QEMUIOVector, while other commands can use a common
>> function to to receive each structured reply chunk into malloc-ed memory.
> To make sure we're on the same page, here's how I see it.  At a high
> level, we have:
>
> Each command calls nbd_co_send_request once, then calls
> nbd_co_receive_reply in a loop until there is an indication of the last
> packet.  nbd_co_receive_reply waits for data to come from the server,
> and parses the header.
>
> If the packet is unrecognized, report failure and request a quit
> (negative return value)
>
> If it is old-style:
> - if the command is read, and we did not negotiate structured read, then
> we also read the payload into qiov
> - if the command is read, but we negotiated structured read, the server
> is buggy, so report the bug and request a quit
> - for all other commands, there is no payload, return success or failure
> to the caller based on the header payload
> - at any rate, the reply to the caller is that this is the final packet,
> and there is no payload returned (so we return negative or 1, but never 0)
>
> Otherwise, it is new-style:
> - if we did not negotiate structured reply, the server is buggy, so
> report the bug and request a quit (negative return)
> - if the chunk is an error, we process the entire packet and report the
> error; if we have any commands that care about the error details, we
> could return the error in a malloc'd discriminated union, but we can
> probably get by without that. If callers don't care about details, but
> the error chunk is not final, it may be easier to just store the fact
> that an error occurred and return 0 to tell the caller to keep looping,
> and return the negative value later when the final chunk is finally received
> - if the chunk is NBD_REPLY_TYPE_NONE, there is no payload, and this
> should be the final chunk, so the return to the caller can be the same
> as for old-style (return 1 if we had no earlier error packets, or the
> saved negative value corresponding to the first error received)
> - if the command is read, we can read the payload into qiov (saves
> malloc'ing space for the reply only to copy it into the qiov), so we

but here you said "This is putting a LOT of smarts directly into the 
receive routine"

> don't have to return any data
> - for any other command, we malloc space for the non-error payload, and
> then it is up to the command's loop to process the payload
>
> so the signature can be something like:
>
> int nbd_co_receive_reply(NBDClientSession *s, QEMUIOVector *qiov,
>                           void **payload)
>
> where it returns -errno on failure, 0 if the command is not complete,
> and 1 if the command is done.  READ passes qiov, which is fully
> populated when the function returns 1; all other commands pass NULL.
> Commands pass NULL for payload if they don't expect a payload return
> (this includes READ); but a command that expects a payload
> (BLOCK_STATUS) passes a pointer in payload and gets malloc'd space
> stored there if return is 0 or 1.
>
> Does that sound like we're on the right design track?
>
Eric Blake Oct. 6, 2017, 1:44 p.m. UTC | #17
On 10/06/2017 02:34 AM, Vladimir Sementsov-Ogievskiy wrote:

>> - if the chunk is NBD_REPLY_TYPE_NONE, there is no payload, and this
>> should be the final chunk, so the return to the caller can be the same
>> as for old-style (return 1 if we had no earlier error packets, or the
>> saved negative value corresponding to the first error received)
>> - if the command is read, we can read the payload into qiov (saves
>> malloc'ing space for the reply only to copy it into the qiov), so we
> 
> but here you said "This is putting a LOT of smarts directly into the
> receive routine"

About the only reason to justify putting smarts into the receive routine
is if it is the most common case, where the overhead of not
special-casing will penalize us.  READ happens to be a frequent command,
all other commands, like BLOCK_STATUS, will probably be infrequent.
Making READ malloc the result, only to then copy it into the qiov, is a
waste of time; no other structured command will pass a qiov.  So yeah,
maybe I'm stating things a bit differently than in my earlier messages,
but that's because I've now stated reasonings for why it is okay to
special-case READ with a qiov differently from all other commands (none
of which will pass a qiov to the receive routine).

Thanks for persisting with this, and I'm looking forward to the next
revision that you post; hopefully, by taking this discussion, we are
making sure that the design is solid.
diff mbox series

Patch

diff --git a/block/nbd-client.h b/block/nbd-client.h
index b435754b82..9e178de510 100644
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -35,6 +35,8 @@  typedef struct NBDClientSession {
     NBDClientRequest requests[MAX_NBD_REQUESTS];
     NBDReply reply;
     bool quit;
+
+    bool structured_reply;
 } NBDClientSession;
 
 NBDClientSession *nbd_get_client_session(BlockDriverState *bs);
diff --git a/include/block/nbd.h b/include/block/nbd.h
index 314f2f9bbc..7604e80c49 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -57,11 +57,17 @@  struct NBDRequest {
 };
 typedef struct NBDRequest NBDRequest;
 
-struct NBDReply {
+typedef struct NBDReply {
+    bool simple;
     uint64_t handle;
     uint32_t error;
-};
-typedef struct NBDReply NBDReply;
+
+    uint16_t flags;
+    uint16_t type;
+    uint32_t tail_length;
+    uint64_t offset;
+    uint32_t hole_size;
+} NBDReply;
 
 typedef struct NBDSimpleReply {
     uint32_t magic;  /* NBD_SIMPLE_REPLY_MAGIC */
@@ -178,12 +184,15 @@  enum {
 
 #define NBD_SREP_TYPE_NONE          0
 #define NBD_SREP_TYPE_OFFSET_DATA   1
+#define NBD_SREP_TYPE_OFFSET_HOLE   2
 #define NBD_SREP_TYPE_ERROR         NBD_SREP_ERR(1)
+#define NBD_SREP_TYPE_ERROR_OFFSET  NBD_SREP_ERR(2)
 
 /* Details collected by NBD_OPT_EXPORT_NAME and NBD_OPT_GO */
 struct NBDExportInfo {
     /* Set by client before nbd_receive_negotiate() */
     bool request_sizes;
+    bool structured_reply;
     /* Set by server results during nbd_receive_negotiate() */
     uint64_t size;
     uint16_t flags;
diff --git a/block/nbd-client.c b/block/nbd-client.c
index e4f0c789f4..bdf9299bb9 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -179,9 +179,10 @@  err:
     return rc;
 }
 
-static int nbd_co_receive_reply(NBDClientSession *s,
-                                uint64_t handle,
-                                QEMUIOVector *qiov)
+static int nbd_co_receive_1_reply_or_chunk(NBDClientSession *s,
+                                           uint64_t handle,
+                                           bool *cont,
+                                           QEMUIOVector *qiov)
 {
     int ret;
     int i = HANDLE_TO_INDEX(s, handle);
@@ -191,29 +192,95 @@  static int nbd_co_receive_reply(NBDClientSession *s,
     qemu_coroutine_yield();
     s->requests[i].receiving = false;
     if (!s->ioc || s->quit) {
-        ret = -EIO;
-    } else {
-        assert(s->reply.handle == handle);
-        ret = -s->reply.error;
-        if (qiov && s->reply.error == 0) {
+        *cont = false;
+        return -EIO;
+    }
+
+    assert(s->reply.handle == handle);
+    *cont = !(s->reply.simple || (s->reply.flags & NBD_SREP_FLAG_DONE));
+    ret = -s->reply.error;
+    if (ret < 0) {
+        goto out;
+    }
+
+    if (s->reply.simple) {
+        if (qiov) {
             if (qio_channel_readv_all(s->ioc, qiov->iov, qiov->niov,
-                                      NULL) < 0) {
-                ret = -EIO;
-                s->quit = true;
+                                      NULL) < 0)
+            {
+                goto fatal;
             }
         }
+        goto out;
+    }
 
-        /* Tell the read handler to read another header.  */
-        s->reply.handle = 0;
+    /* here we deal with successful structured reply */
+    switch (s->reply.type) {
+        QEMUIOVector sub_qiov;
+    case NBD_SREP_TYPE_OFFSET_DATA:
+        if (!qiov || s->reply.offset + s->reply.tail_length > qiov->size) {
+            goto fatal;
+        }
+        qemu_iovec_init(&sub_qiov, qiov->niov);
+        qemu_iovec_concat(&sub_qiov, qiov, s->reply.offset,
+                          s->reply.tail_length);
+        ret = qio_channel_readv_all(s->ioc, sub_qiov.iov, sub_qiov.niov, NULL);
+        qemu_iovec_destroy(&sub_qiov);
+        if (ret < 0) {
+            goto fatal;
+        }
+        assert(ret == 0);
+        break;
+    case NBD_SREP_TYPE_OFFSET_HOLE:
+        if (!qiov || s->reply.offset + s->reply.hole_size > qiov->size) {
+            goto fatal;
+        }
+        qemu_iovec_memset(qiov, s->reply.offset, 0, s->reply.hole_size);
+        break;
+    case NBD_SREP_TYPE_NONE:
+        break;
+    default:
+        goto fatal;
     }
 
-    s->requests[i].coroutine = NULL;
+out:
+    /* For assert at loop start in nbd_read_reply_entry */
+    s->reply.handle = 0;
+
+    if (s->read_reply_co) {
+        aio_co_wake(s->read_reply_co);
+    }
+
+    return ret;
 
-    /* Kick the read_reply_co to get the next reply.  */
+fatal:
+    /* protocol or ioc failure */
+    *cont = false;
+    s->quit = true;
     if (s->read_reply_co) {
         aio_co_wake(s->read_reply_co);
     }
 
+    return -EIO;
+}
+
+static int nbd_co_receive_reply(NBDClientSession *s,
+                                uint64_t handle,
+                                QEMUIOVector *qiov)
+{
+    int ret = 0;
+    int i = HANDLE_TO_INDEX(s, handle);
+    bool cont = true;
+
+    while (cont) {
+        int rc = nbd_co_receive_1_reply_or_chunk(s, handle, &cont, qiov);
+        if (rc < 0 && ret == 0) {
+            ret = rc;
+        }
+    }
+
+    s->requests[i].coroutine = NULL;
+
     qemu_co_mutex_lock(&s->send_mutex);
     s->in_flight--;
     qemu_co_queue_next(&s->free_sema);
diff --git a/nbd/client.c b/nbd/client.c
index 51ae492e92..880eb17b85 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -719,6 +719,13 @@  int nbd_receive_negotiate(QIOChannel *ioc, const char *name,
         if (fixedNewStyle) {
             int result;
 
+            result = nbd_request_simple_option(ioc, NBD_OPT_STRUCTURED_REPLY,
+                                               errp);
+            if (result < 0) {
+                goto fail;
+            }
+            info->structured_reply = result > 0;
+
             /* Try NBD_OPT_GO first - if it works, we are done (it
              * also gives us a good message if the server requires
              * TLS).  If it is not available, fall back to
@@ -759,6 +766,12 @@  int nbd_receive_negotiate(QIOChannel *ioc, const char *name,
             goto fail;
         }
         be16_to_cpus(&info->flags);
+
+        if (info->structured_reply && !(info->flags & NBD_CMD_FLAG_DF)) {
+            error_setg(errp, "Structured reply is negotiated, "
+                             "but DF flag is not.");
+            goto fail;
+        }
     } else if (magic == NBD_CLIENT_MAGIC) {
         uint32_t oldflags;
 
@@ -942,6 +955,128 @@  int nbd_send_request(QIOChannel *ioc, NBDRequest *request)
     return nbd_write(ioc, buf, sizeof(buf), NULL);
 }
 
+/* nbd_receive_simple_reply
+ * Read simple reply except magic field (which should be already read)
+ */
+static int nbd_receive_simple_reply(QIOChannel *ioc, NBDReply *reply,
+                                    Error **errp)
+{
+    NBDSimpleReply simple_reply;
+    int ret;
+
+    ret = nbd_read(ioc, (uint8_t *)&simple_reply + sizeof(simple_reply.magic),
+                   sizeof(simple_reply) - sizeof(simple_reply.magic), errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    reply->error = be32_to_cpu(simple_reply.error);
+    reply->handle = be64_to_cpu(simple_reply.handle);
+
+    return 0;
+}
+
+/* nbd_receive_structured_reply_chunk
+ * Read structured reply chunk except magic field (which should be already read)
+ * Data for NBD_SREP_TYPE_OFFSET_DATA is not read too.
+ * tail_length field of reply out parameter corresponds to unread part of reply.
+ */
+static int nbd_receive_structured_reply_chunk(QIOChannel *ioc, NBDReply *reply,
+                                              Error **errp)
+{
+    NBDStructuredReplyChunk chunk;
+    ssize_t ret;
+    uint16_t message_size;
+
+    ret = nbd_read(ioc, (uint8_t *)&chunk + sizeof(chunk.magic),
+                          sizeof(chunk) - sizeof(chunk.magic), errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    reply->flags = be16_to_cpu(chunk.flags);
+    reply->type = be16_to_cpu(chunk.type);
+    reply->handle = be64_to_cpu(chunk.handle);
+    reply->tail_length = be32_to_cpu(chunk.length);
+
+    switch (reply->type) {
+    case NBD_SREP_TYPE_NONE:
+        break;
+    case NBD_SREP_TYPE_OFFSET_DATA:
+        if (reply->tail_length < sizeof(reply->offset)) {
+            return -EIO;
+        }
+        ret = nbd_read(ioc, &reply->offset, sizeof(reply->offset), errp);
+        if (ret < 0) {
+            return ret;
+        }
+        be64_to_cpus(&reply->offset);
+        reply->tail_length -= sizeof(reply->offset);
+
+        break;
+    case NBD_SREP_TYPE_OFFSET_HOLE:
+        ret = nbd_read(ioc, &reply->offset, sizeof(reply->offset), errp);
+        if (ret < 0) {
+            return ret;
+        }
+        be64_to_cpus(&reply->offset);
+
+        ret = nbd_read(ioc, &reply->hole_size, sizeof(reply->hole_size), errp);
+        if (ret < 0) {
+            return ret;
+        }
+        be32_to_cpus(&reply->hole_size);
+
+        break;
+    case NBD_SREP_TYPE_ERROR:
+    case NBD_SREP_TYPE_ERROR_OFFSET:
+        ret = nbd_read(ioc, &reply->error, sizeof(reply->error), errp);
+        if (ret < 0) {
+            return ret;
+        }
+        be32_to_cpus(&reply->error);
+
+        ret = nbd_read(ioc, &message_size, sizeof(message_size), errp);
+        if (ret < 0) {
+            return ret;
+        }
+        be16_to_cpus(&message_size);
+
+        if (message_size > 0) {
+            /* TODO: provide error message to user */
+            ret = nbd_drop(ioc, message_size, errp);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+
+        if (reply->type == NBD_SREP_TYPE_ERROR_OFFSET) {
+            /* drop 64bit offset */
+            ret = nbd_drop(ioc, 8, errp);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+        break;
+    default:
+        if (reply->type & (1 << 15)) {
+            /* unknown error */
+            ret = nbd_drop(ioc, reply->tail_length, errp);
+            if (ret < 0) {
+                return ret;
+            }
+
+            reply->error = NBD_EINVAL;
+            reply->tail_length = 0;
+        } else {
+            /* unknown non-error reply type */
+            return -EINVAL;
+        }
+    }
+
+    return 0;
+}
+
 /* nbd_receive_reply
  * Returns 1 on success
  *         0 on eof, when no data was read (errp is not set)
@@ -949,24 +1084,32 @@  int nbd_send_request(QIOChannel *ioc, NBDRequest *request)
  */
 int nbd_receive_reply(QIOChannel *ioc, NBDReply *reply, Error **errp)
 {
-    uint8_t buf[NBD_REPLY_SIZE];
     uint32_t magic;
     int ret;
 
-    ret = nbd_read_eof(ioc, buf, sizeof(buf), errp);
+    ret = nbd_read_eof(ioc, &magic, sizeof(magic), errp);
     if (ret <= 0) {
         return ret;
     }
 
-    /* Reply
-       [ 0 ..  3]    magic   (NBD_SIMPLE_REPLY_MAGIC)
-       [ 4 ..  7]    error   (0 == no error)
-       [ 7 .. 15]    handle
-     */
+    be32_to_cpus(&magic);
 
-    magic = ldl_be_p(buf);
-    reply->error  = ldl_be_p(buf + 4);
-    reply->handle = ldq_be_p(buf + 8);
+    switch (magic) {
+    case NBD_SIMPLE_REPLY_MAGIC:
+        reply->simple = true;
+        ret = nbd_receive_simple_reply(ioc, reply, errp);
+        break;
+    case NBD_STRUCTURED_REPLY_MAGIC:
+        reply->simple = false;
+        ret = nbd_receive_structured_reply_chunk(ioc, reply, errp);
+        break;
+    default:
+        error_setg(errp, "invalid magic (got 0x%" PRIx32 ")", magic);
+        return -EINVAL;
+    }
+    if (ret < 0) {
+        return ret;
+    }
 
     reply->error = nbd_errno_to_system_errno(reply->error);
 
@@ -977,11 +1120,5 @@  int nbd_receive_reply(QIOChannel *ioc, NBDReply *reply, Error **errp)
     }
     trace_nbd_receive_reply(magic, reply->error, reply->handle);
 
-    if (magic != NBD_SIMPLE_REPLY_MAGIC) {
-        error_setg(errp, "invalid magic (got 0x%" PRIx32 ")", magic);
-        return -EINVAL;
-    }
-
     return 1;
 }
-