
[v4,for-4.0,4/7] libvhost-user: Support tracking inflight I/O in shared memory

Message ID 20190109112728.9214-5-xieyongji@baidu.com
State New
Series vhost-user-blk: Add support for backend reconnecting

Commit Message

Yongji Xie Jan. 9, 2019, 11:27 a.m. UTC
From: Xie Yongji <xieyongji@baidu.com>

This patch adds support for the VHOST_USER_GET_INFLIGHT_FD and
VHOST_USER_SET_INFLIGHT_FD messages to set/get shared memory
to/from QEMU. We then maintain a "bitmap" of all descriptors in
the shared memory for each queue to track inflight I/O.

Signed-off-by: Xie Yongji <xieyongji@baidu.com>
Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
---
 Makefile                              |   2 +-
 contrib/libvhost-user/libvhost-user.c | 258 ++++++++++++++++++++++++--
 contrib/libvhost-user/libvhost-user.h |  29 +++
 3 files changed, 268 insertions(+), 21 deletions(-)

Comments

Jason Wang Jan. 11, 2019, 3:56 a.m. UTC | #1
On 2019/1/9 7:27 PM, elohimes@gmail.com wrote:
> From: Xie Yongji <xieyongji@baidu.com>
>
> This patch adds support for VHOST_USER_GET_INFLIGHT_FD and
> VHOST_USER_SET_INFLIGHT_FD message to set/get shared memory
> to/from qemu. Then we maintain a "bitmap" of all descriptors in
> the shared memory for each queue to track inflight I/O.
>
> Signed-off-by: Xie Yongji <xieyongji@baidu.com>
> Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
> ---
>   Makefile                              |   2 +-
>   contrib/libvhost-user/libvhost-user.c | 258 ++++++++++++++++++++++++--
>   contrib/libvhost-user/libvhost-user.h |  29 +++
>   3 files changed, 268 insertions(+), 21 deletions(-)
>
> diff --git a/Makefile b/Makefile
> index dd53965f77..b5c9092605 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -473,7 +473,7 @@ Makefile: $(version-obj-y)
>   # Build libraries
>   
>   libqemuutil.a: $(util-obj-y) $(trace-obj-y) $(stub-obj-y)
> -libvhost-user.a: $(libvhost-user-obj-y)
> +libvhost-user.a: $(libvhost-user-obj-y) $(util-obj-y) $(stub-obj-y)
>   
>   ######################################################################
>   
> diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
> index 23bd52264c..e73ce04619 100644
> --- a/contrib/libvhost-user/libvhost-user.c
> +++ b/contrib/libvhost-user/libvhost-user.c
> @@ -41,6 +41,8 @@
>   #endif
>   
>   #include "qemu/atomic.h"
> +#include "qemu/osdep.h"
> +#include "qemu/memfd.h"
>   
>   #include "libvhost-user.h"
>   
> @@ -53,6 +55,18 @@
>               _min1 < _min2 ? _min1 : _min2; })
>   #endif
>   
> +/* Round number down to multiple */
> +#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> +
> +/* Round number up to multiple */
> +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> +
> +/* Align each region to cache line size in inflight buffer */
> +#define INFLIGHT_ALIGNMENT 64
> +
> +/* The version of inflight buffer */
> +#define INFLIGHT_VERSION 1
> +
>   #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
>   
>   /* The version of the protocol we support */
> @@ -66,6 +80,20 @@
>           }                                       \
>       } while (0)
>   
> +static inline
> +bool has_feature(uint64_t features, unsigned int fbit)
> +{
> +    assert(fbit < 64);
> +    return !!(features & (1ULL << fbit));
> +}
> +
> +static inline
> +bool vu_has_feature(VuDev *dev,
> +                    unsigned int fbit)
> +{
> +    return has_feature(dev->features, fbit);
> +}
> +
>   static const char *
>   vu_request_to_string(unsigned int req)
>   {
> @@ -100,6 +128,8 @@ vu_request_to_string(unsigned int req)
>           REQ(VHOST_USER_POSTCOPY_ADVISE),
>           REQ(VHOST_USER_POSTCOPY_LISTEN),
>           REQ(VHOST_USER_POSTCOPY_END),
> +        REQ(VHOST_USER_GET_INFLIGHT_FD),
> +        REQ(VHOST_USER_SET_INFLIGHT_FD),
>           REQ(VHOST_USER_MAX),
>       };
>   #undef REQ
> @@ -890,6 +920,41 @@ vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
>       return true;
>   }
>   
> +static int
> +vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
> +{
> +    int i = 0;
> +
> +    if (!has_feature(dev->protocol_features,
> +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> +        return 0;
> +    }
> +
> +    if (unlikely(!vq->inflight)) {
> +        return -1;
> +    }
> +
> +    vq->used_idx = vq->vring.used->idx;
> +    vq->inflight_num = 0;
> +    for (i = 0; i < vq->vring.num; i++) {
> +        if (vq->inflight->desc[i] == 0) {
> +            continue;
> +        }
> +
> +        vq->inflight_desc[vq->inflight_num++] = i;
> +        vq->inuse++;
> +    }
> +    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
> +
> +    /* in case of I/O hang after reconnecting */
> +    if (eventfd_write(vq->kick_fd, 1) ||
> +        eventfd_write(vq->call_fd, 1)) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
>   static bool
>   vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
>   {
> @@ -925,6 +990,10 @@ vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
>                  dev->vq[index].kick_fd, index);
>       }
>   
> +    if (vu_check_queue_inflights(dev, &dev->vq[index])) {
> +        vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
> +    }
> +
>       return false;
>   }
>   
> @@ -1215,6 +1284,117 @@ vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
>       return true;
>   }
>   
> +static bool
> +vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> +{
> +    int fd;
> +    void *addr;
> +    uint64_t mmap_size;
> +
> +    if (vmsg->size != sizeof(vmsg->payload.inflight)) {
> +        vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
> +        vmsg->payload.inflight.mmap_size = 0;
> +        return true;
> +    }
> +
> +    DPRINT("set_inflight_fd num_queues: %"PRId16"\n",
> +           vmsg->payload.inflight.num_queues);
> +
> +    mmap_size = vmsg->payload.inflight.num_queues *
> +                ALIGN_UP(sizeof(VuVirtqInflight), INFLIGHT_ALIGNMENT);
> +
> +    addr = qemu_memfd_alloc("vhost-inflight", mmap_size,
> +                            F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
> +                            &fd, NULL);
> +
> +    if (!addr) {
> +        vu_panic(dev, "Failed to alloc vhost inflight area");
> +        vmsg->payload.inflight.mmap_size = 0;
> +        return true;
> +    }
> +
> +    dev->inflight_info.addr = addr;
> +    dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
> +    vmsg->payload.inflight.mmap_offset = 0;
> +    vmsg->payload.inflight.align = INFLIGHT_ALIGNMENT;
> +    vmsg->payload.inflight.version = INFLIGHT_VERSION;
> +    vmsg->fd_num = 1;
> +    dev->inflight_info.fd = vmsg->fds[0] = fd;
> +
> +    DPRINT("send inflight mmap_size: %"PRId64"\n",
> +           vmsg->payload.inflight.mmap_size);
> +    DPRINT("send inflight mmap offset: %"PRId64"\n",
> +           vmsg->payload.inflight.mmap_offset);
> +    DPRINT("send inflight align: %"PRId32"\n",
> +           vmsg->payload.inflight.align);
> +    DPRINT("send inflight version: %"PRId16"\n",
> +           vmsg->payload.inflight.version);
> +
> +    return true;
> +}
> +
> +static bool
> +vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
> +{
> +    int fd, i;
> +    uint64_t mmap_size, mmap_offset;
> +    uint32_t align;
> +    uint16_t num_queues, version;
> +    void *rc;
> +
> +    if (vmsg->fd_num != 1 ||
> +        vmsg->size != sizeof(vmsg->payload.inflight)) {
> +        vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
> +                 vmsg->size, vmsg->fd_num);
> +        return false;
> +    }
> +
> +    fd = vmsg->fds[0];
> +    mmap_size = vmsg->payload.inflight.mmap_size;
> +    mmap_offset = vmsg->payload.inflight.mmap_offset;
> +    align = vmsg->payload.inflight.align;
> +    num_queues = vmsg->payload.inflight.num_queues;
> +    version = vmsg->payload.inflight.version;
> +
> +    DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
> +    DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
> +    DPRINT("set_inflight_fd align: %"PRId32"\n", align);
> +    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
> +    DPRINT("set_inflight_fd version: %"PRId16"\n", version);
> +
> +    rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> +              fd, mmap_offset);
> +
> +    if (rc == MAP_FAILED) {
> +        vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
> +        return false;
> +    }
> +
> +    if (version != INFLIGHT_VERSION) {
> +        vu_panic(dev, "Invalid set_inflight_fd version: %d", version);
> +        return false;
> +    }
> +
> +    if (dev->inflight_info.fd) {
> +        close(dev->inflight_info.fd);
> +    }
> +
> +    if (dev->inflight_info.addr) {
> +        munmap(dev->inflight_info.addr, dev->inflight_info.size);
> +    }
> +
> +    dev->inflight_info.fd = fd;
> +    dev->inflight_info.addr = rc;
> +    dev->inflight_info.size = mmap_size;
> +
> +    for (i = 0; i < num_queues; i++) {
> +        dev->vq[i].inflight = (VuVirtqInflight *)rc;
> +        rc = (void *)((char *)rc + ALIGN_UP(sizeof(VuVirtqInflight), align));
> +    }
> +
> +    return false;
> +}
> +
>   static bool
>   vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
>   {
> @@ -1292,6 +1472,10 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
>           return vu_set_postcopy_listen(dev, vmsg);
>       case VHOST_USER_POSTCOPY_END:
>           return vu_set_postcopy_end(dev, vmsg);
> +    case VHOST_USER_GET_INFLIGHT_FD:
> +        return vu_get_inflight_fd(dev, vmsg);
> +    case VHOST_USER_SET_INFLIGHT_FD:
> +        return vu_set_inflight_fd(dev, vmsg);
>       default:
>           vmsg_close_fds(vmsg);
>           vu_panic(dev, "Unhandled request: %d", vmsg->request);
> @@ -1359,8 +1543,18 @@ vu_deinit(VuDev *dev)
>               close(vq->err_fd);
>               vq->err_fd = -1;
>           }
> +        vq->inflight = NULL;
>       }
>   
> +    if (dev->inflight_info.addr) {
> +        munmap(dev->inflight_info.addr, dev->inflight_info.size);
> +        dev->inflight_info.addr = NULL;
> +    }
> +
> +    if (dev->inflight_info.fd > 0) {
> +        close(dev->inflight_info.fd);
> +        dev->inflight_info.fd = -1;
> +    }
>   
>       vu_close_log(dev);
>       if (dev->slave_fd != -1) {
> @@ -1687,20 +1881,6 @@ vu_queue_empty(VuDev *dev, VuVirtq *vq)
>       return vring_avail_idx(vq) == vq->last_avail_idx;
>   }
>   
> -static inline
> -bool has_feature(uint64_t features, unsigned int fbit)
> -{
> -    assert(fbit < 64);
> -    return !!(features & (1ULL << fbit));
> -}
> -
> -static inline
> -bool vu_has_feature(VuDev *dev,
> -                    unsigned int fbit)
> -{
> -    return has_feature(dev->features, fbit);
> -}
> -
>   static bool
>   vring_notify(VuDev *dev, VuVirtq *vq)
>   {
> @@ -1829,12 +2009,6 @@ virtqueue_map_desc(VuDev *dev,
>       *p_num_sg = num_sg;
>   }
>   
> -/* Round number down to multiple */
> -#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
> -
> -/* Round number up to multiple */
> -#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
> -
>   static void *
>   virtqueue_alloc_element(size_t sz,
>                                        unsigned out_num, unsigned in_num)
> @@ -1935,9 +2109,44 @@ vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
>       return elem;
>   }
>   
> +static int
> +vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
> +{
> +    if (!has_feature(dev->protocol_features,
> +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> +        return 0;
> +    }
> +
> +    if (unlikely(!vq->inflight)) {
> +        return -1;
> +    }
> +


Just wondering what happens if the backend gets killed at this point?

You want to survive a backend crash, but you still depend on the
backend to get and put the inflight descriptors, which seems somewhat
contradictory.

Thanks


> +    vq->inflight->desc[desc_idx] = 1;
> +
> +    return 0;
> +}
> +
> +static int
> +vu_queue_inflight_put(VuDev *dev, VuVirtq *vq, int desc_idx)
> +{
> +    if (!has_feature(dev->protocol_features,
> +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> +        return 0;
> +    }
> +
> +    if (unlikely(!vq->inflight)) {
> +        return -1;
> +    }
> +
> +    vq->inflight->desc[desc_idx] = 0;
> +
> +    return 0;
> +}
> +
>   void *
>   vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
>   {
> +    int i;
>       unsigned int head;
>       VuVirtqElement *elem;
>   
> @@ -1946,6 +2155,12 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
>           return NULL;
>       }
>   
> +    if (unlikely(vq->inflight_num > 0)) {
> +        i = (--vq->inflight_num);
> +        elem = vu_queue_map_desc(dev, vq, vq->inflight_desc[i], sz);
> +        return elem;
> +    }
> +
>       if (vu_queue_empty(dev, vq)) {
>           return NULL;
>       }
> @@ -1976,6 +2191,8 @@ vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
>   
>       vq->inuse++;
>   
> +    vu_queue_inflight_get(dev, vq, head);
> +
>       return elem;
>   }
>   
> @@ -2121,4 +2338,5 @@ vu_queue_push(VuDev *dev, VuVirtq *vq,
>   {
>       vu_queue_fill(dev, vq, elem, len, 0);
>       vu_queue_flush(dev, vq, 1);
> +    vu_queue_inflight_put(dev, vq, elem->index);
>   }
> diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
> index 4aa55b4d2d..5afb80ea5c 100644
> --- a/contrib/libvhost-user/libvhost-user.h
> +++ b/contrib/libvhost-user/libvhost-user.h
> @@ -53,6 +53,7 @@ enum VhostUserProtocolFeature {
>       VHOST_USER_PROTOCOL_F_CONFIG = 9,
>       VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
>       VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
> +    VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
>   
>       VHOST_USER_PROTOCOL_F_MAX
>   };
> @@ -91,6 +92,8 @@ typedef enum VhostUserRequest {
>       VHOST_USER_POSTCOPY_ADVISE  = 28,
>       VHOST_USER_POSTCOPY_LISTEN  = 29,
>       VHOST_USER_POSTCOPY_END     = 30,
> +    VHOST_USER_GET_INFLIGHT_FD = 31,
> +    VHOST_USER_SET_INFLIGHT_FD = 32,
>       VHOST_USER_MAX
>   } VhostUserRequest;
>   
> @@ -138,6 +141,14 @@ typedef struct VhostUserVringArea {
>       uint64_t offset;
>   } VhostUserVringArea;
>   
> +typedef struct VhostUserInflight {
> +    uint64_t mmap_size;
> +    uint64_t mmap_offset;
> +    uint32_t align;
> +    uint16_t num_queues;
> +    uint16_t version;
> +} VhostUserInflight;
> +
>   #if defined(_WIN32)
>   # define VU_PACKED __attribute__((gcc_struct, packed))
>   #else
> @@ -163,6 +174,7 @@ typedef struct VhostUserMsg {
>           VhostUserLog log;
>           VhostUserConfig config;
>           VhostUserVringArea area;
> +        VhostUserInflight inflight;
>       } payload;
>   
>       int fds[VHOST_MEMORY_MAX_NREGIONS];
> @@ -234,9 +246,19 @@ typedef struct VuRing {
>       uint32_t flags;
>   } VuRing;
>   
> +typedef struct VuVirtqInflight {
> +    char desc[VIRTQUEUE_MAX_SIZE];
> +} VuVirtqInflight;
> +
>   typedef struct VuVirtq {
>       VuRing vring;
>   
> +    VuVirtqInflight *inflight;
> +
> +    uint16_t inflight_desc[VIRTQUEUE_MAX_SIZE];
> +
> +    uint16_t inflight_num;
> +
>       /* Next head to pop */
>       uint16_t last_avail_idx;
>   
> @@ -279,11 +301,18 @@ typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition,
>                                    vu_watch_cb cb, void *data);
>   typedef void (*vu_remove_watch_cb) (VuDev *dev, int fd);
>   
> +typedef struct VuDevInflightInfo {
> +    int fd;
> +    void *addr;
> +    uint64_t size;
> +} VuDevInflightInfo;
> +
>   struct VuDev {
>       int sock;
>       uint32_t nregions;
>       VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
>       VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE];
> +    VuDevInflightInfo inflight_info;
>       int log_call_fd;
>       int slave_fd;
>       uint64_t log_size;
Yongji Xie Jan. 11, 2019, 6:10 a.m. UTC | #2
On Fri, 11 Jan 2019 at 11:56, Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2019/1/9 7:27 PM, elohimes@gmail.com wrote:
> > +static int
> > +vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
> > +{
> > +    if (!has_feature(dev->protocol_features,
> > +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
> > +        return 0;
> > +    }
> > +
> > +    if (unlikely(!vq->inflight)) {
> > +        return -1;
> > +    }
> > +
>
>
> Just wonder what happens if backend get killed at this point?
>

We will re-calculate last_avail_idx like: last_avail_idx = inuse + used_idx

At this point, the backend could consume this entry correctly after
reconnecting.
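
For illustration, once vu_check_queue_inflights() has repopulated
vq->inflight_desc[] from the shared bitmap, vu_queue_pop() hands those
recovered descriptors back before reading any new available entries, so
the backend's normal request loop picks them up again. A minimal sketch
(process_request() is a hypothetical handler, not part of this patch):

#include "libvhost-user.h"

static void handle_kick(VuDev *dev, int qidx)
{
    VuVirtq *vq = vu_get_queue(dev, qidx);
    VuVirtqElement *elem;

    for (;;) {
        /* Descriptors recovered from the inflight bitmap come out
         * first; only then does pop read new avail ring entries. */
        elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement));
        if (!elem) {
            break;
        }
        /* Hypothetical handler: completes the request and calls
         * vu_queue_push() plus free(elem) when the I/O finishes. */
        process_request(dev, vq, elem);
    }
}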

> You want to survive from the backend crash but you still depend on
> backend to get and put inflight descriptors which seems somehow conflict.
>

But if the backend gets killed in vu_queue_inflight_put(), I think you
are right, there is a conflict. One descriptor is consumed by the guest
but still marked as in use in the inflight buffer. Then we would re-send
this stale descriptor after the restart.

Maybe we could add something like this to fix the issue:

void vu_queue_push()
{
    vq->inflight->elem_idx = elem->idx;
    vu_queue_fill();
    vu_queue_flush();
    vq->inflight->desc[elem->idx] = 0;
    vq->inflight->used_idx = vq->vring.used->idx;
}

static int vu_check_queue_inflights()
{
    ....
    if (vq->inflight->used_idx != vq->vring.used->idx) {
        /* Crash in vu_queue_push() */
        vq->inflight->desc[vq->inflight->elem_idx] = 0;
    }
    ....
}
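
For readability, the same idea written out in C (a sketch only; the
elem_idx and used_idx fields in the inflight region are the hypothetical
extensions proposed above, not fields of the posted VuVirtqInflight):

#include "libvhost-user.h"

void vu_queue_push(VuDev *dev, VuVirtq *vq,
                   const VuVirtqElement *elem, unsigned int len)
{
    /* 1. Record which element we are about to complete. */
    vq->inflight->elem_idx = elem->index;

    /* 2. Publish the completion to the guest. */
    vu_queue_fill(dev, vq, elem, len, 0);
    vu_queue_flush(dev, vq, 1);

    /* 3. Clear the inflight bit, then checkpoint the used index.
     * If we crash between 2 and 3, the saved used_idx lags
     * vq->vring.used->idx, so vu_check_queue_inflights() can see that
     * elem_idx was already completed and clear its bit on recovery. */
    vq->inflight->desc[elem->index] = 0;
    vq->inflight->used_idx = vq->vring.used->idx;
}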

Thanks,
Yongji
Jason Wang Jan. 15, 2019, 7:52 a.m. UTC | #3
On 2019/1/11 2:10 PM, Yongji Xie wrote:
> On Fri, 11 Jan 2019 at 11:56, Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2019/1/9 7:27 PM, elohimes@gmail.com wrote:
>>> +static int
>>> +vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
>>> +{
>>> +    if (!has_feature(dev->protocol_features,
>>> +        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    if (unlikely(!vq->inflight)) {
>>> +        return -1;
>>> +    }
>>> +
>>
>> Just wonder what happens if backend get killed at this point?
>>
> We will re-caculate last_avail_idx like: last_avail_idx = inuse + used_idx


I'm not sure I get you here, but it looks to me that at least one pending
descriptor is missed, since you don't set vq->inflight->desc[desc_idx] to 1?


>
> At this point, backend could consume this entry correctly after reconnect.
>
>> You want to survive from the backend crash but you still depend on
>> backend to get and put inflight descriptors which seems somehow conflict.
>>
> But if backend get killed in vu_queue_inflight_put(), I think you are
> right, there is something conflict. One descriptor is consumed by
> guest but still marked as inused in inflight buffer. Then we will
> re-send this old descriptor after restart.
>
> Maybe we can add something like that to fix this issue:
>
> void vu_queue_push()
> {
>      vq->inflight->elem_idx = elem->idx;
>      vu_queue_fill();
>      vu_queue_flush();
>      vq->inflight->desc[elem->idx] = 0;


Is it safe for the backend to be killed here?


>      vq->inflight->used_idx = vq->vring.used->idx;
> }
>
> static int vu_check_queue_inflights()
> {
>      ....
>      if (vq->inflight->used_idx != vq->vring.used->idx) {
>          /* Crash in vu_queue_push() */
>          vq->inflight->desc[vq->inflight->elem_idx] = 0;
>      }
>      ....
> }
>
> Thanks,
> Yongji


Well, this may work, but here are my points:

1) The code wants to recover from a backend crash by introducing extra
space to store inflight data, but it still depends on the backend to
set/get the inflight state.

2) Since the backend could be killed at any time, the backend must have
the ability to recover from a partial inflight state.

So it looks to me that 1) tends to be self-contradictory and 2) tends to
be recursive. The lines above show how tricky the code could look.

Solving this at the vhost-user level in the backend is probably wrong.
It's time to consider support from virtio itself.

Thanks
Yongji Xie Jan. 15, 2019, 2:51 p.m. UTC | #4
On Tue, 15 Jan 2019 at 15:52, Jason Wang <jasowang@redhat.com> wrote:
>
>
> Well, this may work but here're my points:
>
> 1) The code want to recover from backed crash by introducing extra space
> to store inflight data, but it still depends on the backend to set/get
> the inflight state
>
> 2) Since the backend could be killed at any time, the backend must have
> the ability to recover from the partial inflight state
>
> So it looks to me 1) tends to be self-contradictory and 2) tends to be
> recursive. The above lines show how tricky could the code looks like.
>
> Solving this at vhost-user level through at backend is probably wrong.
> It's time to consider the support from virtio itself.
>

I agree that supporting this at the virtio level may be better; for
example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set, as in
Stefan's proposal. But I still think QEMU should be able to provide this
ability too. Suppose that one vhost-user backend needs to support
multiple VMs. We can't enable the reconnect ability until all the VMs'
guest drivers support the new feature, which is limiting. But if QEMU has
the ability to store the inflight buffer, the backend at least has a
chance to support this case. Maybe the backend could find another way to
avoid the tricky code.

Thanks,
Yongji
Michael S. Tsirkin Jan. 15, 2019, 3:58 p.m. UTC | #5
On Tue, Jan 15, 2019 at 03:52:21PM +0800, Jason Wang wrote:
> Well, this may work but here're my points:
> 
> 1) The code want to recover from backed crash by introducing extra space to
> store inflight data, but it still depends on the backend to set/get the
> inflight state
> 
> 2) Since the backend could be killed at any time, the backend must have the
> ability to recover from the partial inflight state
> 
> So it looks to me 1) tends to be self-contradictory and 2) tends to be
> recursive. The above lines show how tricky could the code looks like.

This is a well-studied field. Basically, you make sure you commit with an
atomic write.  Restartable sequences allow accelerating this even
further.
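
To make "commit with an atomic write" concrete, here is a minimal sketch
(the inflight_log type and FLAG_* names are made up for illustration; this
is not the code in this patch):

#include <stdatomic.h>
#include <stdint.h>

enum { FLAG_FREE = 0, FLAG_INFLIGHT = 1 };

struct inflight_log {
    _Atomic uint8_t desc[1024];   /* one byte per descriptor slot */
};

/* The single release store below is the commit point: after a crash,
 * slot 'head' is either still marked inflight (so it gets resubmitted)
 * or already cleared (the request fully completed), never half-committed. */
static void log_complete(struct inflight_log *log, uint16_t head)
{
    atomic_store_explicit(&log->desc[head], FLAG_FREE,
                          memory_order_release);
}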

> Solving this at vhost-user level through at backend is probably wrong. It's
> time to consider the support from virtio itself.
> 
> Thanks

I think both approaches have their space.
Jason Wang Jan. 17, 2019, 9:57 a.m. UTC | #6
On 2019/1/15 下午10:51, Yongji Xie wrote:
>> Well, this may work but here're my points:
>>
>> 1) The code want to recover from backed crash by introducing extra space
>> to store inflight data, but it still depends on the backend to set/get
>> the inflight state
>>
>> 2) Since the backend could be killed at any time, the backend must have
>> the ability to recover from the partial inflight state
>>
>> So it looks to me 1) tends to be self-contradictory and 2) tends to be
>> recursive. The above lines show how tricky could the code looks like.
>>
>> Solving this at vhost-user level through at backend is probably wrong.
>> It's time to consider the support from virtio itself.
>>
> I agree that supporting this in virtio level may be better. For
> example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in
> Stefan's proposal. But I still think QEMU should be able to provide
> this ability too. Supposed that one vhost-user backend need to support
> multiple VMs. We can't enable reconnect ability until all VMs' guest
> driver support the new feature. It's limited.


That's the way virtio evolves.


>   But if QEMU have the
> ability to store inflight buffer, the backend could at least have a
> chance to support this case.


The problem is, you need a carefully designed protocol described somewhere 
(is vhost-user.txt a good place for this?). And this work will be 
(partially) duplicated by the future support from the virtio spec itself.


> Maybe backend could have other way to
> avoid the tricky code.


I'm not sure, but it is probably not easy.

Thanks


>
> Thanks,
Jason Wang Jan. 17, 2019, 10:01 a.m. UTC | #7
On 2019/1/15 下午11:58, Michael S. Tsirkin wrote:
> On Tue, Jan 15, 2019 at 03:52:21PM +0800, Jason Wang wrote:
>> Well, this may work but here're my points:
>>
>> 1) The code want to recover from backed crash by introducing extra space to
>> store inflight data, but it still depends on the backend to set/get the
>> inflight state
>>
>> 2) Since the backend could be killed at any time, the backend must have the
>> ability to recover from the partial inflight state
>>
>> So it looks to me 1) tends to be self-contradictory and 2) tends to be
>> recursive. The above lines show how tricky could the code looks like.
> This is a well studied field. Basically you make sure you commit with an
> atomic write.  Restartable sequences allow accelerating this even
> further.


I'm not sure I get this. But the issue is to exactly deduce all the 
inflight descriptors even if the backend could be killed while doing the 
logging. If we can't be 100% accurate, it has much less value.


>
>> Solving this at vhost-user level through at backend is probably wrong. It's
>> time to consider the support from virtio itself.
>>
>> Thanks
> I think both approaches have their space.


But there will be a lot of duplicated work if we decide to support it 
from virtio.

Thanks


>
> -- MST
Michael S. Tsirkin Jan. 17, 2019, 2:59 p.m. UTC | #8
On Thu, Jan 17, 2019 at 05:57:29PM +0800, Jason Wang wrote:
> 
> On 2019/1/15 下午10:51, Yongji Xie wrote:
> > > Well, this may work but here're my points:
> > > 
> > > 1) The code want to recover from backed crash by introducing extra space
> > > to store inflight data, but it still depends on the backend to set/get
> > > the inflight state
> > > 
> > > 2) Since the backend could be killed at any time, the backend must have
> > > the ability to recover from the partial inflight state
> > > 
> > > So it looks to me 1) tends to be self-contradictory and 2) tends to be
> > > recursive. The above lines show how tricky could the code looks like.
> > > 
> > > Solving this at vhost-user level through at backend is probably wrong.
> > > It's time to consider the support from virtio itself.
> > > 
> > I agree that supporting this in virtio level may be better. For
> > example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in
> > Stefan's proposal. But I still think QEMU should be able to provide
> > this ability too. Supposed that one vhost-user backend need to support
> > multiple VMs. We can't enable reconnect ability until all VMs' guest
> > driver support the new feature. It's limited.
> 
> 
> That's the way virtio evolves.
> 
> 
> >   But if QEMU have the
> > ability to store inflight buffer, the backend could at least have a
> > chance to support this case.
> 
> 
> The problem is, you need a careful designed protocol described somewhere (is
> vhost-user.txt a good place for this?). And this work will be (partial)
> duplicated for the future support from virtio spec itself.
> 
> 
> > Maybe backend could have other way to
> > avoid the tricky code.
> 
> 
> I'm not sure, but it was probably not easy.

I see an implementation in libvhost-user.

Do you see an issue there?


> Thanks
> 
> 
> > 
> > Thanks,
Michael S. Tsirkin Jan. 17, 2019, 3:04 p.m. UTC | #9
On Thu, Jan 17, 2019 at 06:01:16PM +0800, Jason Wang wrote:
> 
> On 2019/1/15 下午11:58, Michael S. Tsirkin wrote:
> > On Tue, Jan 15, 2019 at 03:52:21PM +0800, Jason Wang wrote:
> > > Well, this may work but here're my points:
> > > 
> > > 1) The code want to recover from backed crash by introducing extra space to
> > > store inflight data, but it still depends on the backend to set/get the
> > > inflight state
> > > 
> > > 2) Since the backend could be killed at any time, the backend must have the
> > > ability to recover from the partial inflight state
> > > 
> > > So it looks to me 1) tends to be self-contradictory and 2) tends to be
> > > recursive. The above lines show how tricky could the code looks like.
> > This is a well studied field. Basically you make sure you commit with an
> > atomic write.  Restartable sequences allow accelerating this even
> > further.
> 
> 
> I'm not sure I get this. But the issue is to exactly deduce all the inflight
> descriptors even if backend could be killed when doing the logging. If we
> could not be 100% accurate, it's have much less value.


I agree. But why discuss theoretical issues?
Can you point out a problem in the contrib/ code
included here?

If yes, it must be fixed, I think.

I personally think it's not too hard.
Consider the packed ring, for example: just maintain a list of the inflight
descriptors and, as the last step, write out the flags atomically.
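
A rough sketch of that idea (descriptor layout as in the virtio 1.1 packed
ring draft, only the AVAIL/USED flag bits shown; illustration only, not the
code in this patch):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define VRING_PACKED_DESC_F_AVAIL  (1 << 7)
#define VRING_PACKED_DESC_F_USED   (1 << 15)

struct vring_packed_desc {
    uint64_t addr;
    uint32_t len;
    uint16_t id;
    uint16_t flags;
};

/* Mark one element used: id/len are written first, then the final 16-bit
 * flags store acts as the commit point (assuming an aligned 16-bit store
 * is atomic on the hosts we support), so a crash leaves the element either
 * fully used or still in flight, never half-written. */
static void packed_push_used(struct vring_packed_desc *desc,
                             uint16_t id, uint32_t len, bool used_wrap)
{
    uint16_t flags = used_wrap ?
        (VRING_PACKED_DESC_F_AVAIL | VRING_PACKED_DESC_F_USED) : 0;

    desc->id = id;
    desc->len = len;
    atomic_thread_fence(memory_order_release);  /* publish id/len first */
    desc->flags = flags;
}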




> 
> > 
> > > Solving this at vhost-user level through at backend is probably wrong. It's
> > > time to consider the support from virtio itself.
> > > 
> > > Thanks
> > I think both approaches have their space.
> 
> 
> But there will be a lot of duplicated work if we decide to support it from
> virtio.
> 
> Thanks
> 
> 
> > 
> > -- MST
Yongji Xie Jan. 18, 2019, 3:32 a.m. UTC | #10
On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2019/1/15 下午10:51, Yongji Xie wrote:
> >> Well, this may work but here're my points:
> >>
> >> 1) The code want to recover from backed crash by introducing extra space
> >> to store inflight data, but it still depends on the backend to set/get
> >> the inflight state
> >>
> >> 2) Since the backend could be killed at any time, the backend must have
> >> the ability to recover from the partial inflight state
> >>
> >> So it looks to me 1) tends to be self-contradictory and 2) tends to be
> >> recursive. The above lines show how tricky could the code looks like.
> >>
> >> Solving this at vhost-user level through at backend is probably wrong.
> >> It's time to consider the support from virtio itself.
> >>
> > I agree that supporting this in virtio level may be better. For
> > example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in
> > Stefan's proposal. But I still think QEMU should be able to provide
> > this ability too. Supposed that one vhost-user backend need to support
> > multiple VMs. We can't enable reconnect ability until all VMs' guest
> > driver support the new feature. It's limited.
>
>
> That's the way virtio evolves.
>
>
> >   But if QEMU have the
> > ability to store inflight buffer, the backend could at least have a
> > chance to support this case.
>
>
> The problem is, you need a careful designed protocol described somewhere

That's what we should discuss in detail in this series.

> (is vhost-user.txt a good place for this?). And this work will be
> (partial) duplicated for the future support from virtio spec itself.
>

I think the duplicated code is maintaining the inflight descriptor
list, which should live in the backend. That's not the main work in this
series, and the backend could choose to include it or not.

Thanks,
Yongji
Michael S. Tsirkin Jan. 18, 2019, 3:56 a.m. UTC | #11
On Fri, Jan 18, 2019 at 11:32:03AM +0800, Yongji Xie wrote:
> On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
> >
> >
> > On 2019/1/15 下午10:51, Yongji Xie wrote:
> > >> Well, this may work but here're my points:
> > >>
> > >> 1) The code want to recover from backed crash by introducing extra space
> > >> to store inflight data, but it still depends on the backend to set/get
> > >> the inflight state
> > >>
> > >> 2) Since the backend could be killed at any time, the backend must have
> > >> the ability to recover from the partial inflight state
> > >>
> > >> So it looks to me 1) tends to be self-contradictory and 2) tends to be
> > >> recursive. The above lines show how tricky could the code looks like.
> > >>
> > >> Solving this at vhost-user level through at backend is probably wrong.
> > >> It's time to consider the support from virtio itself.
> > >>
> > > I agree that supporting this in virtio level may be better. For
> > > example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in
> > > Stefan's proposal. But I still think QEMU should be able to provide
> > > this ability too. Supposed that one vhost-user backend need to support
> > > multiple VMs. We can't enable reconnect ability until all VMs' guest
> > > driver support the new feature. It's limited.
> >
> >
> > That's the way virtio evolves.
> >
> >
> > >   But if QEMU have the
> > > ability to store inflight buffer, the backend could at least have a
> > > chance to support this case.
> >
> >
> > The problem is, you need a careful designed protocol described somewhere
> 
> That's what we should discuss in detail in this series.
> 
> > (is vhost-user.txt a good place for this?). And this work will be
> > (partial) duplicated for the future support from virtio spec itself.
> >
> 
> I think the duplicated code is to maintain the inflight descriptor
> list which should be in backend. That's not main work in this series.
> And backend could choose to include it or not.
> 
> Thanks,
> Yongji

It would be, if someone volunteered to rewrite the informal vhost-user
description that we have in QEMU and make it a full spec.
So far, a text description plus the implementation in contrib/ seems plenty to me.
Jason Wang Jan. 18, 2019, 3:57 a.m. UTC | #12
On 2019/1/17 下午10:59, Michael S. Tsirkin wrote:
> On Thu, Jan 17, 2019 at 05:57:29PM +0800, Jason Wang wrote:
>> On 2019/1/15 下午10:51, Yongji Xie wrote:
>>>> Well, this may work but here're my points:
>>>>
>>>> 1) The code want to recover from backed crash by introducing extra space
>>>> to store inflight data, but it still depends on the backend to set/get
>>>> the inflight state
>>>>
>>>> 2) Since the backend could be killed at any time, the backend must have
>>>> the ability to recover from the partial inflight state
>>>>
>>>> So it looks to me 1) tends to be self-contradictory and 2) tends to be
>>>> recursive. The above lines show how tricky could the code looks like.
>>>>
>>>> Solving this at vhost-user level through at backend is probably wrong.
>>>> It's time to consider the support from virtio itself.
>>>>
>>> I agree that supporting this in virtio level may be better. For
>>> example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in
>>> Stefan's proposal. But I still think QEMU should be able to provide
>>> this ability too. Supposed that one vhost-user backend need to support
>>> multiple VMs. We can't enable reconnect ability until all VMs' guest
>>> driver support the new feature. It's limited.
>>
>> That's the way virtio evolves.
>>
>>
>>>    But if QEMU have the
>>> ability to store inflight buffer, the backend could at least have a
>>> chance to support this case.
>>
>> The problem is, you need a careful designed protocol described somewhere (is
>> vhost-user.txt a good place for this?). And this work will be (partial)
>> duplicated for the future support from virtio spec itself.
>>
>>
>>> Maybe backend could have other way to
>>> avoid the tricky code.
>>
>> I'm not sure, but it was probably not easy.
> I see an implementation in libvhost-user.
>
> Do you see an issue there?


I've asked some questions in this thread; it looks like we can still 
miss some inflight descriptors.

Thanks


>
>
>> Thanks
>>
>>
>>> Thanks,
Jason Wang Jan. 18, 2019, 3:59 a.m. UTC | #13
On 2019/1/18 上午11:32, Yongji Xie wrote:
> On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2019/1/15 下午10:51, Yongji Xie wrote:
>>>> Well, this may work but here're my points:
>>>>
>>>> 1) The code want to recover from backed crash by introducing extra space
>>>> to store inflight data, but it still depends on the backend to set/get
>>>> the inflight state
>>>>
>>>> 2) Since the backend could be killed at any time, the backend must have
>>>> the ability to recover from the partial inflight state
>>>>
>>>> So it looks to me 1) tends to be self-contradictory and 2) tends to be
>>>> recursive. The above lines show how tricky could the code looks like.
>>>>
>>>> Solving this at vhost-user level through at backend is probably wrong.
>>>> It's time to consider the support from virtio itself.
>>>>
>>> I agree that supporting this in virtio level may be better. For
>>> example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in
>>> Stefan's proposal. But I still think QEMU should be able to provide
>>> this ability too. Supposed that one vhost-user backend need to support
>>> multiple VMs. We can't enable reconnect ability until all VMs' guest
>>> driver support the new feature. It's limited.
>>
>> That's the way virtio evolves.
>>
>>
>>>    But if QEMU have the
>>> ability to store inflight buffer, the backend could at least have a
>>> chance to support this case.
>>
>> The problem is, you need a careful designed protocol described somewhere
> That's what we should discuss in detail in this series.


Well, I asked some questions about this patch, but it looks like they 
still haven't been answered. No?


>
>> (is vhost-user.txt a good place for this?). And this work will be
>> (partial) duplicated for the future support from virtio spec itself.
>>
> I think the duplicated code is to maintain the inflight descriptor
> list which should be in backend. That's not main work in this series.
> And backend could choose to include it or not.


You need documentation describing the protocol. Otherwise, it 
would be very hard for other backends to implement it.

Thanks


>
> Thanks,
> Yongji
>
Michael S. Tsirkin Jan. 18, 2019, 4:03 a.m. UTC | #14
On Fri, Jan 18, 2019 at 11:59:50AM +0800, Jason Wang wrote:
> 
> On 2019/1/18 上午11:32, Yongji Xie wrote:
> > On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
> > > 
> > > On 2019/1/15 下午10:51, Yongji Xie wrote:
> > > > > Well, this may work but here're my points:
> > > > > 
> > > > > 1) The code want to recover from backed crash by introducing extra space
> > > > > to store inflight data, but it still depends on the backend to set/get
> > > > > the inflight state
> > > > > 
> > > > > 2) Since the backend could be killed at any time, the backend must have
> > > > > the ability to recover from the partial inflight state
> > > > > 
> > > > > So it looks to me 1) tends to be self-contradictory and 2) tends to be
> > > > > recursive. The above lines show how tricky could the code looks like.
> > > > > 
> > > > > Solving this at vhost-user level through at backend is probably wrong.
> > > > > It's time to consider the support from virtio itself.
> > > > > 
> > > > I agree that supporting this in virtio level may be better. For
> > > > example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in
> > > > Stefan's proposal. But I still think QEMU should be able to provide
> > > > this ability too. Supposed that one vhost-user backend need to support
> > > > multiple VMs. We can't enable reconnect ability until all VMs' guest
> > > > driver support the new feature. It's limited.
> > > 
> > > That's the way virtio evolves.
> > > 
> > > 
> > > >    But if QEMU have the
> > > > ability to store inflight buffer, the backend could at least have a
> > > > chance to support this case.
> > > 
> > > The problem is, you need a careful designed protocol described somewhere
> > That's what we should discuss in detail in this series.
> 
> 
> Well, I ask some questions for this patch, but it looks like they were still
> not answered. No?
> 

Oh, absolutely. I can't say I like the implementation;
I think it's neither the most robust nor the most optimal approach.

> > 
> > > (is vhost-user.txt a good place for this?). And this work will be
> > > (partial) duplicated for the future support from virtio spec itself.
> > > 
> > I think the duplicated code is to maintain the inflight descriptor
> > list which should be in backend. That's not main work in this series.
> > And backend could choose to include it or not.
> 
> 
> You need to have a documentation to describe the protocol. Otherwise, it
> would be very hard for other backend to implement.
> 
> Thanks


Meaning how the inflight descriptors are saved in the buffer?
Yes, I agree.

> 
> > 
> > Thanks,
> > Yongji
> >
Yongji Xie Jan. 18, 2019, 7:01 a.m. UTC | #15
On Fri, 18 Jan 2019 at 12:00, Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2019/1/18 上午11:32, Yongji Xie wrote:
> > On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2019/1/15 下午10:51, Yongji Xie wrote:
> >>>> Well, this may work but here're my points:
> >>>>
> >>>> 1) The code want to recover from backed crash by introducing extra space
> >>>> to store inflight data, but it still depends on the backend to set/get
> >>>> the inflight state
> >>>>
> >>>> 2) Since the backend could be killed at any time, the backend must have
> >>>> the ability to recover from the partial inflight state
> >>>>
> >>>> So it looks to me 1) tends to be self-contradictory and 2) tends to be
> >>>> recursive. The above lines show how tricky could the code looks like.
> >>>>
> >>>> Solving this at vhost-user level through at backend is probably wrong.
> >>>> It's time to consider the support from virtio itself.
> >>>>
> >>> I agree that supporting this in virtio level may be better. For
> >>> example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in
> >>> Stefan's proposal. But I still think QEMU should be able to provide
> >>> this ability too. Supposed that one vhost-user backend need to support
> >>> multiple VMs. We can't enable reconnect ability until all VMs' guest
> >>> driver support the new feature. It's limited.
> >>
> >> That's the way virtio evolves.
> >>
> >>
> >>>    But if QEMU have the
> >>> ability to store inflight buffer, the backend could at least have a
> >>> chance to support this case.
> >>
> >> The problem is, you need a careful designed protocol described somewhere
> > That's what we should discuss in detail in this series.
>
>
> Well, I ask some questions for this patch, but it looks like they were
> still not answered. No?
>
>

Oh, sorry, I missed those questions. Let me try to answer them here.

Q1: If the backend gets killed in vu_queue_inflight_get() without setting
vq->inflight->desc[desc_idx] to 1, is there any problem?

The entry which stores the head of this inflight descriptor is not
lost from the avail ring. So we can still get this inflight descriptor from
the avail ring even though we didn't set vq->inflight->desc[desc_idx] to 1.
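
In other words, a sketch of the recovery step (recompute_last_avail() is a
made-up name here; the real logic is in vu_check_queue_inflights() in this
patch):

/* Only descriptors flagged in inflight->desc[] count as in use, so a head
 * whose flag was never set keeps last_avail_idx low enough that the next
 * vu_queue_pop() reads it from the avail ring again. */
static void recompute_last_avail(VuVirtq *vq)
{
    uint16_t inuse = 0;
    unsigned int i;

    for (i = 0; i < vq->vring.num; i++) {
        if (vq->inflight->desc[i]) {
            inuse++;    /* flagged entries are resubmitted directly */
        }
    }
    /* Heads that were popped but never flagged fall in
     * [used_idx + inuse, avail_idx) and simply get popped a second time. */
    vq->last_avail_idx = vq->vring.used->idx + inuse;
    vq->shadow_avail_idx = vq->last_avail_idx;
}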

Q2:
void vu_queue_push()
{
    vq->inflight->elem_idx = elem->idx;
    vu_queue_fill();
    vu_queue_flush();
    vq->inflight->desc[elem->idx] = 0;
                                                    <-------- Is it safe to be killed here?
    vq->inflight->used_idx = vq->vring.used->idx;
}

Because there is no concurrency between vu_queue_push() and
vu_queue_pop(), I don't see any problem here.

Basically we just need to make sure these two operations
(vq->vring.used->idx++ and vq->inflight->desc[elem->idx] = 0) are
atomic. I think there are some approaches to achieve that. I'm not sure
my approach here is good enough, but it should work.

> >
> >> (is vhost-user.txt a good place for this?). And this work will be
> >> (partial) duplicated for the future support from virtio spec itself.
> >>
> > I think the duplicated code is to maintain the inflight descriptor
> > list which should be in backend. That's not main work in this series.
> > And backend could choose to include it or not.
>
>
> You need to have a documentation to describe the protocol. Otherwise, it
> would be very hard for other backend to implement.
>

Yes, actually now I'm working on adding more detail to vhost-user.txt.

Thanks,
Yongji
Jason Wang Jan. 18, 2019, 9:26 a.m. UTC | #16
On 2019/1/18 下午3:01, Yongji Xie wrote:
> On Fri, 18 Jan 2019 at 12:00, Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2019/1/18 上午11:32, Yongji Xie wrote:
>>> On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
>>>> On 2019/1/15 下午10:51, Yongji Xie wrote:
>>>>>> Well, this may work but here're my points:
>>>>>>
>>>>>> 1) The code want to recover from backed crash by introducing extra space
>>>>>> to store inflight data, but it still depends on the backend to set/get
>>>>>> the inflight state
>>>>>>
>>>>>> 2) Since the backend could be killed at any time, the backend must have
>>>>>> the ability to recover from the partial inflight state
>>>>>>
>>>>>> So it looks to me 1) tends to be self-contradictory and 2) tends to be
>>>>>> recursive. The above lines show how tricky could the code looks like.
>>>>>>
>>>>>> Solving this at vhost-user level through at backend is probably wrong.
>>>>>> It's time to consider the support from virtio itself.
>>>>>>
>>>>> I agree that supporting this in virtio level may be better. For
>>>>> example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in
>>>>> Stefan's proposal. But I still think QEMU should be able to provide
>>>>> this ability too. Supposed that one vhost-user backend need to support
>>>>> multiple VMs. We can't enable reconnect ability until all VMs' guest
>>>>> driver support the new feature. It's limited.
>>>> That's the way virtio evolves.
>>>>
>>>>
>>>>>     But if QEMU have the
>>>>> ability to store inflight buffer, the backend could at least have a
>>>>> chance to support this case.
>>>> The problem is, you need a careful designed protocol described somewhere
>>> That's what we should discuss in detail in this series.
>>
>> Well, I ask some questions for this patch, but it looks like they were
>> still not answered. No?
>>
>>
> Oh, sorry, I missed those questions. Let me try to answer them here.
>
> Q1: If backend get killed in vu_queue_inflight_get() without setting
> vq->inflight->desc[desc_idx] to 1, is there any problem?
>
> The entry which stores the head of this inflight descriptor is not
> lost in avail ring. So we can still get this inflight descriptor from
> avail ring although we didn't set vq->inflight->desc[desc_idx] to 1.


Ok I get this.


> Q2:
> void vu_queue_push()
> {
>      vq->inflight->elem_idx = elem->idx;
>      vu_queue_fill();
>      vu_queue_flush();
>      vq->inflight->desc[elem->idx] = 0;
>                                                      <-------- Does
> this safe to be killed here?
>      vq->inflight->used_idx = vq->vring.used->idx;
> }
>
> Because there are no concurrency between vu_queue_push() and
> vu_queue_pop(), I don't see any problem here.
>
> Basically we just need to make sure this two operations
> (vq->vring.used->idx++ and vq->inflight->desc[elem->idx] = 0) are
> atomic. I think there are some approach to achieve that. I'm not sure
> my approach here is good enough, but it should work.


Rethinking about this, some findings:

- What you suggest requires strict ordering in some parts of 
vu_queue_push(). E.g. it looks to me you need a compiler barrier to make 
sure used_idx is set before clearing desc[elem->idx]? This is a side 
effect of introducing new metadata (inflight->elem_idx and 
inflight->used_idx) to recover from crashing while logging the 
inflight->desc[] array.

- Modern backends like DPDK do batching aggressively, which means you 
probably need an array of elem_idx[]. This tends to be more complex 
since we probably need to negotiate the size of this array, and the 
overhead is probably noticeable.

I haven't audited all the other places in the code, but I suspect it would be 
hard to be 100% correct. And what happens for the packed virtqueue? 
Basically, we don't want to produce tricky code that is hard to debug 
in the future. Thinking in another direction: in-order completion will be 
supported by virtio 1.1. With that, the inflight descriptors could be 
deduced much more easily, as [used_idx, avail_idx). So it looks to me that 
supporting this from the virtio layer is much easier.
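
For comparison, with in-order the recovery could be as simple as the sketch
below (resubmit_one() is a made-up helper, just to illustrate the
[used_idx, avail_idx) point):

static void resubmit_in_order(VuDev *dev, VuVirtq *vq)
{
    uint16_t used  = vq->vring.used->idx;
    uint16_t avail = vq->vring.avail->idx;
    uint16_t idx;

    /* With in-order completion every head in [used_idx, avail_idx) is
     * still in flight, so no per-descriptor log is needed at all. */
    for (idx = used; idx != avail; idx++) {
        uint16_t head = vq->vring.avail->ring[idx % vq->vring.num];
        resubmit_one(dev, vq, head);    /* hypothetical helper */
    }
}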

Thoughts?

Thanks


>
>>>> (is vhost-user.txt a good place for this?). And this work will be
>>>> (partial) duplicated for the future support from virtio spec itself.
>>>>
>>> I think the duplicated code is to maintain the inflight descriptor
>>> list which should be in backend. That's not main work in this series.
>>> And backend could choose to include it or not.
>>
>> You need to have a documentation to describe the protocol. Otherwise, it
>> would be very hard for other backend to implement.
>>
> Yes, actually now I'm working on adding more detail to vhost-user.txt.
>
> Thanks,
> Yongji
Yongji Xie Jan. 19, 2019, 12:19 p.m. UTC | #17
On Fri, 18 Jan 2019 at 17:27, Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2019/1/18 下午3:01, Yongji Xie wrote:
> > On Fri, 18 Jan 2019 at 12:00, Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2019/1/18 上午11:32, Yongji Xie wrote:
> >>> On Thu, 17 Jan 2019 at 17:57, Jason Wang <jasowang@redhat.com> wrote:
> >>>> On 2019/1/15 下午10:51, Yongji Xie wrote:
> >>>>>> Well, this may work but here're my points:
> >>>>>>
> >>>>>> 1) The code want to recover from backed crash by introducing extra space
> >>>>>> to store inflight data, but it still depends on the backend to set/get
> >>>>>> the inflight state
> >>>>>>
> >>>>>> 2) Since the backend could be killed at any time, the backend must have
> >>>>>> the ability to recover from the partial inflight state
> >>>>>>
> >>>>>> So it looks to me 1) tends to be self-contradictory and 2) tends to be
> >>>>>> recursive. The above lines show how tricky could the code looks like.
> >>>>>>
> >>>>>> Solving this at vhost-user level through at backend is probably wrong.
> >>>>>> It's time to consider the support from virtio itself.
> >>>>>>
> >>>>> I agree that supporting this in virtio level may be better. For
> >>>>> example, resubmitting inflight I/O once DEVICE_NEEDS_RESET is set in
> >>>>> Stefan's proposal. But I still think QEMU should be able to provide
> >>>>> this ability too. Supposed that one vhost-user backend need to support
> >>>>> multiple VMs. We can't enable reconnect ability until all VMs' guest
> >>>>> driver support the new feature. It's limited.
> >>>> That's the way virtio evolves.
> >>>>
> >>>>
> >>>>>     But if QEMU have the
> >>>>> ability to store inflight buffer, the backend could at least have a
> >>>>> chance to support this case.
> >>>> The problem is, you need a careful designed protocol described somewhere
> >>> That's what we should discuss in detail in this series.
> >>
> >> Well, I ask some questions for this patch, but it looks like they were
> >> still not answered. No?
> >>
> >>
> > Oh, sorry, I missed those questions. Let me try to answer them here.
> >
> > Q1: If backend get killed in vu_queue_inflight_get() without setting
> > vq->inflight->desc[desc_idx] to 1, is there any problem?
> >
> > The entry which stores the head of this inflight descriptor is not
> > lost in avail ring. So we can still get this inflight descriptor from
> > avail ring although we didn't set vq->inflight->desc[desc_idx] to 1.
>
>
> Ok I get this.
>
>
> > Q2:
> > void vu_queue_push()
> > {
> >      vq->inflight->elem_idx = elem->idx;
> >      vu_queue_fill();
> >      vu_queue_flush();
> >      vq->inflight->desc[elem->idx] = 0;
> >                                                      <-------- Does
> > this safe to be killed here?
> >      vq->inflight->used_idx = vq->vring.used->idx;
> > }
> >
> > Because there are no concurrency between vu_queue_push() and
> > vu_queue_pop(), I don't see any problem here.
> >
> > Basically we just need to make sure this two operations
> > (vq->vring.used->idx++ and vq->inflight->desc[elem->idx] = 0) are
> > atomic. I think there are some approach to achieve that. I'm not sure
> > my approach here is good enough, but it should work.
>
>
> Rethink about this, some findings:
>
> - What you suggest requires strict ordering in some part of
> vu_queue_push(). E.g it looks to me you need a compiler barrier to make
> sure used_idx is set before clearing desc[elem->idx]? This a an side
> effect of introduce new metadata (inflight->elem_idx and
> inflight->used_idx) to recover from crashing when logging
> inflgith->desc[] array.
>

Yes, compiler barriers are needed here.

> - Modern backends like dpdk do batching aggressively, which means you
> probably need an array of elem_idx[]. This tends to be more complex
> since we probably need to negotiate the size of this array and the
> overhead is probably noticeable.
>

Maybe this new approach could make things easier:

void vu_queue_flush(const VuVirtqElement *elem, unsigned int count)
{
    uint16_t old = vring.used->idx;
    uint16_t new = old + count;

    /* record the target used_idx before marking anything */
    inflight->used_idx = new;
    barrier();
    /* phase 1: mark the entries as being committed */
    for (i = 0; i < count; i++) {
        inflight->desc[elem[i].idx] = FLAG_PROCESS;
    }
    barrier();
    vring.used->idx = new;
    barrier();
    /* phase 2: the used index is visible, so the entries are done */
    for (i = 0; i < count; i++) {
        inflight->desc[elem[i].idx] = FLAG_AVAIL;
    }
}

static int vu_check_queue_inflights()
{
    ....
    for (i = 0; i < desc_count; i++) {
        if (inflight->desc[i] == FLAG_PROCESS) {
            if (inflight->used_idx != vring.used->idx) {
                /* rollback: crashed before vring.used->idx was updated */
                inflight->desc[i] = FLAG_INUSE;
            } else {
                /* commit: vring.used->idx was already updated */
                inflight->desc[i] = FLAG_AVAIL;
            }
        }
    }
    ....
}

> I don't audit all other places of the codes, but I suspect it would be
> hard to be 100% correct. And what happens for packed virtqueue?
> Basically, we don't want to produce tricky codes that is hard to debug
> in the future. Think in another direction, in order will be supported by
> virtio 1.1. With that, the inflight descriptors could be much more
> easier to be deduced like [used_idx, avail_idx)? So it looks to me
> supporting this from virtio layer is much more easier.
>

Yes, I agree that we should support this at the virtio layer for the packed
virtqueue or newer virtio devices, and I'll be happy to do that. But
this series also makes sense to me, because we can use it to support
legacy virtio 1.0 or 0.9 devices.

Thanks,
Yongji
diff mbox series

Patch

diff --git a/Makefile b/Makefile
index dd53965f77..b5c9092605 100644
--- a/Makefile
+++ b/Makefile
@@ -473,7 +473,7 @@  Makefile: $(version-obj-y)
 # Build libraries
 
 libqemuutil.a: $(util-obj-y) $(trace-obj-y) $(stub-obj-y)
-libvhost-user.a: $(libvhost-user-obj-y)
+libvhost-user.a: $(libvhost-user-obj-y) $(util-obj-y) $(stub-obj-y)
 
 ######################################################################
 
diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 23bd52264c..e73ce04619 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -41,6 +41,8 @@ 
 #endif
 
 #include "qemu/atomic.h"
+#include "qemu/osdep.h"
+#include "qemu/memfd.h"
 
 #include "libvhost-user.h"
 
@@ -53,6 +55,18 @@ 
             _min1 < _min2 ? _min1 : _min2; })
 #endif
 
+/* Round number down to multiple */
+#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
+
+/* Round number up to multiple */
+#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
+
+/* Align each region to cache line size in inflight buffer */
+#define INFLIGHT_ALIGNMENT 64
+
+/* The version of inflight buffer */
+#define INFLIGHT_VERSION 1
+
 #define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
 
 /* The version of the protocol we support */
@@ -66,6 +80,20 @@ 
         }                                       \
     } while (0)
 
+static inline
+bool has_feature(uint64_t features, unsigned int fbit)
+{
+    assert(fbit < 64);
+    return !!(features & (1ULL << fbit));
+}
+
+static inline
+bool vu_has_feature(VuDev *dev,
+                    unsigned int fbit)
+{
+    return has_feature(dev->features, fbit);
+}
+
 static const char *
 vu_request_to_string(unsigned int req)
 {
@@ -100,6 +128,8 @@  vu_request_to_string(unsigned int req)
         REQ(VHOST_USER_POSTCOPY_ADVISE),
         REQ(VHOST_USER_POSTCOPY_LISTEN),
         REQ(VHOST_USER_POSTCOPY_END),
+        REQ(VHOST_USER_GET_INFLIGHT_FD),
+        REQ(VHOST_USER_SET_INFLIGHT_FD),
         REQ(VHOST_USER_MAX),
     };
 #undef REQ
@@ -890,6 +920,41 @@  vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg)
     return true;
 }
 
+static int
+vu_check_queue_inflights(VuDev *dev, VuVirtq *vq)
+{
+    int i = 0;
+
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+
+    vq->used_idx = vq->vring.used->idx;
+    vq->inflight_num = 0;
+    for (i = 0; i < vq->vring.num; i++) {
+        if (vq->inflight->desc[i] == 0) {
+            continue;
+        }
+
+        vq->inflight_desc[vq->inflight_num++] = i;
+        vq->inuse++;
+    }
+    vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
+
+    /* in case of I/O hang after reconnecting */
+    if (eventfd_write(vq->kick_fd, 1) ||
+        eventfd_write(vq->call_fd, 1)) {
+        return -1;
+    }
+
+    return 0;
+}
+
 static bool
 vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -925,6 +990,10 @@  vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg)
                dev->vq[index].kick_fd, index);
     }
 
+    if (vu_check_queue_inflights(dev, &dev->vq[index])) {
+        vu_panic(dev, "Failed to check inflights for vq: %d\n", index);
+    }
+
     return false;
 }
 
@@ -1215,6 +1284,117 @@  vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
     return true;
 }
 
+static bool
+vu_get_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int fd;
+    void *addr;
+    uint64_t mmap_size;
+
+    if (vmsg->size != sizeof(vmsg->payload.inflight)) {
+        vu_panic(dev, "Invalid get_inflight_fd message:%d", vmsg->size);
+        vmsg->payload.inflight.mmap_size = 0;
+        return true;
+    }
+
+    DPRINT("set_inflight_fd num_queues: %"PRId16"\n",
+           vmsg->payload.inflight.num_queues);
+
+    mmap_size = vmsg->payload.inflight.num_queues *
+                ALIGN_UP(sizeof(VuVirtqInflight), INFLIGHT_ALIGNMENT);
+
+    addr = qemu_memfd_alloc("vhost-inflight", mmap_size,
+                            F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
+                            &fd, NULL);
+
+    if (!addr) {
+        vu_panic(dev, "Failed to alloc vhost inflight area");
+        vmsg->payload.inflight.mmap_size = 0;
+        return true;
+    }
+
+    dev->inflight_info.addr = addr;
+    dev->inflight_info.size = vmsg->payload.inflight.mmap_size = mmap_size;
+    vmsg->payload.inflight.mmap_offset = 0;
+    vmsg->payload.inflight.align = INFLIGHT_ALIGNMENT;
+    vmsg->payload.inflight.version = INFLIGHT_VERSION;
+    vmsg->fd_num = 1;
+    dev->inflight_info.fd = vmsg->fds[0] = fd;
+
+    DPRINT("send inflight mmap_size: %"PRId64"\n",
+           vmsg->payload.inflight.mmap_size);
+    DPRINT("send inflight mmap offset: %"PRId64"\n",
+           vmsg->payload.inflight.mmap_offset);
+    DPRINT("send inflight align: %"PRId32"\n",
+           vmsg->payload.inflight.align);
+    DPRINT("send inflight version: %"PRId16"\n",
+           vmsg->payload.inflight.version);
+
+    return true;
+}
+
+static bool
+vu_set_inflight_fd(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int fd, i;
+    uint64_t mmap_size, mmap_offset;
+    uint32_t align;
+    uint16_t num_queues, version;
+    void *rc;
+
+    if (vmsg->fd_num != 1 ||
+        vmsg->size != sizeof(vmsg->payload.inflight)) {
+        vu_panic(dev, "Invalid set_inflight_fd message size:%d fds:%d",
+                 vmsg->size, vmsg->fd_num);
+        return false;
+    }
+
+    fd = vmsg->fds[0];
+    mmap_size = vmsg->payload.inflight.mmap_size;
+    mmap_offset = vmsg->payload.inflight.mmap_offset;
+    align = vmsg->payload.inflight.align;
+    num_queues = vmsg->payload.inflight.num_queues;
+    version = vmsg->payload.inflight.version;
+
+    DPRINT("set_inflight_fd mmap_size: %"PRId64"\n", mmap_size);
+    DPRINT("set_inflight_fd mmap_offset: %"PRId64"\n", mmap_offset);
+    DPRINT("set_inflight_fd align: %"PRId32"\n", align);
+    DPRINT("set_inflight_fd num_queues: %"PRId16"\n", num_queues);
+    DPRINT("set_inflight_fd version: %"PRId16"\n", version);
+
+    rc = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+              fd, mmap_offset);
+
+    if (rc == MAP_FAILED) {
+        vu_panic(dev, "set_inflight_fd mmap error: %s", strerror(errno));
+        return false;
+    }
+
+    if (version != INFLIGHT_VERSION) {
+        vu_panic(dev, "Invalid set_inflight_fd version: %d", version);
+        return false;
+    }
+
+    if (dev->inflight_info.fd) {
+        close(dev->inflight_info.fd);
+    }
+
+    if (dev->inflight_info.addr) {
+        munmap(dev->inflight_info.addr, dev->inflight_info.size);
+    }
+
+    dev->inflight_info.fd = fd;
+    dev->inflight_info.addr = rc;
+    dev->inflight_info.size = mmap_size;
+
+    for (i = 0; i < num_queues; i++) {
+        dev->vq[i].inflight = (VuVirtqInflight *)rc;
+        rc = (void *)((char *)rc + ALIGN_UP(sizeof(VuVirtqInflight), align));
+    }
+
+    return false;
+}
+
 static bool
 vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -1292,6 +1472,10 @@  vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
         return vu_set_postcopy_listen(dev, vmsg);
     case VHOST_USER_POSTCOPY_END:
         return vu_set_postcopy_end(dev, vmsg);
+    case VHOST_USER_GET_INFLIGHT_FD:
+        return vu_get_inflight_fd(dev, vmsg);
+    case VHOST_USER_SET_INFLIGHT_FD:
+        return vu_set_inflight_fd(dev, vmsg);
     default:
         vmsg_close_fds(vmsg);
         vu_panic(dev, "Unhandled request: %d", vmsg->request);
@@ -1359,8 +1543,18 @@  vu_deinit(VuDev *dev)
             close(vq->err_fd);
             vq->err_fd = -1;
         }
+        vq->inflight = NULL;
     }
 
+    if (dev->inflight_info.addr) {
+        munmap(dev->inflight_info.addr, dev->inflight_info.size);
+        dev->inflight_info.addr = NULL;
+    }
+
+    if (dev->inflight_info.fd > 0) {
+        close(dev->inflight_info.fd);
+        dev->inflight_info.fd = -1;
+    }
 
     vu_close_log(dev);
     if (dev->slave_fd != -1) {
@@ -1687,20 +1881,6 @@  vu_queue_empty(VuDev *dev, VuVirtq *vq)
     return vring_avail_idx(vq) == vq->last_avail_idx;
 }
 
-static inline
-bool has_feature(uint64_t features, unsigned int fbit)
-{
-    assert(fbit < 64);
-    return !!(features & (1ULL << fbit));
-}
-
-static inline
-bool vu_has_feature(VuDev *dev,
-                    unsigned int fbit)
-{
-    return has_feature(dev->features, fbit);
-}
-
 static bool
 vring_notify(VuDev *dev, VuVirtq *vq)
 {
@@ -1829,12 +2009,6 @@  virtqueue_map_desc(VuDev *dev,
     *p_num_sg = num_sg;
 }
 
-/* Round number down to multiple */
-#define ALIGN_DOWN(n, m) ((n) / (m) * (m))
-
-/* Round number up to multiple */
-#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
-
 static void *
 virtqueue_alloc_element(size_t sz,
                                      unsigned out_num, unsigned in_num)
@@ -1935,9 +2109,44 @@  vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, size_t sz)
     return elem;
 }
 
+static int
+vu_queue_inflight_get(VuDev *dev, VuVirtq *vq, int desc_idx)
+{
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+
+    vq->inflight->desc[desc_idx] = 1;
+
+    return 0;
+}
+
+static int
+vu_queue_inflight_put(VuDev *dev, VuVirtq *vq, int desc_idx)
+{
+    if (!has_feature(dev->protocol_features,
+        VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
+        return 0;
+    }
+
+    if (unlikely(!vq->inflight)) {
+        return -1;
+    }
+
+    vq->inflight->desc[desc_idx] = 0;
+
+    return 0;
+}
+
 void *
 vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
 {
+    int i;
     unsigned int head;
     VuVirtqElement *elem;
 
@@ -1946,6 +2155,12 @@  vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
         return NULL;
     }
 
+    if (unlikely(vq->inflight_num > 0)) {
+        i = (--vq->inflight_num);
+        elem = vu_queue_map_desc(dev, vq, vq->inflight_desc[i], sz);
+        return elem;
+    }
+
     if (vu_queue_empty(dev, vq)) {
         return NULL;
     }
@@ -1976,6 +2191,8 @@  vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz)
 
     vq->inuse++;
 
+    vu_queue_inflight_get(dev, vq, head);
+
     return elem;
 }
 
@@ -2121,4 +2338,5 @@  vu_queue_push(VuDev *dev, VuVirtq *vq,
 {
     vu_queue_fill(dev, vq, elem, len, 0);
     vu_queue_flush(dev, vq, 1);
+    vu_queue_inflight_put(dev, vq, elem->index);
 }
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index 4aa55b4d2d..5afb80ea5c 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -53,6 +53,7 @@  enum VhostUserProtocolFeature {
     VHOST_USER_PROTOCOL_F_CONFIG = 9,
     VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD = 10,
     VHOST_USER_PROTOCOL_F_HOST_NOTIFIER = 11,
+    VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
 
     VHOST_USER_PROTOCOL_F_MAX
 };
@@ -91,6 +92,8 @@  typedef enum VhostUserRequest {
     VHOST_USER_POSTCOPY_ADVISE  = 28,
     VHOST_USER_POSTCOPY_LISTEN  = 29,
     VHOST_USER_POSTCOPY_END     = 30,
+    VHOST_USER_GET_INFLIGHT_FD = 31,
+    VHOST_USER_SET_INFLIGHT_FD = 32,
     VHOST_USER_MAX
 } VhostUserRequest;
 
@@ -138,6 +141,14 @@  typedef struct VhostUserVringArea {
     uint64_t offset;
 } VhostUserVringArea;
 
+typedef struct VhostUserInflight {
+    uint64_t mmap_size;
+    uint64_t mmap_offset;
+    uint32_t align;
+    uint16_t num_queues;
+    uint16_t version;
+} VhostUserInflight;
+
 #if defined(_WIN32)
 # define VU_PACKED __attribute__((gcc_struct, packed))
 #else
@@ -163,6 +174,7 @@  typedef struct VhostUserMsg {
         VhostUserLog log;
         VhostUserConfig config;
         VhostUserVringArea area;
+        VhostUserInflight inflight;
     } payload;
 
     int fds[VHOST_MEMORY_MAX_NREGIONS];
@@ -234,9 +246,19 @@  typedef struct VuRing {
     uint32_t flags;
 } VuRing;
 
+typedef struct VuVirtqInflight {
+    char desc[VIRTQUEUE_MAX_SIZE];
+} VuVirtqInflight;
+
 typedef struct VuVirtq {
     VuRing vring;
 
+    VuVirtqInflight *inflight;
+
+    uint16_t inflight_desc[VIRTQUEUE_MAX_SIZE];
+
+    uint16_t inflight_num;
+
     /* Next head to pop */
     uint16_t last_avail_idx;
 
@@ -279,11 +301,18 @@  typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition,
                                  vu_watch_cb cb, void *data);
 typedef void (*vu_remove_watch_cb) (VuDev *dev, int fd);
 
+typedef struct VuDevInflightInfo {
+    int fd;
+    void *addr;
+    uint64_t size;
+} VuDevInflightInfo;
+
 struct VuDev {
     int sock;
     uint32_t nregions;
     VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS];
     VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE];
+    VuDevInflightInfo inflight_info;
     int log_call_fd;
     int slave_fd;
     uint64_t log_size;