diff mbox series

[v9,4/5] vhost-user block device backend server

Message ID 20200614183907.514282-5-coiby.xu@gmail.com
State New
Headers show
Series vhost-user block device backend implementation | expand

Commit Message

Coiby Xu June 14, 2020, 6:39 p.m. UTC
By making use of libvhost-user, block device drive can be shared to
the connected vhost-user client. Only one client can connect to the
server one time.

Since vhost-user-server needs a block drive to be created first, delay
the creation of this object.

Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
---
 block/Makefile.objs                  |   1 +
 block/export/vhost-user-blk-server.c | 669 +++++++++++++++++++++++++++
 block/export/vhost-user-blk-server.h |  35 ++
 softmmu/vl.c                         |   4 +
 4 files changed, 709 insertions(+)
 create mode 100644 block/export/vhost-user-blk-server.c
 create mode 100644 block/export/vhost-user-blk-server.h

Comments

Kevin Wolf June 18, 2020, 3:57 p.m. UTC | #1
Am 14.06.2020 um 20:39 hat Coiby Xu geschrieben:
> By making use of libvhost-user, block device drive can be shared to
> the connected vhost-user client. Only one client can connect to the
> server one time.
> 
> Since vhost-user-server needs a block drive to be created first, delay
> the creation of this object.
> 
> Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
> ---
>  block/Makefile.objs                  |   1 +
>  block/export/vhost-user-blk-server.c | 669 +++++++++++++++++++++++++++
>  block/export/vhost-user-blk-server.h |  35 ++
>  softmmu/vl.c                         |   4 +
>  4 files changed, 709 insertions(+)
>  create mode 100644 block/export/vhost-user-blk-server.c
>  create mode 100644 block/export/vhost-user-blk-server.h
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 3635b6b4c1..0eb7eff470 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -24,6 +24,7 @@ block-obj-y += throttle-groups.o
>  block-obj-$(CONFIG_LINUX) += nvme.o
>  
>  block-obj-y += nbd.o
> +block-obj-$(CONFIG_LINUX) += export/vhost-user-blk-server.o ../contrib/libvhost-user/libvhost-user.o
>  block-obj-$(CONFIG_SHEEPDOG) += sheepdog.o
>  block-obj-$(CONFIG_LIBISCSI) += iscsi.o
>  block-obj-$(if $(CONFIG_LIBISCSI),y,n) += iscsi-opts.o
> diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
> new file mode 100644
> index 0000000000..bbf2ceaa9b
> --- /dev/null
> +++ b/block/export/vhost-user-blk-server.c
> @@ -0,0 +1,669 @@
> +/*
> + * Sharing QEMU block devices via vhost-user protocal
> + *
> + * Author: Coiby Xu <coiby.xu@gmail.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +#include "qemu/osdep.h"
> +#include "block/block.h"
> +#include "vhost-user-blk-server.h"
> +#include "qapi/error.h"
> +#include "qom/object_interfaces.h"
> +#include "sysemu/block-backend.h"
> +#include "util/block-helpers.h"
> +
> +enum {
> +    VHOST_USER_BLK_MAX_QUEUES = 1,
> +};
> +struct virtio_blk_inhdr {
> +    unsigned char status;
> +};
> +
> +
> +typedef struct VuBlockReq {
> +    VuVirtqElement *elem;
> +    int64_t sector_num;
> +    size_t size;
> +    struct virtio_blk_inhdr *in;
> +    struct virtio_blk_outhdr out;
> +    VuServer *server;
> +    struct VuVirtq *vq;
> +} VuBlockReq;
> +
> +
> +static void vu_block_req_complete(VuBlockReq *req)
> +{
> +    VuDev *vu_dev = &req->server->vu_dev;
> +
> +    /* IO size with 1 extra status byte */
> +    vu_queue_push(vu_dev, req->vq, req->elem, req->size + 1);
> +    vu_queue_notify(vu_dev, req->vq);
> +
> +    if (req->elem) {
> +        free(req->elem);
> +    }
> +
> +    g_free(req);
> +}
> +
> +static VuBlockDev *get_vu_block_device_by_server(VuServer *server)
> +{
> +    return container_of(server, VuBlockDev, vu_server);
> +}
> +
> +static int coroutine_fn
> +vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
> +                              uint32_t iovcnt, uint32_t type)
> +{
> +    struct virtio_blk_discard_write_zeroes desc;
> +    ssize_t size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
> +    if (unlikely(size != sizeof(desc))) {
> +        error_report("Invalid size %ld, expect %ld", size, sizeof(desc));
> +        return -EINVAL;
> +    }
> +
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
> +    uint64_t range[2] = { le64_to_cpu(desc.sector) << 9,
> +                          le32_to_cpu(desc.num_sectors) << 9 };
> +    if (type == VIRTIO_BLK_T_DISCARD) {
> +        if (blk_co_pdiscard(vdev_blk->backend, range[0], range[1]) == 0) {
> +            return 0;
> +        }
> +    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
> +        if (blk_co_pwrite_zeroes(vdev_blk->backend,
> +                                 range[0], range[1], 0) == 0) {
> +            return 0;
> +        }
> +    }
> +
> +    return -EINVAL;
> +}
> +
> +
> +static void coroutine_fn vu_block_flush(VuBlockReq *req)
> +{
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
> +    BlockBackend *backend = vdev_blk->backend;
> +    blk_co_flush(backend);
> +}
> +
> +
> +struct req_data {
> +    VuServer *server;
> +    VuVirtq *vq;
> +    VuVirtqElement *elem;
> +};
> +
> +static void coroutine_fn vu_block_virtio_process_req(void *opaque)
> +{
> +    struct req_data *data = opaque;
> +    VuServer *server = data->server;
> +    VuVirtq *vq = data->vq;
> +    VuVirtqElement *elem = data->elem;
> +    uint32_t type;
> +    VuBlockReq *req;
> +
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
> +    BlockBackend *backend = vdev_blk->backend;
> +
> +    struct iovec *in_iov = elem->in_sg;
> +    struct iovec *out_iov = elem->out_sg;
> +    unsigned in_num = elem->in_num;
> +    unsigned out_num = elem->out_num;
> +    /* refer to hw/block/virtio_blk.c */
> +    if (elem->out_num < 1 || elem->in_num < 1) {
> +        error_report("virtio-blk request missing headers");
> +        free(elem);
> +        return;
> +    }
> +
> +    req = g_new0(VuBlockReq, 1);
> +    req->server = server;
> +    req->vq = vq;
> +    req->elem = elem;
> +
> +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
> +                            sizeof(req->out)) != sizeof(req->out))) {
> +        error_report("virtio-blk request outhdr too short");
> +        goto err;
> +    }
> +
> +    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
> +
> +    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
> +        error_report("virtio-blk request inhdr too short");
> +        goto err;
> +    }
> +
> +    /* We always touch the last byte, so just see how big in_iov is.  */
> +    req->in = (void *)in_iov[in_num - 1].iov_base
> +              + in_iov[in_num - 1].iov_len
> +              - sizeof(struct virtio_blk_inhdr);
> +    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
> +
> +
> +    type = le32_to_cpu(req->out.type);
> +    switch (type & ~VIRTIO_BLK_T_BARRIER) {
> +    case VIRTIO_BLK_T_IN:
> +    case VIRTIO_BLK_T_OUT: {
> +        ssize_t ret = 0;
> +        bool is_write = type & VIRTIO_BLK_T_OUT;
> +        req->sector_num = le64_to_cpu(req->out.sector);
> +
> +        int64_t offset = req->sector_num * vdev_blk->blk_size;
> +        QEMUIOVector qiov;
> +        if (is_write) {
> +            qemu_iovec_init_external(&qiov, out_iov, out_num);
> +            ret = blk_co_pwritev(backend, offset, qiov.size,
> +                                 &qiov, 0);
> +        } else {
> +            qemu_iovec_init_external(&qiov, in_iov, in_num);
> +            ret = blk_co_preadv(backend, offset, qiov.size,
> +                                &qiov, 0);
> +        }
> +        if (ret >= 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        break;
> +    }
> +    case VIRTIO_BLK_T_FLUSH:
> +        vu_block_flush(req);
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        break;
> +    case VIRTIO_BLK_T_GET_ID: {
> +        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
> +                          VIRTIO_BLK_ID_BYTES);
> +        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk_server");
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        req->size = elem->in_sg[0].iov_len;
> +        break;
> +    }
> +    case VIRTIO_BLK_T_DISCARD:
> +    case VIRTIO_BLK_T_WRITE_ZEROES: {
> +        int rc;
> +        rc = vu_block_discard_write_zeroes(req, &elem->out_sg[1],
> +                                           out_num, type);
> +        if (rc == 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        break;
> +    }
> +    default:
> +        req->in->status = VIRTIO_BLK_S_UNSUPP;
> +        break;
> +    }
> +
> +    vu_block_req_complete(req);
> +    return;
> +
> +err:
> +    free(elem);
> +    g_free(req);
> +    return;
> +}
> +
> +
> +
> +static void vu_block_process_vq(VuDev *vu_dev, int idx)
> +{
> +    VuServer *server;
> +    VuVirtq *vq;
> +
> +    server = container_of(vu_dev, VuServer, vu_dev);
> +    assert(server);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    assert(vq);
> +    VuVirtqElement *elem;
> +    while (1) {
> +        elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) +
> +                                    sizeof(VuBlockReq));
> +        if (elem) {
> +            struct req_data req_data = {
> +                .server = server,
> +                .vq = vq,
> +                .elem = elem
> +            };

This is on the stack of the function.

> +            Coroutine *co = qemu_coroutine_create(vu_block_virtio_process_req,
> +                                                  &req_data);
> +            aio_co_enter(server->ioc->ctx, co);

Therefore, this code is only correct, if co accesses the data only while
the function has not returned yet.

This function is called in the context of vu_dispatch(), which in turn
is called from vu_client_trip(). So we already run in a coroutine. In
this case, aio_co_enter() only schedules co to run after the current
coroutine yields or terminates. In other words, this looks wrong to me
because req_data will be accessed when it's long out of scope.

I think we need to malloc it.

> +        } else {
> +            break;
> +        }
> +    }
> +}
> +
> +static void vu_block_queue_set_started(VuDev *vu_dev, int idx, bool started)
> +{
> +    VuVirtq *vq;
> +
> +    assert(vu_dev);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    vu_set_queue_handler(vu_dev, vq, started ? vu_block_process_vq : NULL);
> +}
> +
> +static uint64_t vu_block_get_features(VuDev *dev)
> +{
> +    uint64_t features;
> +    VuServer *server = container_of(dev, VuServer, vu_dev);
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
> +    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
> +               1ull << VIRTIO_BLK_F_SEG_MAX |
> +               1ull << VIRTIO_BLK_F_TOPOLOGY |
> +               1ull << VIRTIO_BLK_F_BLK_SIZE |
> +               1ull << VIRTIO_BLK_F_FLUSH |
> +               1ull << VIRTIO_BLK_F_DISCARD |
> +               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
> +               1ull << VIRTIO_BLK_F_CONFIG_WCE |
> +               1ull << VIRTIO_F_VERSION_1 |
> +               1ull << VIRTIO_RING_F_INDIRECT_DESC |
> +               1ull << VIRTIO_RING_F_EVENT_IDX |
> +               1ull << VHOST_USER_F_PROTOCOL_FEATURES;
> +
> +    if (!vdev_blk->writable) {
> +        features |= 1ull << VIRTIO_BLK_F_RO;
> +    }
> +
> +    return features;
> +}
> +
> +static uint64_t vu_block_get_protocol_features(VuDev *dev)
> +{
> +    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
> +           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
> +}
> +
> +static int
> +vu_block_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
> +{
> +    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
> +    memcpy(config, &vdev_blk->blkcfg, len);
> +
> +    return 0;
> +}
> +
> +static int
> +vu_block_set_config(VuDev *vu_dev, const uint8_t *data,
> +                    uint32_t offset, uint32_t size, uint32_t flags)
> +{
> +    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
> +    uint8_t wce;
> +
> +    /* don't support live migration */
> +    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
> +        return -EINVAL;
> +    }
> +
> +
> +    if (offset != offsetof(struct virtio_blk_config, wce) ||
> +        size != 1) {
> +        return -EINVAL;
> +    }
> +
> +    wce = *data;
> +    if (wce == vdev_blk->blkcfg.wce) {
> +        /* Do nothing as same with old configuration */
> +        return 0;
> +    }

This check is unnecessary. Nothing bad happens if you set the same value
again.

> +    vdev_blk->blkcfg.wce = wce;
> +    blk_set_enable_write_cache(vdev_blk->backend, wce);
> +    return 0;
> +}
> +
> +
> +/*
> + * When the client disconnects, it sends a VHOST_USER_NONE request
> + * and vu_process_message will simple call exit which cause the VM
> + * to exit abruptly.
> + * To avoid this issue,  process VHOST_USER_NONE request ahead
> + * of vu_process_message.
> + *
> + */
> +static int vu_block_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
> +{
> +    if (vmsg->request == VHOST_USER_NONE) {
> +        dev->panic(dev, "disconnect");
> +        return true;
> +    }
> +    return false;
> +}
> +
> +
> +static const VuDevIface vu_block_iface = {
> +    .get_features          = vu_block_get_features,
> +    .queue_set_started     = vu_block_queue_set_started,
> +    .get_protocol_features = vu_block_get_protocol_features,
> +    .get_config            = vu_block_get_config,
> +    .set_config            = vu_block_set_config,
> +    .process_msg           = vu_block_process_msg,
> +};
> +
> +static void blk_aio_attached(AioContext *ctx, void *opaque)
> +{
> +    VuBlockDev *vub_dev = opaque;
> +    aio_context_acquire(ctx);
> +    vhost_user_server_set_aio_context(ctx, &vub_dev->vu_server);
> +    aio_context_release(ctx);
> +}
> +
> +static void blk_aio_detach(void *opaque)
> +{
> +    VuBlockDev *vub_dev = opaque;
> +    AioContext *ctx = vub_dev->vu_server.ctx;
> +    aio_context_acquire(ctx);
> +    vhost_user_server_set_aio_context(NULL, &vub_dev->vu_server);
> +    aio_context_release(ctx);
> +}
> +
> +
> +static void
> +vu_block_initialize_config(BlockDriverState *bs,
> +                           struct virtio_blk_config *config, uint32_t blk_size)
> +{
> +    config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
> +    config->blk_size = blk_size;
> +    config->size_max = 0;
> +    config->seg_max = 128 - 2;
> +    config->min_io_size = 1;
> +    config->opt_io_size = 1;
> +    config->num_queues = VHOST_USER_BLK_MAX_QUEUES;
> +    config->max_discard_sectors = 32768;
> +    config->max_discard_seg = 1;
> +    config->discard_sector_alignment = config->blk_size >> 9;
> +    config->max_write_zeroes_sectors = 32768;
> +    config->max_write_zeroes_seg = 1;
> +}
> +
> +
> +static VuBlockDev *vu_block_init(VuBlockDev *vu_block_device, Error **errp)
> +{
> +
> +    BlockBackend *blk;
> +    Error *local_error = NULL;
> +    const char *node_name = vu_block_device->node_name;
> +    bool writable = vu_block_device->writable;
> +    /*
> +     * Don't allow resize while the vhost user server is running,
> +     * otherwise we don't care what happens with the node.
> +     */

I think this comment belong to the blk_new() below where the shared
permissions are specified.

> +    uint64_t perm = BLK_PERM_CONSISTENT_READ;
> +    int ret;
> +
> +    AioContext *ctx;
> +
> +    BlockDriverState *bs = bdrv_lookup_bs(node_name, node_name, &local_error);
> +
> +    if (!bs) {
> +        error_propagate(errp, local_error);
> +        return NULL;
> +    }
> +
> +    if (bdrv_is_read_only(bs)) {
> +        writable = false;
> +    }
> +
> +    if (writable) {
> +        perm |= BLK_PERM_WRITE;
> +    }
> +
> +    ctx = bdrv_get_aio_context(bs);
> +    aio_context_acquire(ctx);
> +    bdrv_invalidate_cache(bs, NULL);
> +    aio_context_release(ctx);
> +
> +    blk = blk_new(bdrv_get_aio_context(bs), perm,
> +                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
> +                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
> +    ret = blk_insert_bs(blk, bs, errp);
> +
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    blk_set_enable_write_cache(blk, false);
> +
> +    blk_set_allow_aio_context_change(blk, true);
> +
> +    vu_block_device->blkcfg.wce = 0;
> +    vu_block_device->backend = blk;
> +    if (!vu_block_device->blk_size) {
> +        vu_block_device->blk_size = BDRV_SECTOR_SIZE;
> +    }
> +    vu_block_device->blkcfg.blk_size = vu_block_device->blk_size;
> +    blk_set_guest_block_size(blk, vu_block_device->blk_size);
> +    vu_block_initialize_config(bs, &vu_block_device->blkcfg,
> +                                   vu_block_device->blk_size);
> +    return vu_block_device;
> +
> +fail:
> +    blk_unref(blk);
> +    return NULL;
> +}
> +
> +static void vhost_user_blk_server_stop(VuBlockDev *vu_block_device)
> +{
> +    if (!vu_block_device) {
> +        return;
> +    }
> +
> +    vhost_user_server_stop(&vu_block_device->vu_server);
> +
> +    if (vu_block_device->backend) {
> +        blk_remove_aio_context_notifier(vu_block_device->backend, blk_aio_attached,
> +                                        blk_aio_detach, vu_block_device);
> +    }
> +
> +    blk_unref(vu_block_device->backend);
> +
> +}
> +
> +
> +static void vhost_user_blk_server_start(VuBlockDev *vu_block_device,
> +                                        Error **errp)
> +{
> +    SocketAddress *addr = vu_block_device->addr;
> +
> +    if (!vu_block_init(vu_block_device, errp)) {
> +        return;
> +    }
> +
> +    AioContext *ctx = bdrv_get_aio_context(blk_bs(vu_block_device->backend));

Please move declarations to the top of the function.

> +    if (!vhost_user_server_start(&vu_block_device->vu_server, addr, ctx,
> +                                 VHOST_USER_BLK_MAX_QUEUES,
> +                                 NULL, &vu_block_iface,
> +                                 errp)) {
> +        goto error;
> +    }
> +
> +    blk_add_aio_context_notifier(vu_block_device->backend, blk_aio_attached,
> +                                 blk_aio_detach, vu_block_device);
> +    vu_block_device->running = true;
> +    return;
> +
> + error:
> +    vhost_user_blk_server_stop(vu_block_device);

vu_block_device hasn't been fully set up. You need to undo only
vu_block_init(). You must not call vhost_user_server_stop().

> +}
> +
> +static bool vu_prop_modificable(VuBlockDev *vus, Error **errp)

The word is "modifiable".

> +{
> +    if (vus->running) {
> +            error_setg(errp, "The property can't be modified "
> +                    "while the server is running");
> +            return false;

The indentation is off here.

> +    }
> +    return true;
> +}
> +static void vu_set_node_name(Object *obj, const char *value, Error **errp)
> +{
> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
> +
> +    if (vus->node_name) {
> +        if (!vu_prop_modificable(vus, errp)) {
> +            return;
> +        }

Why don't we need to check vu_prop_modificable() when the property isn't
set yet? I assume it's because the server can't even be started without
a node name, but it would be more obviously correct if the check were
done unconditionally.

> +        g_free(vus->node_name);
> +    }
> +
> +    vus->node_name = g_strdup(value);
> +}
> +
> +static char *vu_get_node_name(Object *obj, Error **errp)
> +{
> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
> +    return g_strdup(vus->node_name);
> +}
> +
> +
> +static void vu_set_unix_socket(Object *obj, const char *value,
> +                               Error **errp)
> +{
> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
> +
> +    if (vus->addr) {
> +        if (!vu_prop_modificable(vus, errp)) {
> +            return;
> +        }

Same here.

> +        g_free(vus->addr->u.q_unix.path);
> +        g_free(vus->addr);
> +    }
> +
> +    SocketAddress *addr = g_new0(SocketAddress, 1);
> +    addr->type = SOCKET_ADDRESS_TYPE_UNIX;
> +    addr->u.q_unix.path = g_strdup(value);
> +    vus->addr = addr;
> +}
> +
> +static char *vu_get_unix_socket(Object *obj, Error **errp)
> +{
> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
> +    return g_strdup(vus->addr->u.q_unix.path);
> +}
> +
> +static bool vu_get_block_writable(Object *obj, Error **errp)
> +{
> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
> +    return vus->writable;
> +}
> +
> +static void vu_set_block_writable(Object *obj, bool value, Error **errp)
> +{
> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
> +
> +    if (!vu_prop_modificable(vus, errp)) {
> +            return;
> +    }
> +
> +    vus->writable = value;
> +}
> +
> +static void vu_get_blk_size(Object *obj, Visitor *v, const char *name,
> +                            void *opaque, Error **errp)
> +{
> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
> +    uint32_t value = vus->blk_size;
> +
> +    visit_type_uint32(v, name, &value, errp);
> +}
> +
> +static void vu_set_blk_size(Object *obj, Visitor *v, const char *name,
> +                            void *opaque, Error **errp)
> +{
> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
> +
> +    Error *local_err = NULL;
> +    uint32_t value;
> +
> +    if (!vu_prop_modificable(vus, errp)) {
> +            return;
> +    }
> +
> +    visit_type_uint32(v, name, &value, &local_err);
> +    if (local_err) {
> +        goto out;
> +    }
> +
> +    check_logical_block_size(object_get_typename(obj), name, value, &local_err);
> +    if (local_err) {
> +        goto out;
> +    }
> +
> +    vus->blk_size = value;
> +
> +out:
> +    error_propagate(errp, local_err);
> +    vus->blk_size = value;

Surely you don't want to set the value here, when some check failed?

> +}
> +
> +
> +static void vhost_user_blk_server_instance_finalize(Object *obj)
> +{
> +    VuBlockDev *vub = VHOST_USER_BLK_SERVER(obj);
> +
> +    vhost_user_blk_server_stop(vub);
> +}
> +
> +static void vhost_user_blk_server_complete(UserCreatable *obj, Error **errp)
> +{
> +    Error *local_error = NULL;
> +    VuBlockDev *vub = VHOST_USER_BLK_SERVER(obj);
> +
> +    vhost_user_blk_server_start(vub, &local_error);
> +
> +    if (local_error) {
> +        error_propagate(errp, local_error);
> +        return;
> +    }

If you don't do anything with local_error (which is named inconsistently
with local_err used above), you can just directly pass errp to
vhost_user_blk_server_start().

> +}
> +
> +static void vhost_user_blk_server_class_init(ObjectClass *klass,
> +                                             void *class_data)
> +{
> +    UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
> +    ucc->complete = vhost_user_blk_server_complete;
> +
> +    object_class_property_add_bool(klass, "writable",
> +                                   vu_get_block_writable,
> +                                   vu_set_block_writable);
> +
> +    object_class_property_add_str(klass, "node-name",
> +                                  vu_get_node_name,
> +                                  vu_set_node_name);
> +
> +    object_class_property_add_str(klass, "unix-socket",
> +                                  vu_get_unix_socket,
> +                                  vu_set_unix_socket);
> +
> +    object_class_property_add(klass, "logical-block-size", "uint32",
> +                              vu_get_blk_size, vu_set_blk_size,
> +                              NULL, NULL);
> +}
> +
> +static const TypeInfo vhost_user_blk_server_info = {
> +    .name = TYPE_VHOST_USER_BLK_SERVER,
> +    .parent = TYPE_OBJECT,
> +    .instance_size = sizeof(VuBlockDev),
> +    .instance_finalize = vhost_user_blk_server_instance_finalize,
> +    .class_init = vhost_user_blk_server_class_init,
> +    .interfaces = (InterfaceInfo[]) {
> +        {TYPE_USER_CREATABLE},
> +        {}
> +    },
> +};
> +
> +static void vhost_user_blk_server_register_types(void)
> +{
> +    type_register_static(&vhost_user_blk_server_info);
> +}
> +

Please remove the trailing empty line.

Compared to the last version that I reviewed, this seems to get the
architecture for concurrent requests right, which is an important
improvement. I feel we're getting quite close to mergable now.

Kevin
Coiby Xu Aug. 17, 2020, 12:30 p.m. UTC | #2
On Thu, Jun 18, 2020 at 05:57:40PM +0200, Kevin Wolf wrote:
>Am 14.06.2020 um 20:39 hat Coiby Xu geschrieben:
>> By making use of libvhost-user, block device drive can be shared to
>> the connected vhost-user client. Only one client can connect to the
>> server one time.
>>
>> Since vhost-user-server needs a block drive to be created first, delay
>> the creation of this object.
>>
>> Signed-off-by: Coiby Xu <coiby.xu@gmail.com>
>> ---
>>  block/Makefile.objs                  |   1 +
>>  block/export/vhost-user-blk-server.c | 669 +++++++++++++++++++++++++++
>>  block/export/vhost-user-blk-server.h |  35 ++
>>  softmmu/vl.c                         |   4 +
>>  4 files changed, 709 insertions(+)
>>  create mode 100644 block/export/vhost-user-blk-server.c
>>  create mode 100644 block/export/vhost-user-blk-server.h
>>
>> diff --git a/block/Makefile.objs b/block/Makefile.objs
>> index 3635b6b4c1..0eb7eff470 100644
>> --- a/block/Makefile.objs
>> +++ b/block/Makefile.objs
>> @@ -24,6 +24,7 @@ block-obj-y += throttle-groups.o
>>  block-obj-$(CONFIG_LINUX) += nvme.o
>>
>>  block-obj-y += nbd.o
>> +block-obj-$(CONFIG_LINUX) += export/vhost-user-blk-server.o ../contrib/libvhost-user/libvhost-user.o
>>  block-obj-$(CONFIG_SHEEPDOG) += sheepdog.o
>>  block-obj-$(CONFIG_LIBISCSI) += iscsi.o
>>  block-obj-$(if $(CONFIG_LIBISCSI),y,n) += iscsi-opts.o
>> diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
>> new file mode 100644
>> index 0000000000..bbf2ceaa9b
>> --- /dev/null
>> +++ b/block/export/vhost-user-blk-server.c
>> @@ -0,0 +1,669 @@
>> +/*
>> + * Sharing QEMU block devices via vhost-user protocal
>> + *
>> + * Author: Coiby Xu <coiby.xu@gmail.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>> + * later.  See the COPYING file in the top-level directory.
>> + */
>> +#include "qemu/osdep.h"
>> +#include "block/block.h"
>> +#include "vhost-user-blk-server.h"
>> +#include "qapi/error.h"
>> +#include "qom/object_interfaces.h"
>> +#include "sysemu/block-backend.h"
>> +#include "util/block-helpers.h"
>> +
>> +enum {
>> +    VHOST_USER_BLK_MAX_QUEUES = 1,
>> +};
>> +struct virtio_blk_inhdr {
>> +    unsigned char status;
>> +};
>> +
>> +
>> +typedef struct VuBlockReq {
>> +    VuVirtqElement *elem;
>> +    int64_t sector_num;
>> +    size_t size;
>> +    struct virtio_blk_inhdr *in;
>> +    struct virtio_blk_outhdr out;
>> +    VuServer *server;
>> +    struct VuVirtq *vq;
>> +} VuBlockReq;
>> +
>> +
>> +static void vu_block_req_complete(VuBlockReq *req)
>> +{
>> +    VuDev *vu_dev = &req->server->vu_dev;
>> +
>> +    /* IO size with 1 extra status byte */
>> +    vu_queue_push(vu_dev, req->vq, req->elem, req->size + 1);
>> +    vu_queue_notify(vu_dev, req->vq);
>> +
>> +    if (req->elem) {
>> +        free(req->elem);
>> +    }
>> +
>> +    g_free(req);
>> +}
>> +
>> +static VuBlockDev *get_vu_block_device_by_server(VuServer *server)
>> +{
>> +    return container_of(server, VuBlockDev, vu_server);
>> +}
>> +
>> +static int coroutine_fn
>> +vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
>> +                              uint32_t iovcnt, uint32_t type)
>> +{
>> +    struct virtio_blk_discard_write_zeroes desc;
>> +    ssize_t size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
>> +    if (unlikely(size != sizeof(desc))) {
>> +        error_report("Invalid size %ld, expect %ld", size, sizeof(desc));
>> +        return -EINVAL;
>> +    }
>> +
>> +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
>> +    uint64_t range[2] = { le64_to_cpu(desc.sector) << 9,
>> +                          le32_to_cpu(desc.num_sectors) << 9 };
>> +    if (type == VIRTIO_BLK_T_DISCARD) {
>> +        if (blk_co_pdiscard(vdev_blk->backend, range[0], range[1]) == 0) {
>> +            return 0;
>> +        }
>> +    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
>> +        if (blk_co_pwrite_zeroes(vdev_blk->backend,
>> +                                 range[0], range[1], 0) == 0) {
>> +            return 0;
>> +        }
>> +    }
>> +
>> +    return -EINVAL;
>> +}
>> +
>> +
>> +static void coroutine_fn vu_block_flush(VuBlockReq *req)
>> +{
>> +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
>> +    BlockBackend *backend = vdev_blk->backend;
>> +    blk_co_flush(backend);
>> +}
>> +
>> +
>> +struct req_data {
>> +    VuServer *server;
>> +    VuVirtq *vq;
>> +    VuVirtqElement *elem;
>> +};
>> +
>> +static void coroutine_fn vu_block_virtio_process_req(void *opaque)
>> +{
>> +    struct req_data *data = opaque;
>> +    VuServer *server = data->server;
>> +    VuVirtq *vq = data->vq;
>> +    VuVirtqElement *elem = data->elem;
>> +    uint32_t type;
>> +    VuBlockReq *req;
>> +
>> +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
>> +    BlockBackend *backend = vdev_blk->backend;
>> +
>> +    struct iovec *in_iov = elem->in_sg;
>> +    struct iovec *out_iov = elem->out_sg;
>> +    unsigned in_num = elem->in_num;
>> +    unsigned out_num = elem->out_num;
>> +    /* refer to hw/block/virtio_blk.c */
>> +    if (elem->out_num < 1 || elem->in_num < 1) {
>> +        error_report("virtio-blk request missing headers");
>> +        free(elem);
>> +        return;
>> +    }
>> +
>> +    req = g_new0(VuBlockReq, 1);
>> +    req->server = server;
>> +    req->vq = vq;
>> +    req->elem = elem;
>> +
>> +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
>> +                            sizeof(req->out)) != sizeof(req->out))) {
>> +        error_report("virtio-blk request outhdr too short");
>> +        goto err;
>> +    }
>> +
>> +    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
>> +
>> +    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
>> +        error_report("virtio-blk request inhdr too short");
>> +        goto err;
>> +    }
>> +
>> +    /* We always touch the last byte, so just see how big in_iov is.  */
>> +    req->in = (void *)in_iov[in_num - 1].iov_base
>> +              + in_iov[in_num - 1].iov_len
>> +              - sizeof(struct virtio_blk_inhdr);
>> +    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
>> +
>> +
>> +    type = le32_to_cpu(req->out.type);
>> +    switch (type & ~VIRTIO_BLK_T_BARRIER) {
>> +    case VIRTIO_BLK_T_IN:
>> +    case VIRTIO_BLK_T_OUT: {
>> +        ssize_t ret = 0;
>> +        bool is_write = type & VIRTIO_BLK_T_OUT;
>> +        req->sector_num = le64_to_cpu(req->out.sector);
>> +
>> +        int64_t offset = req->sector_num * vdev_blk->blk_size;
>> +        QEMUIOVector qiov;
>> +        if (is_write) {
>> +            qemu_iovec_init_external(&qiov, out_iov, out_num);
>> +            ret = blk_co_pwritev(backend, offset, qiov.size,
>> +                                 &qiov, 0);
>> +        } else {
>> +            qemu_iovec_init_external(&qiov, in_iov, in_num);
>> +            ret = blk_co_preadv(backend, offset, qiov.size,
>> +                                &qiov, 0);
>> +        }
>> +        if (ret >= 0) {
>> +            req->in->status = VIRTIO_BLK_S_OK;
>> +        } else {
>> +            req->in->status = VIRTIO_BLK_S_IOERR;
>> +        }
>> +        break;
>> +    }
>> +    case VIRTIO_BLK_T_FLUSH:
>> +        vu_block_flush(req);
>> +        req->in->status = VIRTIO_BLK_S_OK;
>> +        break;
>> +    case VIRTIO_BLK_T_GET_ID: {
>> +        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
>> +                          VIRTIO_BLK_ID_BYTES);
>> +        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk_server");
>> +        req->in->status = VIRTIO_BLK_S_OK;
>> +        req->size = elem->in_sg[0].iov_len;
>> +        break;
>> +    }
>> +    case VIRTIO_BLK_T_DISCARD:
>> +    case VIRTIO_BLK_T_WRITE_ZEROES: {
>> +        int rc;
>> +        rc = vu_block_discard_write_zeroes(req, &elem->out_sg[1],
>> +                                           out_num, type);
>> +        if (rc == 0) {
>> +            req->in->status = VIRTIO_BLK_S_OK;
>> +        } else {
>> +            req->in->status = VIRTIO_BLK_S_IOERR;
>> +        }
>> +        break;
>> +    }
>> +    default:
>> +        req->in->status = VIRTIO_BLK_S_UNSUPP;
>> +        break;
>> +    }
>> +
>> +    vu_block_req_complete(req);
>> +    return;
>> +
>> +err:
>> +    free(elem);
>> +    g_free(req);
>> +    return;
>> +}
>> +
>> +
>> +
>> +static void vu_block_process_vq(VuDev *vu_dev, int idx)
>> +{
>> +    VuServer *server;
>> +    VuVirtq *vq;
>> +
>> +    server = container_of(vu_dev, VuServer, vu_dev);
>> +    assert(server);
>> +
>> +    vq = vu_get_queue(vu_dev, idx);
>> +    assert(vq);
>> +    VuVirtqElement *elem;
>> +    while (1) {
>> +        elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) +
>> +                                    sizeof(VuBlockReq));
>> +        if (elem) {
>> +            struct req_data req_data = {
>> +                .server = server,
>> +                .vq = vq,
>> +                .elem = elem
>> +            };
>
>This is on the stack of the function.
>
>> +            Coroutine *co = qemu_coroutine_create(vu_block_virtio_process_req,
>> +                                                  &req_data);
>> +            aio_co_enter(server->ioc->ctx, co);
>
>Therefore, this code is only correct, if co accesses the data only while
>the function has not returned yet.
>
>This function is called in the context of vu_dispatch(), which in turn
>is called from vu_client_trip(). So we already run in a coroutine. In
>this case, aio_co_enter() only schedules co to run after the current
>coroutine yields or terminates. In other words, this looks wrong to me
>because req_data will be accessed when it's long out of scope.
>
>I think we need to malloc it.

vu_dispatch is only for reading vhost-user message. This function is called
by the kick handler which is no longer run as a coroutine since I
followed the advice to make better use of contrib/libvhost. Although
no error has appeared or been reported by address sanitizer with
--enable-sanitizers, using malloc is the correct way. I've fixed it in
v10.

>
>> +        } else {
>> +            break;
>> +        }
>> +    }
>> +}
>> +
>> +static void vu_block_queue_set_started(VuDev *vu_dev, int idx, bool started)
>> +{
>> +    VuVirtq *vq;
>> +
>> +    assert(vu_dev);
>> +
>> +    vq = vu_get_queue(vu_dev, idx);
>> +    vu_set_queue_handler(vu_dev, vq, started ? vu_block_process_vq : NULL);
>> +}
>> +
>> +static uint64_t vu_block_get_features(VuDev *dev)
>> +{
>> +    uint64_t features;
>> +    VuServer *server = container_of(dev, VuServer, vu_dev);
>> +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
>> +    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
>> +               1ull << VIRTIO_BLK_F_SEG_MAX |
>> +               1ull << VIRTIO_BLK_F_TOPOLOGY |
>> +               1ull << VIRTIO_BLK_F_BLK_SIZE |
>> +               1ull << VIRTIO_BLK_F_FLUSH |
>> +               1ull << VIRTIO_BLK_F_DISCARD |
>> +               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
>> +               1ull << VIRTIO_BLK_F_CONFIG_WCE |
>> +               1ull << VIRTIO_F_VERSION_1 |
>> +               1ull << VIRTIO_RING_F_INDIRECT_DESC |
>> +               1ull << VIRTIO_RING_F_EVENT_IDX |
>> +               1ull << VHOST_USER_F_PROTOCOL_FEATURES;
>> +
>> +    if (!vdev_blk->writable) {
>> +        features |= 1ull << VIRTIO_BLK_F_RO;
>> +    }
>> +
>> +    return features;
>> +}
>> +
>> +static uint64_t vu_block_get_protocol_features(VuDev *dev)
>> +{
>> +    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
>> +           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
>> +}
>> +
>> +static int
>> +vu_block_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
>> +{
>> +    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
>> +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
>> +    memcpy(config, &vdev_blk->blkcfg, len);
>> +
>> +    return 0;
>> +}
>> +
>> +static int
>> +vu_block_set_config(VuDev *vu_dev, const uint8_t *data,
>> +                    uint32_t offset, uint32_t size, uint32_t flags)
>> +{
>> +    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
>> +    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
>> +    uint8_t wce;
>> +
>> +    /* don't support live migration */
>> +    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
>> +        return -EINVAL;
>> +    }
>> +
>> +
>> +    if (offset != offsetof(struct virtio_blk_config, wce) ||
>> +        size != 1) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    wce = *data;
>> +    if (wce == vdev_blk->blkcfg.wce) {
>> +        /* Do nothing as same with old configuration */
>> +        return 0;
>> +    }
>
>This check is unnecessary. Nothing bad happens if you set the same value
>again.

Yes, removing it also simplifies the code.

>
>> +    vdev_blk->blkcfg.wce = wce;
>> +    blk_set_enable_write_cache(vdev_blk->backend, wce);
>> +    return 0;
>> +}
>> +
>> +
>> +/*
>> + * When the client disconnects, it sends a VHOST_USER_NONE request
>> + * and vu_process_message will simple call exit which cause the VM
>> + * to exit abruptly.
>> + * To avoid this issue,  process VHOST_USER_NONE request ahead
>> + * of vu_process_message.
>> + *
>> + */
>> +static int vu_block_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
>> +{
>> +    if (vmsg->request == VHOST_USER_NONE) {
>> +        dev->panic(dev, "disconnect");
>> +        return true;
>> +    }
>> +    return false;
>> +}
>> +
>> +
>> +static const VuDevIface vu_block_iface = {
>> +    .get_features          = vu_block_get_features,
>> +    .queue_set_started     = vu_block_queue_set_started,
>> +    .get_protocol_features = vu_block_get_protocol_features,
>> +    .get_config            = vu_block_get_config,
>> +    .set_config            = vu_block_set_config,
>> +    .process_msg           = vu_block_process_msg,
>> +};
>> +
>> +static void blk_aio_attached(AioContext *ctx, void *opaque)
>> +{
>> +    VuBlockDev *vub_dev = opaque;
>> +    aio_context_acquire(ctx);
>> +    vhost_user_server_set_aio_context(ctx, &vub_dev->vu_server);
>> +    aio_context_release(ctx);
>> +}
>> +
>> +static void blk_aio_detach(void *opaque)
>> +{
>> +    VuBlockDev *vub_dev = opaque;
>> +    AioContext *ctx = vub_dev->vu_server.ctx;
>> +    aio_context_acquire(ctx);
>> +    vhost_user_server_set_aio_context(NULL, &vub_dev->vu_server);
>> +    aio_context_release(ctx);
>> +}
>> +
>> +
>> +static void
>> +vu_block_initialize_config(BlockDriverState *bs,
>> +                           struct virtio_blk_config *config, uint32_t blk_size)
>> +{
>> +    config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
>> +    config->blk_size = blk_size;
>> +    config->size_max = 0;
>> +    config->seg_max = 128 - 2;
>> +    config->min_io_size = 1;
>> +    config->opt_io_size = 1;
>> +    config->num_queues = VHOST_USER_BLK_MAX_QUEUES;
>> +    config->max_discard_sectors = 32768;
>> +    config->max_discard_seg = 1;
>> +    config->discard_sector_alignment = config->blk_size >> 9;
>> +    config->max_write_zeroes_sectors = 32768;
>> +    config->max_write_zeroes_seg = 1;
>> +}
>> +
>> +
>> +static VuBlockDev *vu_block_init(VuBlockDev *vu_block_device, Error **errp)
>> +{
>> +
>> +    BlockBackend *blk;
>> +    Error *local_error = NULL;
>> +    const char *node_name = vu_block_device->node_name;
>> +    bool writable = vu_block_device->writable;
>> +    /*
>> +     * Don't allow resize while the vhost user server is running,
>> +     * otherwise we don't care what happens with the node.
>> +     */
>
>I think this comment belong to the blk_new() below where the shared
>permissions are specified.

Yes, it makes more sense to put the comment right above blk_new)_.

>
>> +    uint64_t perm = BLK_PERM_CONSISTENT_READ;
>> +    int ret;
>> +
>> +    AioContext *ctx;
>> +
>> +    BlockDriverState *bs = bdrv_lookup_bs(node_name, node_name, &local_error);
>> +
>> +    if (!bs) {
>> +        error_propagate(errp, local_error);
>> +        return NULL;
>> +    }
>> +
>> +    if (bdrv_is_read_only(bs)) {
>> +        writable = false;
>> +    }
>> +
>> +    if (writable) {
>> +        perm |= BLK_PERM_WRITE;
>> +    }
>> +
>> +    ctx = bdrv_get_aio_context(bs);
>> +    aio_context_acquire(ctx);
>> +    bdrv_invalidate_cache(bs, NULL);
>> +    aio_context_release(ctx);
>> +
>> +    blk = blk_new(bdrv_get_aio_context(bs), perm,
>> +                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
>> +                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
>> +    ret = blk_insert_bs(blk, bs, errp);
>> +
>> +    if (ret < 0) {
>> +        goto fail;
>> +    }
>> +
>> +    blk_set_enable_write_cache(blk, false);
>> +
>> +    blk_set_allow_aio_context_change(blk, true);
>> +
>> +    vu_block_device->blkcfg.wce = 0;
>> +    vu_block_device->backend = blk;
>> +    if (!vu_block_device->blk_size) {
>> +        vu_block_device->blk_size = BDRV_SECTOR_SIZE;
>> +    }
>> +    vu_block_device->blkcfg.blk_size = vu_block_device->blk_size;
>> +    blk_set_guest_block_size(blk, vu_block_device->blk_size);
>> +    vu_block_initialize_config(bs, &vu_block_device->blkcfg,
>> +                                   vu_block_device->blk_size);
>> +    return vu_block_device;
>> +
>> +fail:
>> +    blk_unref(blk);
>> +    return NULL;
>> +}
>> +
>> +static void vhost_user_blk_server_stop(VuBlockDev *vu_block_device)
>> +{
>> +    if (!vu_block_device) {
>> +        return;
>> +    }
>> +
>> +    vhost_user_server_stop(&vu_block_device->vu_server);
>> +
>> +    if (vu_block_device->backend) {
>> +        blk_remove_aio_context_notifier(vu_block_device->backend, blk_aio_attached,
>> +                                        blk_aio_detach, vu_block_device);
>> +    }
>> +
>> +    blk_unref(vu_block_device->backend);
>> +
>> +}
>> +
>> +
>> +static void vhost_user_blk_server_start(VuBlockDev *vu_block_device,
>> +                                        Error **errp)
>> +{
>> +    SocketAddress *addr = vu_block_device->addr;
>> +
>> +    if (!vu_block_init(vu_block_device, errp)) {
>> +        return;
>> +    }
>> +
>> +    AioContext *ctx = bdrv_get_aio_context(blk_bs(vu_block_device->backend));
>
>Please move declarations to the top of the function.

Been fixed in v10.

>
>> +    if (!vhost_user_server_start(&vu_block_device->vu_server, addr, ctx,
>> +                                 VHOST_USER_BLK_MAX_QUEUES,
>> +                                 NULL, &vu_block_iface,
>> +                                 errp)) {
>> +        goto error;
>> +    }
>> +
>> +    blk_add_aio_context_notifier(vu_block_device->backend, blk_aio_attached,
>> +                                 blk_aio_detach, vu_block_device);
>> +    vu_block_device->running = true;
>> +    return;
>> +
>> + error:
>> +    vhost_user_blk_server_stop(vu_block_device);
>
>vu_block_device hasn't been fully set up. You need to undo only
>vu_block_init(). You must not call vhost_user_server_stop().
>
>> +}
>> +
>> +static bool vu_prop_modificable(VuBlockDev *vus, Error **errp)
>
>The word is "modifiable".

Thank you for correcting my spelling!

>
>> +{
>> +    if (vus->running) {
>> +            error_setg(errp, "The property can't be modified "
>> +                    "while the server is running");
>> +            return false;
>
>The indentation is off here.
>
>> +    }
>> +    return true;
>> +}
>> +static void vu_set_node_name(Object *obj, const char *value, Error **errp)
>> +{
>> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
>> +
>> +    if (vus->node_name) {
>> +        if (!vu_prop_modificable(vus, errp)) {
>> +            return;
>> +        }
>
>Why don't we need to check vu_prop_modificable() when the property isn't
>set yet? I assume it's because the server can't even be started without
>a node name, but it would be more obviously correct if the check were
>done unconditionally.
>
>> +        g_free(vus->node_name);
>> +    }
>> +
>> +    vus->node_name = g_strdup(value);
>> +}
>> +
>> +static char *vu_get_node_name(Object *obj, Error **errp)
>> +{
>> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
>> +    return g_strdup(vus->node_name);
>> +}
>> +
>> +
>> +static void vu_set_unix_socket(Object *obj, const char *value,
>> +                               Error **errp)
>> +{
>> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
>> +
>> +    if (vus->addr) {
>> +        if (!vu_prop_modificable(vus, errp)) {
>> +            return;
>> +        }
>
>Same here.

It makes the code more readable. Thank you!

>
>> +        g_free(vus->addr->u.q_unix.path);
>> +        g_free(vus->addr);
>> +    }
>> +
>> +    SocketAddress *addr = g_new0(SocketAddress, 1);
>> +    addr->type = SOCKET_ADDRESS_TYPE_UNIX;
>> +    addr->u.q_unix.path = g_strdup(value);
>> +    vus->addr = addr;
>> +}
>> +
>> +static char *vu_get_unix_socket(Object *obj, Error **errp)
>> +{
>> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
>> +    return g_strdup(vus->addr->u.q_unix.path);
>> +}
>> +
>> +static bool vu_get_block_writable(Object *obj, Error **errp)
>> +{
>> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
>> +    return vus->writable;
>> +}
>> +
>> +static void vu_set_block_writable(Object *obj, bool value, Error **errp)
>> +{
>> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
>> +
>> +    if (!vu_prop_modificable(vus, errp)) {
>> +            return;
>> +    }
>> +
>> +    vus->writable = value;
>> +}
>> +
>> +static void vu_get_blk_size(Object *obj, Visitor *v, const char *name,
>> +                            void *opaque, Error **errp)
>> +{
>> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
>> +    uint32_t value = vus->blk_size;
>> +
>> +    visit_type_uint32(v, name, &value, errp);
>> +}
>> +
>> +static void vu_set_blk_size(Object *obj, Visitor *v, const char *name,
>> +                            void *opaque, Error **errp)
>> +{
>> +    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
>> +
>> +    Error *local_err = NULL;
>> +    uint32_t value;
>> +
>> +    if (!vu_prop_modificable(vus, errp)) {
>> +            return;
>> +    }
>> +
>> +    visit_type_uint32(v, name, &value, &local_err);
>> +    if (local_err) {
>> +        goto out;
>> +    }
>> +
>> +    check_logical_block_size(object_get_typename(obj), name, value, &local_err);
>> +    if (local_err) {
>> +        goto out;
>> +    }
>> +
>> +    vus->blk_size = value;
>> +
>> +out:
>> +    error_propagate(errp, local_err);
>> +    vus->blk_size = value;
>
>Surely you don't want to set the value here, when some check failed?

Yes, I must have left this line of code here by mistake.

>
>> +}
>> +
>> +
>> +static void vhost_user_blk_server_instance_finalize(Object *obj)
>> +{
>> +    VuBlockDev *vub = VHOST_USER_BLK_SERVER(obj);
>> +
>> +    vhost_user_blk_server_stop(vub);
>> +}
>> +
>> +static void vhost_user_blk_server_complete(UserCreatable *obj, Error **errp)
>> +{
>> +    Error *local_error = NULL;
>> +    VuBlockDev *vub = VHOST_USER_BLK_SERVER(obj);
>> +
>> +    vhost_user_blk_server_start(vub, &local_error);
>> +
>> +    if (local_error) {
>> +        error_propagate(errp, local_error);
>> +        return;
>> +    }
>
>If you don't do anything with local_error (which is named inconsistently
>with local_err used above), you can just directly pass errp to
>vhost_user_blk_server_start().
>
>> +}
>> +
>> +static void vhost_user_blk_server_class_init(ObjectClass *klass,
>> +                                             void *class_data)
>> +{
>> +    UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
>> +    ucc->complete = vhost_user_blk_server_complete;
>> +
>> +    object_class_property_add_bool(klass, "writable",
>> +                                   vu_get_block_writable,
>> +                                   vu_set_block_writable);
>> +
>> +    object_class_property_add_str(klass, "node-name",
>> +                                  vu_get_node_name,
>> +                                  vu_set_node_name);
>> +
>> +    object_class_property_add_str(klass, "unix-socket",
>> +                                  vu_get_unix_socket,
>> +                                  vu_set_unix_socket);
>> +
>> +    object_class_property_add(klass, "logical-block-size", "uint32",
>> +                              vu_get_blk_size, vu_set_blk_size,
>> +                              NULL, NULL);
>> +}
>> +
>> +static const TypeInfo vhost_user_blk_server_info = {
>> +    .name = TYPE_VHOST_USER_BLK_SERVER,
>> +    .parent = TYPE_OBJECT,
>> +    .instance_size = sizeof(VuBlockDev),
>> +    .instance_finalize = vhost_user_blk_server_instance_finalize,
>> +    .class_init = vhost_user_blk_server_class_init,
>> +    .interfaces = (InterfaceInfo[]) {
>> +        {TYPE_USER_CREATABLE},
>> +        {}
>> +    },
>> +};
>> +
>> +static void vhost_user_blk_server_register_types(void)
>> +{
>> +    type_register_static(&vhost_user_blk_server_info);
>> +}
>> +
>
>Please remove the trailing empty line.
>
>Compared to the last version that I reviewed, this seems to get the
>architecture for concurrent requests right, which is an important
>improvement. I feel we're getting quite close to mergable now.
>
>Kevin
>
Thank you for the guidance and helpful feedbacks along the way:)


--
Best regards,
Coiby
diff mbox series

Patch

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 3635b6b4c1..0eb7eff470 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -24,6 +24,7 @@  block-obj-y += throttle-groups.o
 block-obj-$(CONFIG_LINUX) += nvme.o
 
 block-obj-y += nbd.o
+block-obj-$(CONFIG_LINUX) += export/vhost-user-blk-server.o ../contrib/libvhost-user/libvhost-user.o
 block-obj-$(CONFIG_SHEEPDOG) += sheepdog.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(if $(CONFIG_LIBISCSI),y,n) += iscsi-opts.o
diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
new file mode 100644
index 0000000000..bbf2ceaa9b
--- /dev/null
+++ b/block/export/vhost-user-blk-server.c
@@ -0,0 +1,669 @@ 
+/*
+ * Sharing QEMU block devices via vhost-user protocal
+ *
+ * Author: Coiby Xu <coiby.xu@gmail.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "vhost-user-blk-server.h"
+#include "qapi/error.h"
+#include "qom/object_interfaces.h"
+#include "sysemu/block-backend.h"
+#include "util/block-helpers.h"
+
+enum {
+    VHOST_USER_BLK_MAX_QUEUES = 1,
+};
+struct virtio_blk_inhdr {
+    unsigned char status;
+};
+
+
+typedef struct VuBlockReq {
+    VuVirtqElement *elem;
+    int64_t sector_num;
+    size_t size;
+    struct virtio_blk_inhdr *in;
+    struct virtio_blk_outhdr out;
+    VuServer *server;
+    struct VuVirtq *vq;
+} VuBlockReq;
+
+
+static void vu_block_req_complete(VuBlockReq *req)
+{
+    VuDev *vu_dev = &req->server->vu_dev;
+
+    /* IO size with 1 extra status byte */
+    vu_queue_push(vu_dev, req->vq, req->elem, req->size + 1);
+    vu_queue_notify(vu_dev, req->vq);
+
+    if (req->elem) {
+        free(req->elem);
+    }
+
+    g_free(req);
+}
+
+static VuBlockDev *get_vu_block_device_by_server(VuServer *server)
+{
+    return container_of(server, VuBlockDev, vu_server);
+}
+
+static int coroutine_fn
+vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
+                              uint32_t iovcnt, uint32_t type)
+{
+    struct virtio_blk_discard_write_zeroes desc;
+    ssize_t size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
+    if (unlikely(size != sizeof(desc))) {
+        error_report("Invalid size %ld, expect %ld", size, sizeof(desc));
+        return -EINVAL;
+    }
+
+    VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
+    uint64_t range[2] = { le64_to_cpu(desc.sector) << 9,
+                          le32_to_cpu(desc.num_sectors) << 9 };
+    if (type == VIRTIO_BLK_T_DISCARD) {
+        if (blk_co_pdiscard(vdev_blk->backend, range[0], range[1]) == 0) {
+            return 0;
+        }
+    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
+        if (blk_co_pwrite_zeroes(vdev_blk->backend,
+                                 range[0], range[1], 0) == 0) {
+            return 0;
+        }
+    }
+
+    return -EINVAL;
+}
+
+
+static void coroutine_fn vu_block_flush(VuBlockReq *req)
+{
+    VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
+    BlockBackend *backend = vdev_blk->backend;
+    blk_co_flush(backend);
+}
+
+
+struct req_data {
+    VuServer *server;
+    VuVirtq *vq;
+    VuVirtqElement *elem;
+};
+
+static void coroutine_fn vu_block_virtio_process_req(void *opaque)
+{
+    struct req_data *data = opaque;
+    VuServer *server = data->server;
+    VuVirtq *vq = data->vq;
+    VuVirtqElement *elem = data->elem;
+    uint32_t type;
+    VuBlockReq *req;
+
+    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
+    BlockBackend *backend = vdev_blk->backend;
+
+    struct iovec *in_iov = elem->in_sg;
+    struct iovec *out_iov = elem->out_sg;
+    unsigned in_num = elem->in_num;
+    unsigned out_num = elem->out_num;
+    /* refer to hw/block/virtio_blk.c */
+    if (elem->out_num < 1 || elem->in_num < 1) {
+        error_report("virtio-blk request missing headers");
+        free(elem);
+        return;
+    }
+
+    req = g_new0(VuBlockReq, 1);
+    req->server = server;
+    req->vq = vq;
+    req->elem = elem;
+
+    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
+                            sizeof(req->out)) != sizeof(req->out))) {
+        error_report("virtio-blk request outhdr too short");
+        goto err;
+    }
+
+    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
+
+    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
+        error_report("virtio-blk request inhdr too short");
+        goto err;
+    }
+
+    /* We always touch the last byte, so just see how big in_iov is.  */
+    req->in = (void *)in_iov[in_num - 1].iov_base
+              + in_iov[in_num - 1].iov_len
+              - sizeof(struct virtio_blk_inhdr);
+    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
+
+
+    type = le32_to_cpu(req->out.type);
+    switch (type & ~VIRTIO_BLK_T_BARRIER) {
+    case VIRTIO_BLK_T_IN:
+    case VIRTIO_BLK_T_OUT: {
+        ssize_t ret = 0;
+        bool is_write = type & VIRTIO_BLK_T_OUT;
+        req->sector_num = le64_to_cpu(req->out.sector);
+
+        int64_t offset = req->sector_num * vdev_blk->blk_size;
+        QEMUIOVector qiov;
+        if (is_write) {
+            qemu_iovec_init_external(&qiov, out_iov, out_num);
+            ret = blk_co_pwritev(backend, offset, qiov.size,
+                                 &qiov, 0);
+        } else {
+            qemu_iovec_init_external(&qiov, in_iov, in_num);
+            ret = blk_co_preadv(backend, offset, qiov.size,
+                                &qiov, 0);
+        }
+        if (ret >= 0) {
+            req->in->status = VIRTIO_BLK_S_OK;
+        } else {
+            req->in->status = VIRTIO_BLK_S_IOERR;
+        }
+        break;
+    }
+    case VIRTIO_BLK_T_FLUSH:
+        vu_block_flush(req);
+        req->in->status = VIRTIO_BLK_S_OK;
+        break;
+    case VIRTIO_BLK_T_GET_ID: {
+        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
+                          VIRTIO_BLK_ID_BYTES);
+        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk_server");
+        req->in->status = VIRTIO_BLK_S_OK;
+        req->size = elem->in_sg[0].iov_len;
+        break;
+    }
+    case VIRTIO_BLK_T_DISCARD:
+    case VIRTIO_BLK_T_WRITE_ZEROES: {
+        int rc;
+        rc = vu_block_discard_write_zeroes(req, &elem->out_sg[1],
+                                           out_num, type);
+        if (rc == 0) {
+            req->in->status = VIRTIO_BLK_S_OK;
+        } else {
+            req->in->status = VIRTIO_BLK_S_IOERR;
+        }
+        break;
+    }
+    default:
+        req->in->status = VIRTIO_BLK_S_UNSUPP;
+        break;
+    }
+
+    vu_block_req_complete(req);
+    return;
+
+err:
+    free(elem);
+    g_free(req);
+    return;
+}
+
+
+
+static void vu_block_process_vq(VuDev *vu_dev, int idx)
+{
+    VuServer *server;
+    VuVirtq *vq;
+
+    server = container_of(vu_dev, VuServer, vu_dev);
+    assert(server);
+
+    vq = vu_get_queue(vu_dev, idx);
+    assert(vq);
+    VuVirtqElement *elem;
+    while (1) {
+        elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) +
+                                    sizeof(VuBlockReq));
+        if (elem) {
+            struct req_data req_data = {
+                .server = server,
+                .vq = vq,
+                .elem = elem
+            };
+            Coroutine *co = qemu_coroutine_create(vu_block_virtio_process_req,
+                                                  &req_data);
+            aio_co_enter(server->ioc->ctx, co);
+        } else {
+            break;
+        }
+    }
+}
+
+static void vu_block_queue_set_started(VuDev *vu_dev, int idx, bool started)
+{
+    VuVirtq *vq;
+
+    assert(vu_dev);
+
+    vq = vu_get_queue(vu_dev, idx);
+    vu_set_queue_handler(vu_dev, vq, started ? vu_block_process_vq : NULL);
+}
+
+static uint64_t vu_block_get_features(VuDev *dev)
+{
+    uint64_t features;
+    VuServer *server = container_of(dev, VuServer, vu_dev);
+    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
+    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
+               1ull << VIRTIO_BLK_F_SEG_MAX |
+               1ull << VIRTIO_BLK_F_TOPOLOGY |
+               1ull << VIRTIO_BLK_F_BLK_SIZE |
+               1ull << VIRTIO_BLK_F_FLUSH |
+               1ull << VIRTIO_BLK_F_DISCARD |
+               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
+               1ull << VIRTIO_BLK_F_CONFIG_WCE |
+               1ull << VIRTIO_F_VERSION_1 |
+               1ull << VIRTIO_RING_F_INDIRECT_DESC |
+               1ull << VIRTIO_RING_F_EVENT_IDX |
+               1ull << VHOST_USER_F_PROTOCOL_FEATURES;
+
+    if (!vdev_blk->writable) {
+        features |= 1ull << VIRTIO_BLK_F_RO;
+    }
+
+    return features;
+}
+
+static uint64_t vu_block_get_protocol_features(VuDev *dev)
+{
+    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
+           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
+}
+
+static int
+vu_block_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
+{
+    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
+    memcpy(config, &vdev_blk->blkcfg, len);
+
+    return 0;
+}
+
+static int
+vu_block_set_config(VuDev *vu_dev, const uint8_t *data,
+                    uint32_t offset, uint32_t size, uint32_t flags)
+{
+    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+    VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
+    uint8_t wce;
+
+    /* don't support live migration */
+    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
+        return -EINVAL;
+    }
+
+
+    if (offset != offsetof(struct virtio_blk_config, wce) ||
+        size != 1) {
+        return -EINVAL;
+    }
+
+    wce = *data;
+    if (wce == vdev_blk->blkcfg.wce) {
+        /* Do nothing as same with old configuration */
+        return 0;
+    }
+
+    vdev_blk->blkcfg.wce = wce;
+    blk_set_enable_write_cache(vdev_blk->backend, wce);
+    return 0;
+}
+
+
+/*
+ * When the client disconnects, it sends a VHOST_USER_NONE request
+ * and vu_process_message will simple call exit which cause the VM
+ * to exit abruptly.
+ * To avoid this issue,  process VHOST_USER_NONE request ahead
+ * of vu_process_message.
+ *
+ */
+static int vu_block_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
+{
+    if (vmsg->request == VHOST_USER_NONE) {
+        dev->panic(dev, "disconnect");
+        return true;
+    }
+    return false;
+}
+
+
+static const VuDevIface vu_block_iface = {
+    .get_features          = vu_block_get_features,
+    .queue_set_started     = vu_block_queue_set_started,
+    .get_protocol_features = vu_block_get_protocol_features,
+    .get_config            = vu_block_get_config,
+    .set_config            = vu_block_set_config,
+    .process_msg           = vu_block_process_msg,
+};
+
+static void blk_aio_attached(AioContext *ctx, void *opaque)
+{
+    VuBlockDev *vub_dev = opaque;
+    aio_context_acquire(ctx);
+    vhost_user_server_set_aio_context(ctx, &vub_dev->vu_server);
+    aio_context_release(ctx);
+}
+
+static void blk_aio_detach(void *opaque)
+{
+    VuBlockDev *vub_dev = opaque;
+    AioContext *ctx = vub_dev->vu_server.ctx;
+    aio_context_acquire(ctx);
+    vhost_user_server_set_aio_context(NULL, &vub_dev->vu_server);
+    aio_context_release(ctx);
+}
+
+
+static void
+vu_block_initialize_config(BlockDriverState *bs,
+                           struct virtio_blk_config *config, uint32_t blk_size)
+{
+    config->capacity = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+    config->blk_size = blk_size;
+    config->size_max = 0;
+    config->seg_max = 128 - 2;
+    config->min_io_size = 1;
+    config->opt_io_size = 1;
+    config->num_queues = VHOST_USER_BLK_MAX_QUEUES;
+    config->max_discard_sectors = 32768;
+    config->max_discard_seg = 1;
+    config->discard_sector_alignment = config->blk_size >> 9;
+    config->max_write_zeroes_sectors = 32768;
+    config->max_write_zeroes_seg = 1;
+}
+
+
+static VuBlockDev *vu_block_init(VuBlockDev *vu_block_device, Error **errp)
+{
+
+    BlockBackend *blk;
+    Error *local_error = NULL;
+    const char *node_name = vu_block_device->node_name;
+    bool writable = vu_block_device->writable;
+    /*
+     * Don't allow resize while the vhost user server is running,
+     * otherwise we don't care what happens with the node.
+     */
+    uint64_t perm = BLK_PERM_CONSISTENT_READ;
+    int ret;
+
+    AioContext *ctx;
+
+    BlockDriverState *bs = bdrv_lookup_bs(node_name, node_name, &local_error);
+
+    if (!bs) {
+        error_propagate(errp, local_error);
+        return NULL;
+    }
+
+    if (bdrv_is_read_only(bs)) {
+        writable = false;
+    }
+
+    if (writable) {
+        perm |= BLK_PERM_WRITE;
+    }
+
+    ctx = bdrv_get_aio_context(bs);
+    aio_context_acquire(ctx);
+    bdrv_invalidate_cache(bs, NULL);
+    aio_context_release(ctx);
+
+    blk = blk_new(bdrv_get_aio_context(bs), perm,
+                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
+                  BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
+    ret = blk_insert_bs(blk, bs, errp);
+
+    if (ret < 0) {
+        goto fail;
+    }
+
+    blk_set_enable_write_cache(blk, false);
+
+    blk_set_allow_aio_context_change(blk, true);
+
+    vu_block_device->blkcfg.wce = 0;
+    vu_block_device->backend = blk;
+    if (!vu_block_device->blk_size) {
+        vu_block_device->blk_size = BDRV_SECTOR_SIZE;
+    }
+    vu_block_device->blkcfg.blk_size = vu_block_device->blk_size;
+    blk_set_guest_block_size(blk, vu_block_device->blk_size);
+    vu_block_initialize_config(bs, &vu_block_device->blkcfg,
+                                   vu_block_device->blk_size);
+    return vu_block_device;
+
+fail:
+    blk_unref(blk);
+    return NULL;
+}
+
+static void vhost_user_blk_server_stop(VuBlockDev *vu_block_device)
+{
+    if (!vu_block_device) {
+        return;
+    }
+
+    vhost_user_server_stop(&vu_block_device->vu_server);
+
+    if (vu_block_device->backend) {
+        blk_remove_aio_context_notifier(vu_block_device->backend, blk_aio_attached,
+                                        blk_aio_detach, vu_block_device);
+    }
+
+    blk_unref(vu_block_device->backend);
+
+}
+
+
+static void vhost_user_blk_server_start(VuBlockDev *vu_block_device,
+                                        Error **errp)
+{
+    SocketAddress *addr = vu_block_device->addr;
+
+    if (!vu_block_init(vu_block_device, errp)) {
+        return;
+    }
+
+    AioContext *ctx = bdrv_get_aio_context(blk_bs(vu_block_device->backend));
+
+    if (!vhost_user_server_start(&vu_block_device->vu_server, addr, ctx,
+                                 VHOST_USER_BLK_MAX_QUEUES,
+                                 NULL, &vu_block_iface,
+                                 errp)) {
+        goto error;
+    }
+
+    blk_add_aio_context_notifier(vu_block_device->backend, blk_aio_attached,
+                                 blk_aio_detach, vu_block_device);
+    vu_block_device->running = true;
+    return;
+
+ error:
+    vhost_user_blk_server_stop(vu_block_device);
+}
+
+static bool vu_prop_modificable(VuBlockDev *vus, Error **errp)
+{
+    if (vus->running) {
+            error_setg(errp, "The property can't be modified "
+                    "while the server is running");
+            return false;
+    }
+    return true;
+}
+static void vu_set_node_name(Object *obj, const char *value, Error **errp)
+{
+    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
+
+    if (vus->node_name) {
+        if (!vu_prop_modificable(vus, errp)) {
+            return;
+        }
+        g_free(vus->node_name);
+    }
+
+    vus->node_name = g_strdup(value);
+}
+
+static char *vu_get_node_name(Object *obj, Error **errp)
+{
+    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
+    return g_strdup(vus->node_name);
+}
+
+
+static void vu_set_unix_socket(Object *obj, const char *value,
+                               Error **errp)
+{
+    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
+
+    if (vus->addr) {
+        if (!vu_prop_modificable(vus, errp)) {
+            return;
+        }
+        g_free(vus->addr->u.q_unix.path);
+        g_free(vus->addr);
+    }
+
+    SocketAddress *addr = g_new0(SocketAddress, 1);
+    addr->type = SOCKET_ADDRESS_TYPE_UNIX;
+    addr->u.q_unix.path = g_strdup(value);
+    vus->addr = addr;
+}
+
+static char *vu_get_unix_socket(Object *obj, Error **errp)
+{
+    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
+    return g_strdup(vus->addr->u.q_unix.path);
+}
+
+static bool vu_get_block_writable(Object *obj, Error **errp)
+{
+    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
+    return vus->writable;
+}
+
+static void vu_set_block_writable(Object *obj, bool value, Error **errp)
+{
+    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
+
+    if (!vu_prop_modificable(vus, errp)) {
+            return;
+    }
+
+    vus->writable = value;
+}
+
+static void vu_get_blk_size(Object *obj, Visitor *v, const char *name,
+                            void *opaque, Error **errp)
+{
+    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
+    uint32_t value = vus->blk_size;
+
+    visit_type_uint32(v, name, &value, errp);
+}
+
+static void vu_set_blk_size(Object *obj, Visitor *v, const char *name,
+                            void *opaque, Error **errp)
+{
+    VuBlockDev *vus = VHOST_USER_BLK_SERVER(obj);
+
+    Error *local_err = NULL;
+    uint32_t value;
+
+    if (!vu_prop_modificable(vus, errp)) {
+            return;
+    }
+
+    visit_type_uint32(v, name, &value, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    check_logical_block_size(object_get_typename(obj), name, value, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    vus->blk_size = value;
+
+out:
+    error_propagate(errp, local_err);
+    vus->blk_size = value;
+}
+
+
+static void vhost_user_blk_server_instance_finalize(Object *obj)
+{
+    VuBlockDev *vub = VHOST_USER_BLK_SERVER(obj);
+
+    vhost_user_blk_server_stop(vub);
+}
+
+static void vhost_user_blk_server_complete(UserCreatable *obj, Error **errp)
+{
+    Error *local_error = NULL;
+    VuBlockDev *vub = VHOST_USER_BLK_SERVER(obj);
+
+    vhost_user_blk_server_start(vub, &local_error);
+
+    if (local_error) {
+        error_propagate(errp, local_error);
+        return;
+    }
+}
+
+static void vhost_user_blk_server_class_init(ObjectClass *klass,
+                                             void *class_data)
+{
+    UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
+    ucc->complete = vhost_user_blk_server_complete;
+
+    object_class_property_add_bool(klass, "writable",
+                                   vu_get_block_writable,
+                                   vu_set_block_writable);
+
+    object_class_property_add_str(klass, "node-name",
+                                  vu_get_node_name,
+                                  vu_set_node_name);
+
+    object_class_property_add_str(klass, "unix-socket",
+                                  vu_get_unix_socket,
+                                  vu_set_unix_socket);
+
+    object_class_property_add(klass, "logical-block-size", "uint32",
+                              vu_get_blk_size, vu_set_blk_size,
+                              NULL, NULL);
+}
+
+static const TypeInfo vhost_user_blk_server_info = {
+    .name = TYPE_VHOST_USER_BLK_SERVER,
+    .parent = TYPE_OBJECT,
+    .instance_size = sizeof(VuBlockDev),
+    .instance_finalize = vhost_user_blk_server_instance_finalize,
+    .class_init = vhost_user_blk_server_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        {TYPE_USER_CREATABLE},
+        {}
+    },
+};
+
+static void vhost_user_blk_server_register_types(void)
+{
+    type_register_static(&vhost_user_blk_server_info);
+}
+
+type_init(vhost_user_blk_server_register_types)
diff --git a/block/export/vhost-user-blk-server.h b/block/export/vhost-user-blk-server.h
new file mode 100644
index 0000000000..5398e5d352
--- /dev/null
+++ b/block/export/vhost-user-blk-server.h
@@ -0,0 +1,35 @@ 
+/*
+ * Sharing QEMU block devices via vhost-user protocal
+ *
+ * Author: Coiby Xu <coiby.xu@gmail.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef VHOST_USER_BLK_SERVER_H
+#define VHOST_USER_BLK_SERVER_H
+#include "util/vhost-user-server.h"
+
+typedef struct VuBlockDev VuBlockDev;
+#define TYPE_VHOST_USER_BLK_SERVER "vhost-user-blk-server"
+#define VHOST_USER_BLK_SERVER(obj) \
+   OBJECT_CHECK(VuBlockDev, obj, TYPE_VHOST_USER_BLK_SERVER)
+
+/* vhost user block device */
+struct VuBlockDev {
+    Object parent_obj;
+    char *node_name;
+    SocketAddress *addr;
+    AioContext *ctx;
+    VuServer vu_server;
+    bool running;
+    uint32_t blk_size;
+    BlockBackend *backend;
+    QIOChannelSocket *sioc;
+    QTAILQ_ENTRY(VuBlockDev) next;
+    struct virtio_blk_config blkcfg;
+    bool writable;
+};
+
+#endif /* VHOST_USER_BLK_SERVER_H */
diff --git a/softmmu/vl.c b/softmmu/vl.c
index 05d1a4cb6b..838df3e57a 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -2520,6 +2520,10 @@  static bool object_create_initial(const char *type, QemuOpts *opts)
     }
 #endif
 
+    /* Reason: vhost-user-blk-server property "node-name" */
+    if (g_str_equal(type, "vhost-user-blk-server")) {
+        return false;
+    }
     /*
      * Reason: filter-* property "netdev" etc.
      */