diff mbox

[1/2] vhost-user-blk: introduce a new vhost-user-blk host device

Message ID 1501120851-4314-2-git-send-email-changpeng.liu@intel.com
State New
Headers show

Commit Message

Liu, Changpeng July 27, 2017, 2 a.m. UTC
This commit introduces a vhost-user device for block, it uses a
chardev to connect to the backend, Same with virito_blk, Guest
OS still uses the virtio_blk frontend driver.

To use it, start Qemu with command line like this:

qemu-system-x86_64 \
    -chardev socket,id=char0,path=/path/vhost.socket \
    -device vhost-user-blk-pci,chardev=char0,size=10G, \
            logical_block_size=...

Different with exist Qemu virtio_blk host device, it makes more easy
for users to implement their own I/O processing logic, such as all
user space I/O stack against hardware block device. However, due to
exist vhost-user messages, Qemu can't get the parameters such as
capacity of the backend block device, users need to append such
parameters when start Qemu.

Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
---
 configure                          |  11 ++
 hw/block/Makefile.objs             |   3 +
 hw/block/vhost-user-blk.c          | 352 +++++++++++++++++++++++++++++++++++++
 hw/virtio/virtio-pci.c             |  55 ++++++
 hw/virtio/virtio-pci.h             |  18 ++
 include/hw/virtio/vhost-user-blk.h |  46 +++++
 6 files changed, 485 insertions(+)
 create mode 100644 hw/block/vhost-user-blk.c
 create mode 100644 include/hw/virtio/vhost-user-blk.h

Comments

Stefan Hajnoczi July 26, 2017, 10:35 a.m. UTC | #1
On Thu, Jul 27, 2017 at 10:00:50AM +0800, Changpeng Liu wrote:
> +static void vhost_user_blk_set_config(VirtIODevice *vdev, const uint8_t *config)
> +{
> +    VHostUserBlk *s = VHOST_USER_BLK(vdev);
> +    struct virtio_blk_config blkcfg;
> +
> +    memcpy(&blkcfg, config, sizeof(blkcfg));
> +
> +    if (blkcfg.wce != s->config_wce) {
> +        error_report("vhost-user-blk: does not support the operation");

If vhost-user-blk doesn't support the operation then please remove the
VIRTIO_BLK_F_CONFIG_WCE feature bit.  That way the guest knows it cannot
modify this field.

> +static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
> +{
> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> +    VHostUserBlk *s = VHOST_USER_BLK(vdev);
> +    int ret;
> +    uint64_t size;
> +
> +    if (!s->chardev.chr) {
> +        error_setg(errp, "vhost-user-blk: chardev is mandatary");

mandatory

> +        return;
> +    }
> +
> +    if (!s->num_queues) {
> +        error_setg(errp, "vhost-user-blk: invalid number of IO queues");
> +        return;
> +    }
> +
> +    if (!s->queue_size) {
> +        error_setg(errp, "vhost-user-blk: invalid count of the IO queue");

"count of the IO queue" sounds like number of queues.  I suggest saying
"queue size must be non-zero".

> +        return;
> +    }
> +
> +    if (!s->size) {
> +        error_setg(errp, "vhost-user-blk: block capacity must be assigned,"
> +                  "size can be specified by GiB or MiB");
> +        return;
> +    }
> +
> +    ret = qemu_strtosz_MiB(s->size, NULL, &size);
> +    if (ret < 0) {
> +        error_setg(errp, "vhost-user-blk: invalid size %s in GiB/MiB", s->size);
> +        return;
> +    }
> +    s->capacity = size / 512;
> +
> +    /* block size with default 512 Bytes */
> +    if (!s->blkcfg.logical_block_size) {
> +        s->blkcfg.logical_block_size = 512;
> +    }
> +
> +    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
> +                sizeof(struct virtio_blk_config));
> +    virtio_add_queue(vdev, s->queue_size, vhost_user_blk_handle_output);
> +
> +    s->dev.nvqs = s->num_queues;
> +    s->dev.vqs = g_new(struct vhost_virtqueue, s->dev.nvqs);

Please test multi-queue, it's currently broken.  virtio_add_queue() must
be called for each queue.

> +    s->dev.vq_index = 0;
> +    s->dev.backend_features = 0;
> +
> +    ret = vhost_dev_init(&s->dev, (void *)&s->chardev,

The compiler automatically converts any pointer type to void * without a
warning in C.  (This is different from C++!)

The (void *) cast can be dropped.

> +                         VHOST_BACKEND_TYPE_USER, 0);
> +    if (ret < 0) {
> +        error_setg(errp, "vhost-user-blk: vhost initialization failed: %s",
> +                   strerror(-ret));

If realize fails unrealize() will not be called.  Cleanup must be
performed here.

> +        return;
> +    }
> +}
> +
> +static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp)
> +{
> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> +    VHostUserBlk *s = VHOST_USER_BLK(dev);
> +
> +    vhost_user_blk_set_status(vdev, 0);
> +    vhost_dev_cleanup(&s->dev);
> +    g_free(s->dev.vqs);
> +    virtio_cleanup(vdev);
> +}
> +
> +static void vhost_user_blk_instance_init(Object *obj)
> +{
> +    VHostUserBlk *s = VHOST_USER_BLK(obj);
> +
> +    device_add_bootindex_property(obj, &s->bootindex, "bootindex",
> +                                  "/disk@0,0", DEVICE(obj), NULL);
> +}
> +
> +static const VMStateDescription vmstate_vhost_user_blk = {
> +    .name = "vhost-user-blk",
> +    .minimum_version_id = 2,
> +    .version_id = 2,

Why is version_id 2?  There has never been a vhost-user-blk device in
qemu.git before, so I would expect version to be 1.

> +    .fields = (VMStateField[]) {
> +        VMSTATE_VIRTIO_DEVICE,
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
> +static Property vhost_user_blk_properties[] = {
> +    DEFINE_PROP_CHR("chardev", VHostUserBlk, chardev),
> +    DEFINE_BLOCK_PROPERTIES(VHostUserBlk, blkcfg),

DEFINE_BLOCK_PROPERTIES cannot be used directly because it includes the
'drive' (blkcfg.blk) parameter.  The command-line should not allow a
drive= parameter.

Also, parameters like the discard granularity, optimal I/O size, logical
block size, etc can be initialized to sensible defaults by QEMU's block
layer when drive= is used.  Since you are bypassing QEMU's block layer
there is no way for QEMU to set good defaults.

Are you relying on the managment tools populating these parameters with
the correct values from the vhost-user-blk process?  Or did you have
some other mechanism in mind?

> +    DEFINE_BLOCK_CHS_PROPERTIES(VHostUserBlk, blkcfg),
> +    DEFINE_PROP_STRING("size", VHostUserBlk, size),

virtio-blk supports online resize.  QEMU and the vhost-user-blk process
need a way to exchange disk capacity information for online resize.

Please add the necessary vhost-user protocol messages now so size
information can be automatically exchanged and updated for resize.

> +typedef struct VHostUserBlk {
> +    VirtIODevice parent_obj;
> +    CharBackend chardev;
> +    Error *migration_blocker;

This field is unused, please drop it.
Liu, Changpeng July 27, 2017, 12:29 a.m. UTC | #2
> -----Original Message-----
> From: Stefan Hajnoczi [mailto:stefanha@redhat.com]
> Sent: Wednesday, July 26, 2017 6:35 PM
> To: Liu, Changpeng <changpeng.liu@intel.com>
> Cc: qemu-devel@nongnu.org; pbonzini@redhat.com; felipe@nutanix.com;
> mst@redhat.com
> Subject: Re: [PATCH 1/2] vhost-user-blk: introduce a new vhost-user-blk host
> device
> 
> On Thu, Jul 27, 2017 at 10:00:50AM +0800, Changpeng Liu wrote:
> > +static void vhost_user_blk_set_config(VirtIODevice *vdev, const uint8_t
> *config)
> > +{
> > +    VHostUserBlk *s = VHOST_USER_BLK(vdev);
> > +    struct virtio_blk_config blkcfg;
> > +
> > +    memcpy(&blkcfg, config, sizeof(blkcfg));
> > +
> > +    if (blkcfg.wce != s->config_wce) {
> > +        error_report("vhost-user-blk: does not support the operation");
> 
> If vhost-user-blk doesn't support the operation then please remove the
> VIRTIO_BLK_F_CONFIG_WCE feature bit.  That way the guest knows it cannot
> modify this field.
> 
> > +static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
> > +{
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > +    VHostUserBlk *s = VHOST_USER_BLK(vdev);
> > +    int ret;
> > +    uint64_t size;
> > +
> > +    if (!s->chardev.chr) {
> > +        error_setg(errp, "vhost-user-blk: chardev is mandatary");
> 
> mandatory
> 
> > +        return;
> > +    }
> > +
> > +    if (!s->num_queues) {
> > +        error_setg(errp, "vhost-user-blk: invalid number of IO queues");
> > +        return;
> > +    }
> > +
> > +    if (!s->queue_size) {
> > +        error_setg(errp, "vhost-user-blk: invalid count of the IO queue");
> 
> "count of the IO queue" sounds like number of queues.  I suggest saying
> "queue size must be non-zero".
> 
> > +        return;
> > +    }
> > +
> > +    if (!s->size) {
> > +        error_setg(errp, "vhost-user-blk: block capacity must be assigned,"
> > +                  "size can be specified by GiB or MiB");
> > +        return;
> > +    }
> > +
> > +    ret = qemu_strtosz_MiB(s->size, NULL, &size);
> > +    if (ret < 0) {
> > +        error_setg(errp, "vhost-user-blk: invalid size %s in GiB/MiB", s->size);
> > +        return;
> > +    }
> > +    s->capacity = size / 512;
> > +
> > +    /* block size with default 512 Bytes */
> > +    if (!s->blkcfg.logical_block_size) {
> > +        s->blkcfg.logical_block_size = 512;
> > +    }
> > +
> > +    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
> > +                sizeof(struct virtio_blk_config));
> > +    virtio_add_queue(vdev, s->queue_size, vhost_user_blk_handle_output);
> > +
> > +    s->dev.nvqs = s->num_queues;
> > +    s->dev.vqs = g_new(struct vhost_virtqueue, s->dev.nvqs);
> 
> Please test multi-queue, it's currently broken.  virtio_add_queue() must
> be called for each queue.
Yes, should move virtio_add_queue into loop.
> 
> > +    s->dev.vq_index = 0;
> > +    s->dev.backend_features = 0;
> > +
> > +    ret = vhost_dev_init(&s->dev, (void *)&s->chardev,
> 
> The compiler automatically converts any pointer type to void * without a
> warning in C.  (This is different from C++!)
> 
> The (void *) cast can be dropped.
> 
> > +                         VHOST_BACKEND_TYPE_USER, 0);
> > +    if (ret < 0) {
> > +        error_setg(errp, "vhost-user-blk: vhost initialization failed: %s",
> > +                   strerror(-ret));
> 
> If realize fails unrealize() will not be called.  Cleanup must be
> performed here.
> 
> > +        return;
> > +    }
> > +}
> > +
> > +static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp)
> > +{
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > +    VHostUserBlk *s = VHOST_USER_BLK(dev);
> > +
> > +    vhost_user_blk_set_status(vdev, 0);
> > +    vhost_dev_cleanup(&s->dev);
> > +    g_free(s->dev.vqs);
> > +    virtio_cleanup(vdev);
> > +}
> > +
> > +static void vhost_user_blk_instance_init(Object *obj)
> > +{
> > +    VHostUserBlk *s = VHOST_USER_BLK(obj);
> > +
> > +    device_add_bootindex_property(obj, &s->bootindex, "bootindex",
> > +                                  "/disk@0,0", DEVICE(obj), NULL);
> > +}
> > +
> > +static const VMStateDescription vmstate_vhost_user_blk = {
> > +    .name = "vhost-user-blk",
> > +    .minimum_version_id = 2,
> > +    .version_id = 2,
> 
> Why is version_id 2?  There has never been a vhost-user-blk device in
> qemu.git before, so I would expect version to be 1.
> 
> > +    .fields = (VMStateField[]) {
> > +        VMSTATE_VIRTIO_DEVICE,
> > +        VMSTATE_END_OF_LIST()
> > +    },
> > +};
> > +
> > +static Property vhost_user_blk_properties[] = {
> > +    DEFINE_PROP_CHR("chardev", VHostUserBlk, chardev),
> > +    DEFINE_BLOCK_PROPERTIES(VHostUserBlk, blkcfg),
> 
> DEFINE_BLOCK_PROPERTIES cannot be used directly because it includes the
> 'drive' (blkcfg.blk) parameter.  The command-line should not allow a
> drive= parameter.
> 
> Also, parameters like the discard granularity, optimal I/O size, logical
> block size, etc can be initialized to sensible defaults by QEMU's block
> layer when drive= is used.  Since you are bypassing QEMU's block layer
> there is no way for QEMU to set good defaults.
> 
> Are you relying on the managment tools populating these parameters with
> the correct values from the vhost-user-blk process?  Or did you have
> some other mechanism in mind?
Actually for this part and your comments about block resize, I think maybe add several
additional vhost user messages such like: VHOST_USER_GET_BLK_CFG/VHOST_USER_SET_BLK_CFG
makes the code more clear, I'm okay to add such messages.
> 
> > +    DEFINE_BLOCK_CHS_PROPERTIES(VHostUserBlk, blkcfg),
> > +    DEFINE_PROP_STRING("size", VHostUserBlk, size),
> 
> virtio-blk supports online resize.  QEMU and the vhost-user-blk process
> need a way to exchange disk capacity information for online resize.
> 
> Please add the necessary vhost-user protocol messages now so size
> information can be automatically exchanged and updated for resize.
> 
> > +typedef struct VHostUserBlk {
> > +    VirtIODevice parent_obj;
> > +    CharBackend chardev;
> > +    Error *migration_blocker;
> 
> This field is unused, please drop it.
Stefan Hajnoczi July 27, 2017, 9:48 a.m. UTC | #3
On Thu, Jul 27, 2017 at 12:29:45AM +0000, Liu, Changpeng wrote:
> > > +    .fields = (VMStateField[]) {
> > > +        VMSTATE_VIRTIO_DEVICE,
> > > +        VMSTATE_END_OF_LIST()
> > > +    },
> > > +};
> > > +
> > > +static Property vhost_user_blk_properties[] = {
> > > +    DEFINE_PROP_CHR("chardev", VHostUserBlk, chardev),
> > > +    DEFINE_BLOCK_PROPERTIES(VHostUserBlk, blkcfg),
> > 
> > DEFINE_BLOCK_PROPERTIES cannot be used directly because it includes the
> > 'drive' (blkcfg.blk) parameter.  The command-line should not allow a
> > drive= parameter.
> > 
> > Also, parameters like the discard granularity, optimal I/O size, logical
> > block size, etc can be initialized to sensible defaults by QEMU's block
> > layer when drive= is used.  Since you are bypassing QEMU's block layer
> > there is no way for QEMU to set good defaults.
> > 
> > Are you relying on the managment tools populating these parameters with
> > the correct values from the vhost-user-blk process?  Or did you have
> > some other mechanism in mind?
> Actually for this part and your comments about block resize, I think maybe add several
> additional vhost user messages such like: VHOST_USER_GET_BLK_CFG/VHOST_USER_SET_BLK_CFG
> makes the code more clear, I'm okay to add such messages.

New messages for virtio config space read/write might be useful for
other devices besides virtio-blk.

It's worth checking how virtio config space is live migrated and how to
do that consistently if the vhost-user process is involved.

Stefan
Liu, Changpeng July 27, 2017, 10:08 a.m. UTC | #4
> -----Original Message-----
> From: Stefan Hajnoczi [mailto:stefanha@redhat.com]
> Sent: Thursday, July 27, 2017 5:49 PM
> To: Liu, Changpeng <changpeng.liu@intel.com>
> Cc: qemu-devel@nongnu.org; pbonzini@redhat.com; felipe@nutanix.com;
> mst@redhat.com
> Subject: Re: [PATCH 1/2] vhost-user-blk: introduce a new vhost-user-blk host
> device
> 
> On Thu, Jul 27, 2017 at 12:29:45AM +0000, Liu, Changpeng wrote:
> > > > +    .fields = (VMStateField[]) {
> > > > +        VMSTATE_VIRTIO_DEVICE,
> > > > +        VMSTATE_END_OF_LIST()
> > > > +    },
> > > > +};
> > > > +
> > > > +static Property vhost_user_blk_properties[] = {
> > > > +    DEFINE_PROP_CHR("chardev", VHostUserBlk, chardev),
> > > > +    DEFINE_BLOCK_PROPERTIES(VHostUserBlk, blkcfg),
> > >
> > > DEFINE_BLOCK_PROPERTIES cannot be used directly because it includes the
> > > 'drive' (blkcfg.blk) parameter.  The command-line should not allow a
> > > drive= parameter.
> > >
> > > Also, parameters like the discard granularity, optimal I/O size, logical
> > > block size, etc can be initialized to sensible defaults by QEMU's block
> > > layer when drive= is used.  Since you are bypassing QEMU's block layer
> > > there is no way for QEMU to set good defaults.
> > >
> > > Are you relying on the managment tools populating these parameters with
> > > the correct values from the vhost-user-blk process?  Or did you have
> > > some other mechanism in mind?
> > Actually for this part and your comments about block resize, I think maybe add
> several
> > additional vhost user messages such like:
> VHOST_USER_GET_BLK_CFG/VHOST_USER_SET_BLK_CFG
> > makes the code more clear, I'm okay to add such messages.
> 
> New messages for virtio config space read/write might be useful for
> other devices besides virtio-blk.
I mean all the block device information like: block_size/capacity are stored in another process, 
so add a new vhost user message to Qemu vhost-user-blk can get those information when Qemu get
started, once those messages stored to virtio_pci config space, we will not change it.
> 
> It's worth checking how virtio config space is live migrated and how to
> do that consistently if the vhost-user process is involved.
Virtio will config spaces for virtio_blk, and even the config space migrated to another machine, it should be
same with the another I/O process, did I get your comments ? Thanks. 
> 
> Stefan
Stefan Hajnoczi July 28, 2017, 10:35 a.m. UTC | #5
On Thu, Jul 27, 2017 at 10:08:49AM +0000, Liu, Changpeng wrote:
> > -----Original Message-----
> > From: Stefan Hajnoczi [mailto:stefanha@redhat.com]
> > Sent: Thursday, July 27, 2017 5:49 PM
> > To: Liu, Changpeng <changpeng.liu@intel.com>
> > Cc: qemu-devel@nongnu.org; pbonzini@redhat.com; felipe@nutanix.com;
> > mst@redhat.com
> > Subject: Re: [PATCH 1/2] vhost-user-blk: introduce a new vhost-user-blk host
> > device
> > 
> > On Thu, Jul 27, 2017 at 12:29:45AM +0000, Liu, Changpeng wrote:
> > > > > +    .fields = (VMStateField[]) {
> > > > > +        VMSTATE_VIRTIO_DEVICE,
> > > > > +        VMSTATE_END_OF_LIST()
> > > > > +    },
> > > > > +};
> > > > > +
> > > > > +static Property vhost_user_blk_properties[] = {
> > > > > +    DEFINE_PROP_CHR("chardev", VHostUserBlk, chardev),
> > > > > +    DEFINE_BLOCK_PROPERTIES(VHostUserBlk, blkcfg),
> > > >
> > > > DEFINE_BLOCK_PROPERTIES cannot be used directly because it includes the
> > > > 'drive' (blkcfg.blk) parameter.  The command-line should not allow a
> > > > drive= parameter.
> > > >
> > > > Also, parameters like the discard granularity, optimal I/O size, logical
> > > > block size, etc can be initialized to sensible defaults by QEMU's block
> > > > layer when drive= is used.  Since you are bypassing QEMU's block layer
> > > > there is no way for QEMU to set good defaults.
> > > >
> > > > Are you relying on the managment tools populating these parameters with
> > > > the correct values from the vhost-user-blk process?  Or did you have
> > > > some other mechanism in mind?
> > > Actually for this part and your comments about block resize, I think maybe add
> > several
> > > additional vhost user messages such like:
> > VHOST_USER_GET_BLK_CFG/VHOST_USER_SET_BLK_CFG
> > > makes the code more clear, I'm okay to add such messages.
> > 
> > New messages for virtio config space read/write might be useful for
> > other devices besides virtio-blk.
> I mean all the block device information like: block_size/capacity are stored in another process, 
> so add a new vhost user message to Qemu vhost-user-blk can get those information when Qemu get
> started, once those messages stored to virtio_pci config space, we will not change it.

No, I think updates are necessary:

The virtio block device can do online disk resize, so it will be
necessary to change the capacity field from the vhost-user-blk process
at runtime and raise a config change notification interrupt.

The virtio block device also has a config space field ("wce") that is
writable by the guest.  Supporting this feature also requires virtio
config space support in vhost-user.

If you focus on adding generic virtio config space messages to
vhost-user then these virtio block features can be supported by
vhost-user-blk.

Regarding the two approaches of adding "block device information" as you
have suggested versus adding generic virtio config space support as I'm
suggesting, from a protocol design perspective it's nicer to have
generic messages that are usable by all device types.  I'm not aware of
a reason why high-level "block device information" is needed since QEMU
will just put that into the config space but otherwise does not
interpret the information.

> > It's worth checking how virtio config space is live migrated and how to
> > do that consistently if the vhost-user process is involved.
> Virtio will config spaces for virtio_blk, and even the config space migrated to another machine, it should be
> same with the another I/O process, did I get your comments ? Thanks. 

The guest-writable "wce" config space field is an example that shows the
vhost-user-blk process on the destination host does not have all the
state necessary.  QEMU needs to migrate the config space and send it to
the vhost-user-blk process on the destination.

Stefan
Liu, Changpeng July 29, 2017, 3:21 a.m. UTC | #6
> -----Original Message-----
> From: Stefan Hajnoczi [mailto:stefanha@redhat.com]
> Sent: Friday, July 28, 2017 6:36 PM
> To: Liu, Changpeng <changpeng.liu@intel.com>
> Cc: qemu-devel@nongnu.org; pbonzini@redhat.com; felipe@nutanix.com;
> mst@redhat.com
> Subject: Re: [PATCH 1/2] vhost-user-blk: introduce a new vhost-user-blk host device
> 
> On Thu, Jul 27, 2017 at 10:08:49AM +0000, Liu, Changpeng wrote:
> > > -----Original Message-----
> > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com]
> > > Sent: Thursday, July 27, 2017 5:49 PM
> > > To: Liu, Changpeng <changpeng.liu@intel.com>
> > > Cc: qemu-devel@nongnu.org; pbonzini@redhat.com; felipe@nutanix.com;
> > > mst@redhat.com
> > > Subject: Re: [PATCH 1/2] vhost-user-blk: introduce a new vhost-user-blk host
> > > device
> > >
> > > On Thu, Jul 27, 2017 at 12:29:45AM +0000, Liu, Changpeng wrote:
> > > > > > +    .fields = (VMStateField[]) {
> > > > > > +        VMSTATE_VIRTIO_DEVICE,
> > > > > > +        VMSTATE_END_OF_LIST()
> > > > > > +    },
> > > > > > +};
> > > > > > +
> > > > > > +static Property vhost_user_blk_properties[] = {
> > > > > > +    DEFINE_PROP_CHR("chardev", VHostUserBlk, chardev),
> > > > > > +    DEFINE_BLOCK_PROPERTIES(VHostUserBlk, blkcfg),
> > > > >
> > > > > DEFINE_BLOCK_PROPERTIES cannot be used directly because it includes the
> > > > > 'drive' (blkcfg.blk) parameter.  The command-line should not allow a
> > > > > drive= parameter.
> > > > >
> > > > > Also, parameters like the discard granularity, optimal I/O size, logical
> > > > > block size, etc can be initialized to sensible defaults by QEMU's block
> > > > > layer when drive= is used.  Since you are bypassing QEMU's block layer
> > > > > there is no way for QEMU to set good defaults.
> > > > >
> > > > > Are you relying on the managment tools populating these parameters with
> > > > > the correct values from the vhost-user-blk process?  Or did you have
> > > > > some other mechanism in mind?
> > > > Actually for this part and your comments about block resize, I think maybe
> add
> > > several
> > > > additional vhost user messages such like:
> > > VHOST_USER_GET_BLK_CFG/VHOST_USER_SET_BLK_CFG
> > > > makes the code more clear, I'm okay to add such messages.
> > >
> > > New messages for virtio config space read/write might be useful for
> > > other devices besides virtio-blk.
> > I mean all the block device information like: block_size/capacity are stored in
> another process,
> > so add a new vhost user message to Qemu vhost-user-blk can get those
> information when Qemu get
> > started, once those messages stored to virtio_pci config space, we will not
> change it.
> 
> No, I think updates are necessary:
> 
> The virtio block device can do online disk resize, so it will be
> necessary to change the capacity field from the vhost-user-blk process
> at runtime and raise a config change notification interrupt.
> 
> The virtio block device also has a config space field ("wce") that is
> writable by the guest.  Supporting this feature also requires virtio
> config space support in vhost-user.
> 
> If you focus on adding generic virtio config space messages to
> vhost-user then these virtio block features can be supported by
> vhost-user-blk.
> 
> Regarding the two approaches of adding "block device information" as you
> have suggested versus adding generic virtio config space support as I'm
> suggesting, from a protocol design perspective it's nicer to have
> generic messages that are usable by all device types.  I'm not aware of
> a reason why high-level "block device information" is needed since QEMU
> will just put that into the config space but otherwise does not
> interpret the information.
Agreed, adding a vhost message  to get/set generic vitio_pci device config space 
is clear to me now, it's better than only for vhost-user-blk messages.

Here I still have an concern about *resize* feature for vhost-user-blk, currently
Qemu vhost-user-blk is the client for vhost user messages, does this means
the I/O processing process must send a new vhost message back to Qemu
vhost-user-blk driver that the capacity of the block device is changed? Or you
have better idea to do this? Thanks.
> 
> > > It's worth checking how virtio config space is live migrated and how to
> > > do that consistently if the vhost-user process is involved.
> > Virtio will config spaces for virtio_blk, and even the config space migrated to
> another machine, it should be
> > same with the another I/O process, did I get your comments ? Thanks.
> 
> The guest-writable "wce" config space field is an example that shows the
> vhost-user-blk process on the destination host does not have all the
> state necessary.  QEMU needs to migrate the config space and send it to
> the vhost-user-blk process on the destination.
> 
> Stefan
Stefan Hajnoczi July 31, 2017, 2:51 p.m. UTC | #7
On Sat, Jul 29, 2017 at 03:21:16AM +0000, Liu, Changpeng wrote:
> 
> 
> > -----Original Message-----
> > From: Stefan Hajnoczi [mailto:stefanha@redhat.com]
> > Sent: Friday, July 28, 2017 6:36 PM
> > To: Liu, Changpeng <changpeng.liu@intel.com>
> > Cc: qemu-devel@nongnu.org; pbonzini@redhat.com; felipe@nutanix.com;
> > mst@redhat.com
> > Subject: Re: [PATCH 1/2] vhost-user-blk: introduce a new vhost-user-blk host device
> > 
> > On Thu, Jul 27, 2017 at 10:08:49AM +0000, Liu, Changpeng wrote:
> > > > -----Original Message-----
> > > > From: Stefan Hajnoczi [mailto:stefanha@redhat.com]
> > > > Sent: Thursday, July 27, 2017 5:49 PM
> > > > To: Liu, Changpeng <changpeng.liu@intel.com>
> > > > Cc: qemu-devel@nongnu.org; pbonzini@redhat.com; felipe@nutanix.com;
> > > > mst@redhat.com
> > > > Subject: Re: [PATCH 1/2] vhost-user-blk: introduce a new vhost-user-blk host
> > > > device
> > > >
> > > > On Thu, Jul 27, 2017 at 12:29:45AM +0000, Liu, Changpeng wrote:
> > > > > > > +    .fields = (VMStateField[]) {
> > > > > > > +        VMSTATE_VIRTIO_DEVICE,
> > > > > > > +        VMSTATE_END_OF_LIST()
> > > > > > > +    },
> > > > > > > +};
> > > > > > > +
> > > > > > > +static Property vhost_user_blk_properties[] = {
> > > > > > > +    DEFINE_PROP_CHR("chardev", VHostUserBlk, chardev),
> > > > > > > +    DEFINE_BLOCK_PROPERTIES(VHostUserBlk, blkcfg),
> > > > > >
> > > > > > DEFINE_BLOCK_PROPERTIES cannot be used directly because it includes the
> > > > > > 'drive' (blkcfg.blk) parameter.  The command-line should not allow a
> > > > > > drive= parameter.
> > > > > >
> > > > > > Also, parameters like the discard granularity, optimal I/O size, logical
> > > > > > block size, etc can be initialized to sensible defaults by QEMU's block
> > > > > > layer when drive= is used.  Since you are bypassing QEMU's block layer
> > > > > > there is no way for QEMU to set good defaults.
> > > > > >
> > > > > > Are you relying on the managment tools populating these parameters with
> > > > > > the correct values from the vhost-user-blk process?  Or did you have
> > > > > > some other mechanism in mind?
> > > > > Actually for this part and your comments about block resize, I think maybe
> > add
> > > > several
> > > > > additional vhost user messages such like:
> > > > VHOST_USER_GET_BLK_CFG/VHOST_USER_SET_BLK_CFG
> > > > > makes the code more clear, I'm okay to add such messages.
> > > >
> > > > New messages for virtio config space read/write might be useful for
> > > > other devices besides virtio-blk.
> > > I mean all the block device information like: block_size/capacity are stored in
> > another process,
> > > so add a new vhost user message to Qemu vhost-user-blk can get those
> > information when Qemu get
> > > started, once those messages stored to virtio_pci config space, we will not
> > change it.
> > 
> > No, I think updates are necessary:
> > 
> > The virtio block device can do online disk resize, so it will be
> > necessary to change the capacity field from the vhost-user-blk process
> > at runtime and raise a config change notification interrupt.
> > 
> > The virtio block device also has a config space field ("wce") that is
> > writable by the guest.  Supporting this feature also requires virtio
> > config space support in vhost-user.
> > 
> > If you focus on adding generic virtio config space messages to
> > vhost-user then these virtio block features can be supported by
> > vhost-user-blk.
> > 
> > Regarding the two approaches of adding "block device information" as you
> > have suggested versus adding generic virtio config space support as I'm
> > suggesting, from a protocol design perspective it's nicer to have
> > generic messages that are usable by all device types.  I'm not aware of
> > a reason why high-level "block device information" is needed since QEMU
> > will just put that into the config space but otherwise does not
> > interpret the information.
> Agreed, adding a vhost message  to get/set generic vitio_pci device config space 
> is clear to me now, it's better than only for vhost-user-blk messages.
> 
> Here I still have an concern about *resize* feature for vhost-user-blk, currently
> Qemu vhost-user-blk is the client for vhost user messages, does this means
> the I/O processing process must send a new vhost message back to Qemu
> vhost-user-blk driver that the capacity of the block device is changed? Or you
> have better idea to do this? Thanks.

A vhost-user process -> vhost-user master message is not necessary.  An
eventfd can be used to signal config changes instead.

I have CCed Marc-André because I don't know much about the vhost-user
protocol design.

Here is what I suggest for virtio config space:

typedef enum VhostUserRequest {
    ...

    /* Submitted by the vhost-user master when the guest writes to
     * virtio config space and also after live migration on the
     * destination host.
     */
    VHOST_USER_SET_CONFIG,

    /* Submitted by the vhost-user master to fetch the contents of the
     * virtio config space.  The vhost-user master may cache the
     * contents to avoid repeated VHOST_USER_GET_CONFIG calls.
     */
    VHOST_USER_GET_CONFIG,

    ...
};

struct VuDev {
    ...
    int config_change_fd;
    ...
};

/**
 * vu_config_change_notify:
 * @dev: a VuDev context
 *
 * Notify the vhost-user master that the device has updated virtio
 * config space.  The master must raise a config change interrupt in the
 * guest and invalidate any cached virtio config space contents so that
 * the next guest read results in a VHOST_USER_GET_CONFIG request.
 */
void vu_config_change_notify(VuDev *dev);
Paolo Bonzini July 31, 2017, 3:41 p.m. UTC | #8
On 31/07/2017 16:51, Stefan Hajnoczi wrote:
> typedef enum VhostUserRequest {
>     ...
> 
>     /* Submitted by the vhost-user master when the guest writes to
>      * virtio config space and also after live migration on the
>      * destination host.
>      */
>     VHOST_USER_SET_CONFIG,
> 
>     /* Submitted by the vhost-user master to fetch the contents of the
>      * virtio config space.  The vhost-user master may cache the
>      * contents to avoid repeated VHOST_USER_GET_CONFIG calls.
>      */
>     VHOST_USER_GET_CONFIG,
> 
>     ...
> };

Also:

	/* Submitted by the vhost-user master if it would like to
	 * be informed of virtio config space changes.   The slave
	 * signals the eventfd whenever config space is modified.
	 */
	VHOST_USER_SET_CONFIG_FD,

Paolo

> struct VuDev {
>     ...
>     int config_change_fd;
>     ...
> };
Liu, Changpeng Aug. 1, 2017, 12:10 a.m. UTC | #9
> -----Original Message-----
> From: Paolo Bonzini [mailto:pbonzini@redhat.com]
> Sent: Monday, July 31, 2017 11:41 PM
> To: Stefan Hajnoczi <stefanha@redhat.com>; Liu, Changpeng
> <changpeng.liu@intel.com>
> Cc: qemu-devel@nongnu.org; felipe@nutanix.com; mst@redhat.com; Marc-
> André Lureau <marcandre.lureau@redhat.com>
> Subject: Re: [PATCH 1/2] vhost-user-blk: introduce a new vhost-user-blk host
> device
> 
> On 31/07/2017 16:51, Stefan Hajnoczi wrote:
> > typedef enum VhostUserRequest {
> >     ...
> >
> >     /* Submitted by the vhost-user master when the guest writes to
> >      * virtio config space and also after live migration on the
> >      * destination host.
> >      */
> >     VHOST_USER_SET_CONFIG,
> >
> >     /* Submitted by the vhost-user master to fetch the contents of the
> >      * virtio config space.  The vhost-user master may cache the
> >      * contents to avoid repeated VHOST_USER_GET_CONFIG calls.
> >      */
> >     VHOST_USER_GET_CONFIG,
> >
> >     ...
> > };
> 
> Also:
> 
> 	/* Submitted by the vhost-user master if it would like to
> 	 * be informed of virtio config space changes.   The slave
> 	 * signals the eventfd whenever config space is modified.
> 	 */
> 	VHOST_USER_SET_CONFIG_FD,
> 
> Paolo
> 
Thanks Stefan and Paolo, looks much cleaner now.
> > struct VuDev {
> >     ...
> >     int config_change_fd;
> >     ...
> > };
diff mbox

Patch

diff --git a/configure b/configure
index 987f59b..466b3b4 100755
--- a/configure
+++ b/configure
@@ -305,6 +305,7 @@  tcg="yes"
 
 vhost_net="no"
 vhost_scsi="no"
+vhost_user_blk="no"
 vhost_vsock="no"
 kvm="no"
 hax="no"
@@ -778,6 +779,7 @@  Linux)
   kvm="yes"
   vhost_net="yes"
   vhost_scsi="yes"
+  vhost_user_blk="yes"
   vhost_vsock="yes"
   QEMU_INCLUDES="-I\$(SRC_PATH)/linux-headers -I$(pwd)/linux-headers $QEMU_INCLUDES"
   supported_os="yes"
@@ -1135,6 +1137,10 @@  for opt do
   ;;
   --enable-vhost-scsi) vhost_scsi="yes"
   ;;
+  --disable-vhost-user-blk) vhost_user_blk="no"
+  ;;
+  --enable-vhost-user-blk) vhost_user_blk="yes"
+  ;;
   --disable-vhost-vsock) vhost_vsock="no"
   ;;
   --enable-vhost-vsock) vhost_vsock="yes"
@@ -1489,6 +1495,7 @@  disabled with --disable-FEATURE, default is enabled if available:
   cap-ng          libcap-ng support
   attr            attr and xattr support
   vhost-net       vhost-net acceleration support
+  vhost-user-blk  VM virtio-blk acceleration in user space
   spice           spice
   rbd             rados block device (rbd)
   libiscsi        iscsi support
@@ -5347,6 +5354,7 @@  echo "posix_madvise     $posix_madvise"
 echo "libcap-ng support $cap_ng"
 echo "vhost-net support $vhost_net"
 echo "vhost-scsi support $vhost_scsi"
+echo "vhost-user-blk support $vhost_user_blk"
 echo "vhost-vsock support $vhost_vsock"
 echo "Trace backends    $trace_backends"
 if have_backend "simple"; then
@@ -5757,6 +5765,9 @@  fi
 if test "$vhost_scsi" = "yes" ; then
   echo "CONFIG_VHOST_SCSI=y" >> $config_host_mak
 fi
+if test "$vhost_user_blk" = "yes" ; then
+  echo "CONFIG_VHOST_USER_BLK=y" >> $config_host_mak
+fi
 if test "$vhost_net" = "yes" ; then
   echo "CONFIG_VHOST_NET_USED=y" >> $config_host_mak
 fi
diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
index e0ed980..4c19a58 100644
--- a/hw/block/Makefile.objs
+++ b/hw/block/Makefile.objs
@@ -13,3 +13,6 @@  obj-$(CONFIG_SH4) += tc58128.o
 
 obj-$(CONFIG_VIRTIO) += virtio-blk.o
 obj-$(CONFIG_VIRTIO) += dataplane/
+ifeq ($(CONFIG_VIRTIO),y)
+obj-$(CONFIG_VHOST_USER_BLK) += vhost-user-blk.o
+endif
diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
new file mode 100644
index 0000000..38d7855
--- /dev/null
+++ b/hw/block/vhost-user-blk.c
@@ -0,0 +1,352 @@ 
+/*
+ * vhost-user-blk host device
+ *
+ * Copyright IBM, Corp. 2011
+ * Copyright(C) 2017 Intel Corporation.
+ *
+ * Authors:
+ *  Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *  Changpeng Liu <changpeng.liu@intel.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "migration/vmstate.h"
+#include "migration/migration.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/typedefs.h"
+#include "qemu/cutils.h"
+#include "qom/object.h"
+#include "hw/qdev-core.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-user-blk.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+
+static const int user_feature_bits[] = {
+    VIRTIO_BLK_F_SIZE_MAX,
+    VIRTIO_BLK_F_SEG_MAX,
+    VIRTIO_BLK_F_GEOMETRY,
+    VIRTIO_BLK_F_BLK_SIZE,
+    VIRTIO_BLK_F_TOPOLOGY,
+    VIRTIO_BLK_F_SCSI,
+    VIRTIO_BLK_F_MQ,
+    VIRTIO_BLK_F_RO,
+    VIRTIO_BLK_F_FLUSH,
+    VIRTIO_BLK_F_BARRIER,
+    VIRTIO_BLK_F_CONFIG_WCE,
+    VIRTIO_F_VERSION_1,
+    VIRTIO_RING_F_INDIRECT_DESC,
+    VIRTIO_RING_F_EVENT_IDX,
+    VIRTIO_F_NOTIFY_ON_EMPTY,
+    VHOST_INVALID_FEATURE_BIT
+};
+
+static void vhost_user_blk_update_config(VirtIODevice *vdev, uint8_t *config)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    int blk_size = s->blkcfg.logical_block_size;
+    struct virtio_blk_config blkcfg;
+
+    memset(&blkcfg, 0, sizeof(blkcfg));
+
+    virtio_stq_p(vdev, &blkcfg.capacity, s->capacity);
+    virtio_stl_p(vdev, &blkcfg.seg_max, s->max_segment_num - 2);
+    virtio_stl_p(vdev, &blkcfg.size_max, s->max_segment_size);
+    virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
+    virtio_stw_p(vdev, &blkcfg.min_io_size, s->blkcfg.min_io_size / blk_size);
+    virtio_stl_p(vdev, &blkcfg.opt_io_size, s->blkcfg.opt_io_size / blk_size);
+    virtio_stw_p(vdev, &blkcfg.num_queues, s->num_queues);
+    virtio_stw_p(vdev, &blkcfg.geometry.cylinders,
+                 s->blkcfg.cyls);
+    blkcfg.geometry.heads = s->blkcfg.heads;
+    blkcfg.geometry.sectors = s->blkcfg.secs;
+    blkcfg.physical_block_exp = get_physical_block_exp(&s->blkcfg);
+    blkcfg.alignment_offset = 0;
+    blkcfg.wce = s->config_wce;
+
+    memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
+}
+
+static void vhost_user_blk_set_config(VirtIODevice *vdev, const uint8_t *config)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    struct virtio_blk_config blkcfg;
+
+    memcpy(&blkcfg, config, sizeof(blkcfg));
+
+    if (blkcfg.wce != s->config_wce) {
+        error_report("vhost-user-blk: does not support the operation");
+    }
+}
+
+static void vhost_user_blk_start(VirtIODevice *vdev)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    int i, ret;
+
+    if (!k->set_guest_notifiers) {
+        error_report("binding does not support guest notifiers");
+        return;
+    }
+
+    ret = vhost_dev_enable_notifiers(&s->dev, vdev);
+    if (ret < 0) {
+        error_report("Error enabling host notifiers: %d", -ret);
+        return;
+    }
+
+    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true);
+    if (ret < 0) {
+        error_report("Error binding guest notifier: %d", -ret);
+        goto err_host_notifiers;
+    }
+
+    s->dev.acked_features = vdev->guest_features;
+    ret = vhost_dev_start(&s->dev, vdev);
+    if (ret < 0) {
+        error_report("Error starting vhost: %d", -ret);
+        goto err_guest_notifiers;
+    }
+
+    /* guest_notifier_mask/pending not used yet, so just unmask
+     * everything here. virtio-pci will do the right thing by
+     * enabling/disabling irqfd.
+     */
+    for (i = 0; i < s->dev.nvqs; i++) {
+        vhost_virtqueue_mask(&s->dev, vdev, i, false);
+    }
+
+    return;
+
+err_guest_notifiers:
+    k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false);
+err_host_notifiers:
+    vhost_dev_disable_notifiers(&s->dev, vdev);
+}
+
+static void vhost_user_blk_stop(VirtIODevice *vdev)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    int ret;
+
+    if (!k->set_guest_notifiers) {
+        return;
+    }
+
+    vhost_dev_stop(&s->dev, vdev);
+
+    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false);
+    if (ret < 0) {
+        error_report("vhost guest notifier cleanup failed: %d", ret);
+        return;
+    }
+
+    vhost_dev_disable_notifiers(&s->dev, vdev);
+}
+
+static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    bool should_start = status & VIRTIO_CONFIG_S_DRIVER_OK;
+
+    if (!vdev->vm_running) {
+        should_start = false;
+    }
+
+    if (s->dev.started == should_start) {
+        return;
+    }
+
+    if (should_start) {
+        vhost_user_blk_start(vdev);
+    } else {
+        vhost_user_blk_stop(vdev);
+    }
+
+}
+
+static uint64_t vhost_user_blk_get_features(VirtIODevice *vdev,
+                                            uint64_t features,
+                                            Error **errp)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    uint64_t get_features;
+
+    /* Turn on pre-defined features */
+    features |= s->host_features;
+
+    get_features = vhost_get_features(&s->dev, user_feature_bits, features);
+
+    return get_features;
+}
+
+static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+
+}
+
+static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    int ret;
+    uint64_t size;
+
+    if (!s->chardev.chr) {
+        error_setg(errp, "vhost-user-blk: chardev is mandatary");
+        return;
+    }
+
+    if (!s->num_queues) {
+        error_setg(errp, "vhost-user-blk: invalid number of IO queues");
+        return;
+    }
+
+    if (!s->queue_size) {
+        error_setg(errp, "vhost-user-blk: invalid count of the IO queue");
+        return;
+    }
+
+    if (!s->size) {
+        error_setg(errp, "vhost-user-blk: block capacity must be assigned,"
+                  "size can be specified by GiB or MiB");
+        return;
+    }
+
+    ret = qemu_strtosz_MiB(s->size, NULL, &size);
+    if (ret < 0) {
+        error_setg(errp, "vhost-user-blk: invalid size %s in GiB/MiB", s->size);
+        return;
+    }
+    s->capacity = size / 512;
+
+    /* block size with default 512 Bytes */
+    if (!s->blkcfg.logical_block_size) {
+        s->blkcfg.logical_block_size = 512;
+    }
+
+    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
+                sizeof(struct virtio_blk_config));
+    virtio_add_queue(vdev, s->queue_size, vhost_user_blk_handle_output);
+
+    s->dev.nvqs = s->num_queues;
+    s->dev.vqs = g_new(struct vhost_virtqueue, s->dev.nvqs);
+    s->dev.vq_index = 0;
+    s->dev.backend_features = 0;
+
+    ret = vhost_dev_init(&s->dev, (void *)&s->chardev,
+                         VHOST_BACKEND_TYPE_USER, 0);
+    if (ret < 0) {
+        error_setg(errp, "vhost-user-blk: vhost initialization failed: %s",
+                   strerror(-ret));
+        return;
+    }
+}
+
+static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserBlk *s = VHOST_USER_BLK(dev);
+
+    vhost_user_blk_set_status(vdev, 0);
+    vhost_dev_cleanup(&s->dev);
+    g_free(s->dev.vqs);
+    virtio_cleanup(vdev);
+}
+
+static void vhost_user_blk_instance_init(Object *obj)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(obj);
+
+    device_add_bootindex_property(obj, &s->bootindex, "bootindex",
+                                  "/disk@0,0", DEVICE(obj), NULL);
+}
+
+static const VMStateDescription vmstate_vhost_user_blk = {
+    .name = "vhost-user-blk",
+    .minimum_version_id = 2,
+    .version_id = 2,
+    .fields = (VMStateField[]) {
+        VMSTATE_VIRTIO_DEVICE,
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+static Property vhost_user_blk_properties[] = {
+    DEFINE_PROP_CHR("chardev", VHostUserBlk, chardev),
+    DEFINE_BLOCK_PROPERTIES(VHostUserBlk, blkcfg),
+    DEFINE_BLOCK_CHS_PROPERTIES(VHostUserBlk, blkcfg),
+    DEFINE_PROP_STRING("size", VHostUserBlk, size),
+    DEFINE_PROP_UINT16("num_queues", VHostUserBlk, num_queues, 1),
+    DEFINE_PROP_UINT32("queue_size", VHostUserBlk, queue_size, 128),
+    DEFINE_PROP_UINT32("max_segment_size", VHostUserBlk, max_segment_size,
+                       131072),
+    DEFINE_PROP_UINT32("max_segment_num", VHostUserBlk, max_segment_num, 34),
+    DEFINE_PROP_BIT("config_wce", VHostUserBlk, config_wce, 0, false),
+    DEFINE_PROP_BIT64("f_size_max", VHostUserBlk, host_features,
+                      VIRTIO_BLK_F_SIZE_MAX, true),
+    DEFINE_PROP_BIT64("f_sizemax", VHostUserBlk, host_features,
+                      VIRTIO_BLK_F_SIZE_MAX, true),
+    DEFINE_PROP_BIT64("f_segmax", VHostUserBlk, host_features,
+                      VIRTIO_BLK_F_SEG_MAX, true),
+    DEFINE_PROP_BIT64("f_geometry", VHostUserBlk, host_features,
+                      VIRTIO_BLK_F_GEOMETRY, true),
+    DEFINE_PROP_BIT64("f_readonly", VHostUserBlk, host_features,
+                      VIRTIO_BLK_F_RO, false),
+    DEFINE_PROP_BIT64("f_blocksize", VHostUserBlk, host_features,
+                      VIRTIO_BLK_F_BLK_SIZE, true),
+    DEFINE_PROP_BIT64("f_topology", VHostUserBlk, host_features,
+                      VIRTIO_BLK_F_TOPOLOGY, true),
+    DEFINE_PROP_BIT64("f_multiqueue", VHostUserBlk, host_features,
+                      VIRTIO_BLK_F_MQ, true),
+    DEFINE_PROP_BIT64("f_flush", VHostUserBlk, host_features,
+                      VIRTIO_BLK_F_FLUSH, true),
+    DEFINE_PROP_BIT64("f_barrier", VHostUserBlk, host_features,
+                      VIRTIO_BLK_F_BARRIER, false),
+    DEFINE_PROP_BIT64("f_scsi", VHostUserBlk, host_features,
+                      VIRTIO_BLK_F_SCSI, false),
+    DEFINE_PROP_BIT64("f_writecache", VHostUserBlk, host_features,
+                      VIRTIO_BLK_F_CONFIG_WCE, false),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_user_blk_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+    dc->props = vhost_user_blk_properties;
+    dc->vmsd = &vmstate_vhost_user_blk;
+    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+    vdc->realize = vhost_user_blk_device_realize;
+    vdc->unrealize = vhost_user_blk_device_unrealize;
+    vdc->get_config = vhost_user_blk_update_config;
+    vdc->set_config = vhost_user_blk_set_config;
+    vdc->get_features = vhost_user_blk_get_features;
+    vdc->set_status = vhost_user_blk_set_status;
+}
+
+static const TypeInfo vhost_user_blk_info = {
+    .name = TYPE_VHOST_USER_BLK,
+    .parent = TYPE_VIRTIO_DEVICE,
+    .instance_size = sizeof(VHostUserBlk),
+    .instance_init = vhost_user_blk_instance_init,
+    .class_init = vhost_user_blk_class_init,
+};
+
+static void virtio_register_types(void)
+{
+    type_register_static(&vhost_user_blk_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 5d14bd6..0cff259 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -2012,6 +2012,58 @@  static const TypeInfo virtio_blk_pci_info = {
     .class_init    = virtio_blk_pci_class_init,
 };
 
+#ifdef CONFIG_VHOST_USER_BLK
+/* vhost-user-blk */
+
+static Property vhost_user_blk_pci_properties[] = {
+    DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
+    DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_user_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+    VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI(vpci_dev);
+    DeviceState *vdev = DEVICE(&dev->vdev);
+
+    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
+    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
+}
+
+static void vhost_user_blk_pci_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+    dc->props = vhost_user_blk_pci_properties;
+    k->realize = vhost_user_blk_pci_realize;
+    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BLOCK;
+    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+    pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
+}
+
+static void vhost_user_blk_pci_instance_init(Object *obj)
+{
+    VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI(obj);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VHOST_USER_BLK);
+    object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+                              "bootindex", &error_abort);
+}
+
+static const TypeInfo vhost_user_blk_pci_info = {
+    .name           = TYPE_VHOST_USER_BLK_PCI,
+    .parent         = TYPE_VIRTIO_PCI,
+    .instance_size  = sizeof(VHostUserBlkPCI),
+    .instance_init  = vhost_user_blk_pci_instance_init,
+    .class_init     = vhost_user_blk_pci_class_init,
+};
+#endif
+
 /* virtio-scsi-pci */
 
 static Property virtio_scsi_pci_properties[] = {
@@ -2658,6 +2710,9 @@  static void virtio_pci_register_types(void)
     type_register_static(&virtio_9p_pci_info);
 #endif
     type_register_static(&virtio_blk_pci_info);
+#ifdef CONFIG_VHOST_USER_BLK
+    type_register_static(&vhost_user_blk_pci_info);
+#endif
     type_register_static(&virtio_scsi_pci_info);
     type_register_static(&virtio_balloon_pci_info);
     type_register_static(&virtio_serial_pci_info);
diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
index 69f5959..19a0d01 100644
--- a/hw/virtio/virtio-pci.h
+++ b/hw/virtio/virtio-pci.h
@@ -27,6 +27,9 @@ 
 #include "hw/virtio/virtio-gpu.h"
 #include "hw/virtio/virtio-crypto.h"
 #include "hw/virtio/vhost-user-scsi.h"
+#ifdef CONFIG_VHOST_USER_BLK
+#include "hw/virtio/vhost-user-blk.h"
+#endif
 
 #ifdef CONFIG_VIRTFS
 #include "hw/9pfs/virtio-9p.h"
@@ -46,6 +49,7 @@  typedef struct VirtIOSerialPCI VirtIOSerialPCI;
 typedef struct VirtIONetPCI VirtIONetPCI;
 typedef struct VHostSCSIPCI VHostSCSIPCI;
 typedef struct VHostUserSCSIPCI VHostUserSCSIPCI;
+typedef struct VHostUserBlkPCI VHostUserBlkPCI;
 typedef struct VirtIORngPCI VirtIORngPCI;
 typedef struct VirtIOInputPCI VirtIOInputPCI;
 typedef struct VirtIOInputHIDPCI VirtIOInputHIDPCI;
@@ -241,6 +245,20 @@  struct VHostUserSCSIPCI {
     VHostUserSCSI vdev;
 };
 
+#ifdef CONFIG_VHOST_USER_BLK
+/*
+ * vhost-user-blk-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VHOST_USER_BLK_PCI "vhost-user-blk-pci"
+#define VHOST_USER_BLK_PCI(obj) \
+        OBJECT_CHECK(VHostUserBlkPCI, (obj), TYPE_VHOST_USER_BLK_PCI)
+
+struct VHostUserBlkPCI {
+    VirtIOPCIProxy parent_obj;
+    VHostUserBlk vdev;
+};
+#endif
+
 /*
  * virtio-blk-pci: This extends VirtioPCIProxy.
  */
diff --git a/include/hw/virtio/vhost-user-blk.h b/include/hw/virtio/vhost-user-blk.h
new file mode 100644
index 0000000..1d0079c
--- /dev/null
+++ b/include/hw/virtio/vhost-user-blk.h
@@ -0,0 +1,46 @@ 
+/*
+ * vhost-user-blk host device
+ * Copyright IBM, Corp. 2011
+ * Copyright(C) 2017 Intel Corporation.
+ *
+ * Authors:
+ *  Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *  Changpeng Liu <changpeng.liu@intel.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef VHOST_USER_BLK_H
+#define VHOST_USER_BLK_H
+
+#include "standard-headers/linux/virtio_blk.h"
+#include "qemu-common.h"
+#include "hw/qdev.h"
+#include "hw/block/block.h"
+#include "chardev/char-fe.h"
+#include "hw/virtio/vhost.h"
+
+#define TYPE_VHOST_USER_BLK "vhost-user-blk"
+#define VHOST_USER_BLK(obj) \
+        OBJECT_CHECK(VHostUserBlk, (obj), TYPE_VHOST_USER_BLK)
+
+typedef struct VHostUserBlk {
+    VirtIODevice parent_obj;
+    CharBackend chardev;
+    Error *migration_blocker;
+    int32_t bootindex;
+    uint64_t host_features;
+    BlockConf blkcfg;
+    uint64_t capacity;
+    char *size;
+    uint32_t max_segment_size;
+    uint32_t max_segment_num;
+    uint16_t num_queues;
+    uint32_t queue_size;
+    uint32_t config_wce;
+    struct vhost_dev dev;
+} VHostUserBlk;
+
+#endif