
[RFC,v2,05/13] vhost: Route guest->host notification through shadow virtqueue

Message ID 20210315194842.277740-6-eperezma@redhat.com
State New
Series vDPA software assisted live migration

Commit Message

Eugenio Perez Martin March 15, 2021, 7:48 p.m. UTC
Shadow virtqueue notifications forwarding is disabled when vhost_dev
stops, so code flow follows usual cleanup.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.h |   7 ++
 include/hw/virtio/vhost.h          |   4 +
 hw/virtio/vhost-shadow-virtqueue.c | 113 ++++++++++++++++++++++-
 hw/virtio/vhost.c                  | 143 ++++++++++++++++++++++++++++-
 4 files changed, 265 insertions(+), 2 deletions(-)

Comments

Jason Wang March 16, 2021, 7:18 a.m. UTC | #1
On 2021/3/16 3:48 AM, Eugenio Pérez wrote:
> Shadow virtqueue notifications forwarding is disabled when vhost_dev
> stops, so code flow follows usual cleanup.
>
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> ---
>   hw/virtio/vhost-shadow-virtqueue.h |   7 ++
>   include/hw/virtio/vhost.h          |   4 +
>   hw/virtio/vhost-shadow-virtqueue.c | 113 ++++++++++++++++++++++-
>   hw/virtio/vhost.c                  | 143 ++++++++++++++++++++++++++++-
>   4 files changed, 265 insertions(+), 2 deletions(-)
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> index 6cc18d6acb..c891c6510d 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.h
> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> @@ -17,6 +17,13 @@
>   
>   typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
>   
> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
> +                           unsigned idx,
> +                           VhostShadowVirtqueue *svq);
> +void vhost_shadow_vq_stop(struct vhost_dev *dev,
> +                          unsigned idx,
> +                          VhostShadowVirtqueue *svq);
> +
>   VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx);
>   
>   void vhost_shadow_vq_free(VhostShadowVirtqueue *vq);
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index ac963bf23d..7ffdf9aea0 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -55,6 +55,8 @@ struct vhost_iommu {
>       QLIST_ENTRY(vhost_iommu) iommu_next;
>   };
>   
> +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> +
>   typedef struct VhostDevConfigOps {
>       /* Vhost device config space changed callback
>        */
> @@ -83,7 +85,9 @@ struct vhost_dev {
>       uint64_t backend_cap;
>       bool started;
>       bool log_enabled;
> +    bool shadow_vqs_enabled;
>       uint64_t log_size;
> +    VhostShadowVirtqueue **shadow_vqs;


Any reason that you don't embed the shadow virtqueue into 
vhost_virtqueue structure?

(Note that there's a masked_notifier in struct vhost_virtqueue).
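
For illustration, the embedding could look roughly like this (the
shadow_vq field name is hypothetical, not something from the posted
patch):

struct vhost_virtqueue {
    /* ... existing members, e.g. kick, call, masked_notifier ... */

    /*
     * Hypothetical per-queue shadow virtqueue state, NULL while shadow
     * virtqueue mode is disabled. It would replace the shadow_vqs array
     * that this patch adds to struct vhost_dev.
     */
    VhostShadowVirtqueue *shadow_vq;
};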


>       Error *migration_blocker;
>       const VhostOps *vhost_ops;
>       void *opaque;
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index 4512e5b058..3e43399e9c 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -8,9 +8,12 @@
>    */
>   
>   #include "hw/virtio/vhost-shadow-virtqueue.h"
> +#include "hw/virtio/vhost.h"
> +
> +#include "standard-headers/linux/vhost_types.h"
>   
>   #include "qemu/error-report.h"
> -#include "qemu/event_notifier.h"
> +#include "qemu/main-loop.h"
>   
>   /* Shadow virtqueue to relay notifications */
>   typedef struct VhostShadowVirtqueue {
> @@ -18,14 +21,121 @@ typedef struct VhostShadowVirtqueue {
>       EventNotifier kick_notifier;
>       /* Shadow call notifier, sent to vhost */
>       EventNotifier call_notifier;
> +
> +    /*
> +     * Borrowed virtqueue's guest to host notifier.
> +     * To borrow it in this event notifier allows to register on the event
> +     * loop and access the associated shadow virtqueue easily. If we use the
> +     * VirtQueue, we don't have an easy way to retrieve it.


So this is something that worries me. It looks like a layer violation 
that makes the code harder to get right.

I wonder if it would be simpler to start from a vDPA dedicated shadow 
virtqueue implementation:

1) have the above fields embedded in the vhost_vdpa structure
2) Work at the level of 
vhost_vdpa_set_vring_kick()/vhost_vdpa_set_vring_call()

Then the layer is still isolated and you have a much simpler context to 
work in, where you don't need to care about a lot of synchronization:

1) vq masking
2) vhost dev start and stop

?


> +     *
> +     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
> +     */
> +    EventNotifier host_notifier;
> +
> +    /* Virtio queue shadowing */
> +    VirtQueue *vq;
>   } VhostShadowVirtqueue;
>   
> +/* Forward guest notifications */
> +static void vhost_handle_guest_kick(EventNotifier *n)
> +{
> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> +                                             host_notifier);
> +
> +    if (unlikely(!event_notifier_test_and_clear(n))) {
> +        return;
> +    }
> +
> +    event_notifier_set(&svq->kick_notifier);
> +}
> +
> +/*
> + * Restore the vhost guest to host notifier, i.e., disables svq effect.
> + */
> +static int vhost_shadow_vq_restore_vdev_host_notifier(struct vhost_dev *dev,
> +                                                     unsigned vhost_index,
> +                                                     VhostShadowVirtqueue *svq)
> +{
> +    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
> +    struct vhost_vring_file file = {
> +        .index = vhost_index,
> +        .fd = event_notifier_get_fd(vq_host_notifier),
> +    };
> +    int r;
> +
> +    /* Restore vhost kick */
> +    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
> +    return r ? -errno : 0;
> +}
> +
> +/*
> + * Start shadow virtqueue operation.
> + * @dev vhost device
> + * @hidx vhost virtqueue index
> + * @svq Shadow Virtqueue
> + */
> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
> +                           unsigned idx,
> +                           VhostShadowVirtqueue *svq)


It looks to me this assumes the vhost_dev is started before 
vhost_shadow_vq_start()?


> +{
> +    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
> +    struct vhost_vring_file file = {
> +        .index = idx,
> +        .fd = event_notifier_get_fd(&svq->kick_notifier),
> +    };
> +    int r;
> +
> +    /* Check that notifications are still going directly to vhost dev */
> +    assert(virtio_queue_is_host_notifier_enabled(svq->vq));
> +
> +    /*
> +     * event_notifier_set_handler already checks for guest's notifications if
> +     * they arrive in the switch, so there is no need to explicitely check for
> +     * them.
> +     */
> +    event_notifier_init_fd(&svq->host_notifier,
> +                           event_notifier_get_fd(vq_host_notifier));
> +    event_notifier_set_handler(&svq->host_notifier, vhost_handle_guest_kick);
> +
> +    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
> +    if (unlikely(r != 0)) {
> +        error_report("Couldn't set kick fd: %s", strerror(errno));
> +        goto err_set_vring_kick;
> +    }
> +
> +    return true;
> +
> +err_set_vring_kick:
> +    event_notifier_set_handler(&svq->host_notifier, NULL);
> +
> +    return false;
> +}
> +
> +/*
> + * Stop shadow virtqueue operation.
> + * @dev vhost device
> + * @idx vhost queue index
> + * @svq Shadow Virtqueue
> + */
> +void vhost_shadow_vq_stop(struct vhost_dev *dev,
> +                          unsigned idx,
> +                          VhostShadowVirtqueue *svq)
> +{
> +    int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq);
> +    if (unlikely(r < 0)) {
> +        error_report("Couldn't restore vq kick fd: %s", strerror(-r));
> +    }
> +
> +    event_notifier_set_handler(&svq->host_notifier, NULL);
> +}
> +
>   /*
>    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
>    * methods and file descriptors.
>    */
>   VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx)
>   {
> +    int vq_idx = dev->vq_index + idx;
>       g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
>       int r;
>   
> @@ -43,6 +153,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx)
>           goto err_init_call_notifier;
>       }
>   
> +    svq->vq = virtio_get_queue(dev->vdev, vq_idx);
>       return g_steal_pointer(&svq);
>   
>   err_init_call_notifier:
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 97f1bcfa42..4858a35df6 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -25,6 +25,7 @@
>   #include "exec/address-spaces.h"
>   #include "hw/virtio/virtio-bus.h"
>   #include "hw/virtio/virtio-access.h"
> +#include "hw/virtio/vhost-shadow-virtqueue.h"
>   #include "migration/blocker.h"
>   #include "migration/qemu-file-types.h"
>   #include "sysemu/dma.h"
> @@ -1219,6 +1220,74 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev,
>                          0, virtio_queue_get_desc_size(vdev, idx));
>   }
>   
> +static int vhost_sw_live_migration_stop(struct vhost_dev *dev)
> +{
> +    int idx;
> +
> +    dev->shadow_vqs_enabled = false;
> +
> +    for (idx = 0; idx < dev->nvqs; ++idx) {
> +        vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]);
> +        vhost_shadow_vq_free(dev->shadow_vqs[idx]);
> +    }
> +
> +    g_free(dev->shadow_vqs);
> +    dev->shadow_vqs = NULL;
> +    return 0;
> +}
> +
> +static int vhost_sw_live_migration_start(struct vhost_dev *dev)
> +{
> +    int idx, stop_idx;
> +
> +    dev->shadow_vqs = g_new0(VhostShadowVirtqueue *, dev->nvqs);
> +    for (idx = 0; idx < dev->nvqs; ++idx) {
> +        dev->shadow_vqs[idx] = vhost_shadow_vq_new(dev, idx);
> +        if (unlikely(dev->shadow_vqs[idx] == NULL)) {
> +            goto err_new;
> +        }
> +    }
> +
> +    dev->shadow_vqs_enabled = true;
> +    for (idx = 0; idx < dev->nvqs; ++idx) {
> +        bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]);
> +        if (unlikely(!ok)) {
> +            goto err_start;
> +        }
> +    }
> +
> +    return 0;
> +
> +err_start:
> +    dev->shadow_vqs_enabled = false;
> +    for (stop_idx = 0; stop_idx < idx; stop_idx++) {
> +        vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]);
> +    }
> +
> +err_new:
> +    for (idx = 0; idx < dev->nvqs; ++idx) {
> +        vhost_shadow_vq_free(dev->shadow_vqs[idx]);
> +    }
> +    g_free(dev->shadow_vqs);
> +
> +    return -1;
> +}
> +
> +static int vhost_sw_live_migration_enable(struct vhost_dev *dev,
> +                                          bool enable_lm)
> +{


So the live migration part should be done in a separate patch.

Thanks


> +    int r;
> +
> +    if (enable_lm == dev->shadow_vqs_enabled) {
> +        return 0;
> +    }
> +
> +    r = enable_lm ? vhost_sw_live_migration_start(dev)
> +                  : vhost_sw_live_migration_stop(dev);
> +
> +    return r;
> +}
> +
>   static void vhost_eventfd_add(MemoryListener *listener,
>                                 MemoryRegionSection *section,
>                                 bool match_data, uint64_t data, EventNotifier *e)
> @@ -1381,6 +1450,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
>       hdev->log = NULL;
>       hdev->log_size = 0;
>       hdev->log_enabled = false;
> +    hdev->shadow_vqs_enabled = false;
>       hdev->started = false;
>       memory_listener_register(&hdev->memory_listener, &address_space_memory);
>       QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
> @@ -1484,6 +1554,10 @@ void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
>       BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
>       int i, r;
>   
> +    if (hdev->shadow_vqs_enabled) {
> +        vhost_sw_live_migration_enable(hdev, false);
> +    }
> +
>       for (i = 0; i < hdev->nvqs; ++i) {
>           r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
>                                            false);
> @@ -1798,6 +1872,7 @@ fail_features:
>   void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
>   {
>       int i;
> +    bool is_shadow_vqs_enabled = hdev->shadow_vqs_enabled;
>   
>       /* should only be called after backend is connected */
>       assert(hdev->vhost_ops);
> @@ -1805,7 +1880,16 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
>       if (hdev->vhost_ops->vhost_dev_start) {
>           hdev->vhost_ops->vhost_dev_start(hdev, false);
>       }
> +    if (is_shadow_vqs_enabled) {
> +        /* Shadow virtqueue will be stopped */
> +        hdev->shadow_vqs_enabled = false;
> +    }
>       for (i = 0; i < hdev->nvqs; ++i) {
> +        if (is_shadow_vqs_enabled) {
> +            vhost_shadow_vq_stop(hdev, i, hdev->shadow_vqs[i]);
> +            vhost_shadow_vq_free(hdev->shadow_vqs[i]);
> +        }
> +
>           vhost_virtqueue_stop(hdev,
>                                vdev,
>                                hdev->vqs + i,
> @@ -1819,6 +1903,8 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
>           memory_listener_unregister(&hdev->iommu_listener);
>       }
>       vhost_log_put(hdev, true);
> +    g_free(hdev->shadow_vqs);
> +    hdev->shadow_vqs_enabled = false;
>       hdev->started = false;
>       hdev->vdev = NULL;
>   }
> @@ -1835,5 +1921,60 @@ int vhost_net_set_backend(struct vhost_dev *hdev,
>   
>   void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp)
>   {
> -    error_setg(errp, "Shadow virtqueue still not implemented");
> +    struct vhost_dev *hdev, *hdev_err;
> +    VirtIODevice *vdev;
> +    const char *err_cause = NULL;
> +    int r;
> +    ErrorClass err_class = ERROR_CLASS_GENERIC_ERROR;
> +
> +    QLIST_FOREACH(hdev, &vhost_devices, entry) {
> +        if (hdev->vdev && 0 == strcmp(hdev->vdev->name, name)) {
> +            vdev = hdev->vdev;
> +            break;
> +        }
> +    }
> +
> +    if (!hdev) {
> +        err_class = ERROR_CLASS_DEVICE_NOT_FOUND;
> +        err_cause = "Device not found";
> +        goto not_found_err;
> +    }
> +
> +    for ( ; hdev; hdev = QLIST_NEXT(hdev, entry)) {
> +        if (vdev != hdev->vdev) {
> +            continue;
> +        }
> +
> +        if (!hdev->started) {
> +            err_cause = "Device is not started";
> +            goto err;
> +        }
> +
> +        r = vhost_sw_live_migration_enable(hdev, enable);
> +        if (unlikely(r)) {
> +            err_cause = "Error enabling (see monitor)";
> +            goto err;
> +        }
> +    }
> +
> +    return;
> +
> +err:
> +    QLIST_FOREACH(hdev_err, &vhost_devices, entry) {
> +        if (hdev_err == hdev) {
> +            break;
> +        }
> +
> +        if (vdev != hdev->vdev) {
> +            continue;
> +        }
> +
> +        vhost_sw_live_migration_enable(hdev, !enable);
> +    }
> +
> +not_found_err:
> +    if (err_cause) {
> +        error_set(errp, err_class,
> +                  "Can't enable shadow vq on %s: %s", name, err_cause);
> +    }
>   }
Eugenio Perez Martin March 16, 2021, 10:31 a.m. UTC | #2
On Tue, Mar 16, 2021 at 8:18 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2021/3/16 3:48 AM, Eugenio Pérez wrote:
> > Shadow virtqueue notifications forwarding is disabled when vhost_dev
> > stops, so code flow follows usual cleanup.
> >
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > ---
> >   hw/virtio/vhost-shadow-virtqueue.h |   7 ++
> >   include/hw/virtio/vhost.h          |   4 +
> >   hw/virtio/vhost-shadow-virtqueue.c | 113 ++++++++++++++++++++++-
> >   hw/virtio/vhost.c                  | 143 ++++++++++++++++++++++++++++-
> >   4 files changed, 265 insertions(+), 2 deletions(-)
> >
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > index 6cc18d6acb..c891c6510d 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > @@ -17,6 +17,13 @@
> >
> >   typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> >
> > +bool vhost_shadow_vq_start(struct vhost_dev *dev,
> > +                           unsigned idx,
> > +                           VhostShadowVirtqueue *svq);
> > +void vhost_shadow_vq_stop(struct vhost_dev *dev,
> > +                          unsigned idx,
> > +                          VhostShadowVirtqueue *svq);
> > +
> >   VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx);
> >
> >   void vhost_shadow_vq_free(VhostShadowVirtqueue *vq);
> > diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> > index ac963bf23d..7ffdf9aea0 100644
> > --- a/include/hw/virtio/vhost.h
> > +++ b/include/hw/virtio/vhost.h
> > @@ -55,6 +55,8 @@ struct vhost_iommu {
> >       QLIST_ENTRY(vhost_iommu) iommu_next;
> >   };
> >
> > +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> > +
> >   typedef struct VhostDevConfigOps {
> >       /* Vhost device config space changed callback
> >        */
> > @@ -83,7 +85,9 @@ struct vhost_dev {
> >       uint64_t backend_cap;
> >       bool started;
> >       bool log_enabled;
> > +    bool shadow_vqs_enabled;
> >       uint64_t log_size;
> > +    VhostShadowVirtqueue **shadow_vqs;
>
>
> Any reason that you don't embed the shadow virtqueue into
> vhost_virtqueue structure?
>

Not really, it could be relatively big and I would prefer SVQ
members/methods to remain hidden from any other part that includes
vhost.h. But it could be changed, for sure.

> (Note that there's a masked_notifier in struct vhost_virtqueue).
>

They are used differently: in SVQ the masked notifier is a pointer,
and if it's NULL the SVQ code knows that the device is not masked. The
vhost_virtqueue is the real owner.

It could be replaced by a boolean in SVQ or something like that. I
experimented with a tri-state too (UNMASKED, MASKED, MASKED_NOTIFIED)
and let the vhost.c code manage all the transitions, but I find the
pointer clearer, since it is the more natural fit for the existing
vhost_virtqueue_mask and vhost_virtqueue_pending functions.
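
For illustration only, the pointer convention reads roughly like this
on the path that forwards used buffers to the guest (the svq field
names here are placeholders, not part of this commit):

EventNotifier *masked = qatomic_rcu_read(&svq->masked_notifier);

if (masked) {
    /* Masked: latch the interrupt in the masked notifier so
     * vhost_virtqueue_pending can report it on unmask. */
    event_notifier_set(masked);
} else {
    /* Unmasked: notify the guest directly */
    event_notifier_set(svq->guest_call_notifier);
}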

This masking/unmasking is the part I dislike the most from this
series, so I'm very open to alternatives.

>
> >       Error *migration_blocker;
> >       const VhostOps *vhost_ops;
> >       void *opaque;
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > index 4512e5b058..3e43399e9c 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > @@ -8,9 +8,12 @@
> >    */
> >
> >   #include "hw/virtio/vhost-shadow-virtqueue.h"
> > +#include "hw/virtio/vhost.h"
> > +
> > +#include "standard-headers/linux/vhost_types.h"
> >
> >   #include "qemu/error-report.h"
> > -#include "qemu/event_notifier.h"
> > +#include "qemu/main-loop.h"
> >
> >   /* Shadow virtqueue to relay notifications */
> >   typedef struct VhostShadowVirtqueue {
> > @@ -18,14 +21,121 @@ typedef struct VhostShadowVirtqueue {
> >       EventNotifier kick_notifier;
> >       /* Shadow call notifier, sent to vhost */
> >       EventNotifier call_notifier;
> > +
> > +    /*
> > +     * Borrowed virtqueue's guest to host notifier.
> > +     * To borrow it in this event notifier allows to register on the event
> > +     * loop and access the associated shadow virtqueue easily. If we use the
> > +     * VirtQueue, we don't have an easy way to retrieve it.
>
>
> So this is something that worries me. It looks like a layer violation
> that makes the codes harder to work correctly.
>

I don't follow you here.

The vhost code already depends on the virtqueue in the same sense:
virtio_queue_get_host_notifier is called in vhost_virtqueue_start, so
if that behavior ever changes, vhost is unlikely to keep working
without changes anyway. vhost_virtqueue has kick/call int members where
I think the fd should actually be stored, but they are never used as
far as I can see.

The previous RFC did rely on vhost_dev_disable_notifiers. From its documentation:
/* Stop processing guest IO notifications in vhost.
 * Start processing them in qemu.
 ...
But it was easier for that mode to miss a notification, since
virtio_bus_set_host_notifier creates a new host_notifier right away.
So I decided to reuse the file descriptor already sent to vhost in
regular operation mode, so that guest-related resources change less.

Having said that, maybe it's useful to assert that
vhost_dev_{enable,disable}_notifiers are never called in shadow
virtqueue mode. Also, it could be useful to retrieve the notifier from
virtio_bus rather than from the raw shadow virtqueue, so all get/set
operations go through it. Would that make more sense?
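
As a minimal sketch, that assertion would be something like the
following at the top of vhost_dev_enable_notifiers() /
vhost_dev_disable_notifiers() (the placement is only an assumption,
not part of this patch):

/* Shadow virtqueue mode owns the guest-to-host notifier fd, so the
 * generic ioeventfd setup/teardown must not run while it is enabled. */
assert(!hdev->shadow_vqs_enabled);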

> I wonder if it would be simpler to start from a vDPA dedicated shadow
> virtqueue implementation:
>
> 1) have the above fields embeded in vhost_vdpa structure
> 2) Work at the level of
> vhost_vdpa_set_vring_kick()/vhost_vdpa_set_vring_call()
>

This notifier is never sent to the device in shadow virtqueue mode.
It's there for SVQ to react to the guest's notifications, registering
it on its main event loop [1]. So if I make these changes the way I
understand them, SVQ would still rely on this borrowed EventNotifier,
and it would send the newly created kick_notifier of
VhostShadowVirtqueue to the vDPA device.

> Then the layer is still isolated and you have a much simpler context to
> work that you don't need to care a lot of synchornization:
>
> 1) vq masking

This EventNotifier is not used for masking; it does not change from
the start of shadow virtqueue operation through its end. The call fd
sent to the vhost/vdpa device does not change on masking/unmasking in
shadow virtqueue mode either. I will try to document it better.

I think that we will need to handle synchronization with
masking/unmasking from the guest and dynamically enabling SVQ
operation mode, since they can happen at the same time as long as we
let the guest run. There may be better ways of synchronizing them of
course, but I don't see how moving to the vhost-vdpa backend helps
with this. Please expand if I've missed it.

Or do you mean to forbid regular <-> SVQ operation mode transitions and delay it
to future patchsets?

> 2) vhost dev start and stop
>
> ?
>
>
> > +     *
> > +     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
> > +     */
> > +    EventNotifier host_notifier;
> > +
> > +    /* Virtio queue shadowing */
> > +    VirtQueue *vq;
> >   } VhostShadowVirtqueue;
> >
> > +/* Forward guest notifications */
> > +static void vhost_handle_guest_kick(EventNotifier *n)
> > +{
> > +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> > +                                             host_notifier);
> > +
> > +    if (unlikely(!event_notifier_test_and_clear(n))) {
> > +        return;
> > +    }
> > +
> > +    event_notifier_set(&svq->kick_notifier);
> > +}
> > +
> > +/*
> > + * Restore the vhost guest to host notifier, i.e., disables svq effect.
> > + */
> > +static int vhost_shadow_vq_restore_vdev_host_notifier(struct vhost_dev *dev,
> > +                                                     unsigned vhost_index,
> > +                                                     VhostShadowVirtqueue *svq)
> > +{
> > +    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
> > +    struct vhost_vring_file file = {
> > +        .index = vhost_index,
> > +        .fd = event_notifier_get_fd(vq_host_notifier),
> > +    };
> > +    int r;
> > +
> > +    /* Restore vhost kick */
> > +    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
> > +    return r ? -errno : 0;
> > +}
> > +
> > +/*
> > + * Start shadow virtqueue operation.
> > + * @dev vhost device
> > + * @hidx vhost virtqueue index
> > + * @svq Shadow Virtqueue
> > + */
> > +bool vhost_shadow_vq_start(struct vhost_dev *dev,
> > +                           unsigned idx,
> > +                           VhostShadowVirtqueue *svq)
>
>
> It looks to me this assumes the vhost_dev is started before
> vhost_shadow_vq_start()?
>

Right.

>
> > +{
> > +    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
> > +    struct vhost_vring_file file = {
> > +        .index = idx,
> > +        .fd = event_notifier_get_fd(&svq->kick_notifier),
> > +    };
> > +    int r;
> > +
> > +    /* Check that notifications are still going directly to vhost dev */
> > +    assert(virtio_queue_is_host_notifier_enabled(svq->vq));
> > +
> > +    /*
> > +     * event_notifier_set_handler already checks for guest's notifications if
> > +     * they arrive in the switch, so there is no need to explicitely check for
> > +     * them.
> > +     */
> > +    event_notifier_init_fd(&svq->host_notifier,
> > +                           event_notifier_get_fd(vq_host_notifier));
> > +    event_notifier_set_handler(&svq->host_notifier, vhost_handle_guest_kick);
> > +
> > +    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
> > +    if (unlikely(r != 0)) {
> > +        error_report("Couldn't set kick fd: %s", strerror(errno));
> > +        goto err_set_vring_kick;
> > +    }
> > +
> > +    return true;
> > +
> > +err_set_vring_kick:
> > +    event_notifier_set_handler(&svq->host_notifier, NULL);
> > +
> > +    return false;
> > +}
> > +
> > +/*
> > + * Stop shadow virtqueue operation.
> > + * @dev vhost device
> > + * @idx vhost queue index
> > + * @svq Shadow Virtqueue
> > + */
> > +void vhost_shadow_vq_stop(struct vhost_dev *dev,
> > +                          unsigned idx,
> > +                          VhostShadowVirtqueue *svq)
> > +{
> > +    int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq);
> > +    if (unlikely(r < 0)) {
> > +        error_report("Couldn't restore vq kick fd: %s", strerror(-r));
> > +    }
> > +
> > +    event_notifier_set_handler(&svq->host_notifier, NULL);
> > +}
> > +
> >   /*
> >    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> >    * methods and file descriptors.
> >    */
> >   VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx)
> >   {
> > +    int vq_idx = dev->vq_index + idx;
> >       g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
> >       int r;
> >
> > @@ -43,6 +153,7 @@ VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx)
> >           goto err_init_call_notifier;
> >       }
> >
> > +    svq->vq = virtio_get_queue(dev->vdev, vq_idx);
> >       return g_steal_pointer(&svq);
> >
> >   err_init_call_notifier:
> > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > index 97f1bcfa42..4858a35df6 100644
> > --- a/hw/virtio/vhost.c
> > +++ b/hw/virtio/vhost.c
> > @@ -25,6 +25,7 @@
> >   #include "exec/address-spaces.h"
> >   #include "hw/virtio/virtio-bus.h"
> >   #include "hw/virtio/virtio-access.h"
> > +#include "hw/virtio/vhost-shadow-virtqueue.h"
> >   #include "migration/blocker.h"
> >   #include "migration/qemu-file-types.h"
> >   #include "sysemu/dma.h"
> > @@ -1219,6 +1220,74 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev,
> >                          0, virtio_queue_get_desc_size(vdev, idx));
> >   }
> >
> > +static int vhost_sw_live_migration_stop(struct vhost_dev *dev)
> > +{
> > +    int idx;
> > +
> > +    dev->shadow_vqs_enabled = false;
> > +
> > +    for (idx = 0; idx < dev->nvqs; ++idx) {
> > +        vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]);
> > +        vhost_shadow_vq_free(dev->shadow_vqs[idx]);
> > +    }
> > +
> > +    g_free(dev->shadow_vqs);
> > +    dev->shadow_vqs = NULL;
> > +    return 0;
> > +}
> > +
> > +static int vhost_sw_live_migration_start(struct vhost_dev *dev)
> > +{
> > +    int idx, stop_idx;
> > +
> > +    dev->shadow_vqs = g_new0(VhostShadowVirtqueue *, dev->nvqs);
> > +    for (idx = 0; idx < dev->nvqs; ++idx) {
> > +        dev->shadow_vqs[idx] = vhost_shadow_vq_new(dev, idx);
> > +        if (unlikely(dev->shadow_vqs[idx] == NULL)) {
> > +            goto err_new;
> > +        }
> > +    }
> > +
> > +    dev->shadow_vqs_enabled = true;
> > +    for (idx = 0; idx < dev->nvqs; ++idx) {
> > +        bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]);
> > +        if (unlikely(!ok)) {
> > +            goto err_start;
> > +        }
> > +    }
> > +
> > +    return 0;
> > +
> > +err_start:
> > +    dev->shadow_vqs_enabled = false;
> > +    for (stop_idx = 0; stop_idx < idx; stop_idx++) {
> > +        vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]);
> > +    }
> > +
> > +err_new:
> > +    for (idx = 0; idx < dev->nvqs; ++idx) {
> > +        vhost_shadow_vq_free(dev->shadow_vqs[idx]);
> > +    }
> > +    g_free(dev->shadow_vqs);
> > +
> > +    return -1;
> > +}
> > +
> > +static int vhost_sw_live_migration_enable(struct vhost_dev *dev,
> > +                                          bool enable_lm)
> > +{
>
>
> So the live migration part should be done in an separted patch.
>

Right, I missed the renaming of this one.

> Thanks
>

[1] or, hopefully in the future, in an independent aio context.


>
> > +    int r;
> > +
> > +    if (enable_lm == dev->shadow_vqs_enabled) {
> > +        return 0;
> > +    }
> > +
> > +    r = enable_lm ? vhost_sw_live_migration_start(dev)
> > +                  : vhost_sw_live_migration_stop(dev);
> > +
> > +    return r;
> > +}
> > +
> >   static void vhost_eventfd_add(MemoryListener *listener,
> >                                 MemoryRegionSection *section,
> >                                 bool match_data, uint64_t data, EventNotifier *e)
> > @@ -1381,6 +1450,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
> >       hdev->log = NULL;
> >       hdev->log_size = 0;
> >       hdev->log_enabled = false;
> > +    hdev->shadow_vqs_enabled = false;
> >       hdev->started = false;
> >       memory_listener_register(&hdev->memory_listener, &address_space_memory);
> >       QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
> > @@ -1484,6 +1554,10 @@ void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
> >       BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
> >       int i, r;
> >
> > +    if (hdev->shadow_vqs_enabled) {
> > +        vhost_sw_live_migration_enable(hdev, false);
> > +    }
> > +
> >       for (i = 0; i < hdev->nvqs; ++i) {
> >           r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
> >                                            false);
> > @@ -1798,6 +1872,7 @@ fail_features:
> >   void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
> >   {
> >       int i;
> > +    bool is_shadow_vqs_enabled = hdev->shadow_vqs_enabled;
> >
> >       /* should only be called after backend is connected */
> >       assert(hdev->vhost_ops);
> > @@ -1805,7 +1880,16 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
> >       if (hdev->vhost_ops->vhost_dev_start) {
> >           hdev->vhost_ops->vhost_dev_start(hdev, false);
> >       }
> > +    if (is_shadow_vqs_enabled) {
> > +        /* Shadow virtqueue will be stopped */
> > +        hdev->shadow_vqs_enabled = false;
> > +    }
> >       for (i = 0; i < hdev->nvqs; ++i) {
> > +        if (is_shadow_vqs_enabled) {
> > +            vhost_shadow_vq_stop(hdev, i, hdev->shadow_vqs[i]);
> > +            vhost_shadow_vq_free(hdev->shadow_vqs[i]);
> > +        }
> > +
> >           vhost_virtqueue_stop(hdev,
> >                                vdev,
> >                                hdev->vqs + i,
> > @@ -1819,6 +1903,8 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
> >           memory_listener_unregister(&hdev->iommu_listener);
> >       }
> >       vhost_log_put(hdev, true);
> > +    g_free(hdev->shadow_vqs);
> > +    hdev->shadow_vqs_enabled = false;
> >       hdev->started = false;
> >       hdev->vdev = NULL;
> >   }
> > @@ -1835,5 +1921,60 @@ int vhost_net_set_backend(struct vhost_dev *hdev,
> >
> >   void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp)
> >   {
> > -    error_setg(errp, "Shadow virtqueue still not implemented");
> > +    struct vhost_dev *hdev, *hdev_err;
> > +    VirtIODevice *vdev;
> > +    const char *err_cause = NULL;
> > +    int r;
> > +    ErrorClass err_class = ERROR_CLASS_GENERIC_ERROR;
> > +
> > +    QLIST_FOREACH(hdev, &vhost_devices, entry) {
> > +        if (hdev->vdev && 0 == strcmp(hdev->vdev->name, name)) {
> > +            vdev = hdev->vdev;
> > +            break;
> > +        }
> > +    }
> > +
> > +    if (!hdev) {
> > +        err_class = ERROR_CLASS_DEVICE_NOT_FOUND;
> > +        err_cause = "Device not found";
> > +        goto not_found_err;
> > +    }
> > +
> > +    for ( ; hdev; hdev = QLIST_NEXT(hdev, entry)) {
> > +        if (vdev != hdev->vdev) {
> > +            continue;
> > +        }
> > +
> > +        if (!hdev->started) {
> > +            err_cause = "Device is not started";
> > +            goto err;
> > +        }
> > +
> > +        r = vhost_sw_live_migration_enable(hdev, enable);
> > +        if (unlikely(r)) {
> > +            err_cause = "Error enabling (see monitor)";
> > +            goto err;
> > +        }
> > +    }
> > +
> > +    return;
> > +
> > +err:
> > +    QLIST_FOREACH(hdev_err, &vhost_devices, entry) {
> > +        if (hdev_err == hdev) {
> > +            break;
> > +        }
> > +
> > +        if (vdev != hdev->vdev) {
> > +            continue;
> > +        }
> > +
> > +        vhost_sw_live_migration_enable(hdev, !enable);
> > +    }
> > +
> > +not_found_err:
> > +    if (err_cause) {
> > +        error_set(errp, err_class,
> > +                  "Can't enable shadow vq on %s: %s", name, err_cause);
> > +    }
> >   }
>
Jason Wang March 17, 2021, 2:05 a.m. UTC | #3
On 2021/3/16 6:31 PM, Eugenio Perez Martin wrote:
> On Tue, Mar 16, 2021 at 8:18 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> On 2021/3/16 3:48 AM, Eugenio Pérez wrote:
>>> Shadow virtqueue notifications forwarding is disabled when vhost_dev
>>> stops, so code flow follows usual cleanup.
>>>
>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
>>> ---
>>>    hw/virtio/vhost-shadow-virtqueue.h |   7 ++
>>>    include/hw/virtio/vhost.h          |   4 +
>>>    hw/virtio/vhost-shadow-virtqueue.c | 113 ++++++++++++++++++++++-
>>>    hw/virtio/vhost.c                  | 143 ++++++++++++++++++++++++++++-
>>>    4 files changed, 265 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
>>> index 6cc18d6acb..c891c6510d 100644
>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
>>> @@ -17,6 +17,13 @@
>>>
>>>    typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
>>>
>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
>>> +                           unsigned idx,
>>> +                           VhostShadowVirtqueue *svq);
>>> +void vhost_shadow_vq_stop(struct vhost_dev *dev,
>>> +                          unsigned idx,
>>> +                          VhostShadowVirtqueue *svq);
>>> +
>>>    VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx);
>>>
>>>    void vhost_shadow_vq_free(VhostShadowVirtqueue *vq);
>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>>> index ac963bf23d..7ffdf9aea0 100644
>>> --- a/include/hw/virtio/vhost.h
>>> +++ b/include/hw/virtio/vhost.h
>>> @@ -55,6 +55,8 @@ struct vhost_iommu {
>>>        QLIST_ENTRY(vhost_iommu) iommu_next;
>>>    };
>>>
>>> +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
>>> +
>>>    typedef struct VhostDevConfigOps {
>>>        /* Vhost device config space changed callback
>>>         */
>>> @@ -83,7 +85,9 @@ struct vhost_dev {
>>>        uint64_t backend_cap;
>>>        bool started;
>>>        bool log_enabled;
>>> +    bool shadow_vqs_enabled;
>>>        uint64_t log_size;
>>> +    VhostShadowVirtqueue **shadow_vqs;
>>
>> Any reason that you don't embed the shadow virtqueue into
>> vhost_virtqueue structure?
>>
> Not really, it could be relatively big and I would prefer SVQ
> members/methods to remain hidden from any other part that includes
> vhost.h. But it could be changed, for sure.
>
>> (Note that there's a masked_notifier in struct vhost_virtqueue).
>>
> They are used differently: in SVQ the masked notifier is a pointer,
> and if it's NULL the SVQ code knows that device is not masked. The
> vhost_virtqueue is the real owner.


Yes, but it's an example of embedding auxiliary data structures in the 
vhost_virtqueue.


>
> It could be replaced by a boolean in SVQ or something like that, I
> experimented with a tri-state too (UNMASKED, MASKED, MASKED_NOTIFIED)
> and let vhost.c code to manage all the transitions. But I find clearer
> the pointer use, since it's the more natural for the
> vhost_virtqueue_mask, vhost_virtqueue_pending existing functions.
>
> This masking/unmasking is the part I dislike the most from this
> series, so I'm very open to alternatives.


See below. I think we don't even need to care about that.


>
>>>        Error *migration_blocker;
>>>        const VhostOps *vhost_ops;
>>>        void *opaque;
>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
>>> index 4512e5b058..3e43399e9c 100644
>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
>>> @@ -8,9 +8,12 @@
>>>     */
>>>
>>>    #include "hw/virtio/vhost-shadow-virtqueue.h"
>>> +#include "hw/virtio/vhost.h"
>>> +
>>> +#include "standard-headers/linux/vhost_types.h"
>>>
>>>    #include "qemu/error-report.h"
>>> -#include "qemu/event_notifier.h"
>>> +#include "qemu/main-loop.h"
>>>
>>>    /* Shadow virtqueue to relay notifications */
>>>    typedef struct VhostShadowVirtqueue {
>>> @@ -18,14 +21,121 @@ typedef struct VhostShadowVirtqueue {
>>>        EventNotifier kick_notifier;
>>>        /* Shadow call notifier, sent to vhost */
>>>        EventNotifier call_notifier;
>>> +
>>> +    /*
>>> +     * Borrowed virtqueue's guest to host notifier.
>>> +     * To borrow it in this event notifier allows to register on the event
>>> +     * loop and access the associated shadow virtqueue easily. If we use the
>>> +     * VirtQueue, we don't have an easy way to retrieve it.
>>
>> So this is something that worries me. It looks like a layer violation
>> that makes the codes harder to work correctly.
>>
> I don't follow you here.
>
> The vhost code already depends on virtqueue in the same sense:
> virtio_queue_get_host_notifier is called on vhost_virtqueue_start. So
> if this behavior ever changes it is unlikely for vhost to keep working
> without changes. vhost_virtqueue has a kick/call int where I think it
> should be stored actually, but they are never used as far as I see.
>
> Previous RFC did rely on vhost_dev_disable_notifiers. From its documentation:
> /* Stop processing guest IO notifications in vhost.
>   * Start processing them in qemu.
>   ...
> But it was easier for this mode to miss a notification, since they
> create a new host_notifier in virtio_bus_set_host_notifier right away.
> So I decided to use the file descriptor already sent to vhost in
> regular operation mode, so guest-related resources change less.
>
> Having said that, maybe it's useful to assert that
> vhost_dev_{enable,disable}_notifiers are never called on shadow
> virtqueue mode. Also, it could be useful to retrieve it from
> virtio_bus, not raw shadow virtqueue, so all get/set are performed
> from it. Would that make more sense?
>
>> I wonder if it would be simpler to start from a vDPA dedicated shadow
>> virtqueue implementation:
>>
>> 1) have the above fields embeded in vhost_vdpa structure
>> 2) Work at the level of
>> vhost_vdpa_set_vring_kick()/vhost_vdpa_set_vring_call()
>>
> This notifier is never sent to the device in shadow virtqueue mode.
> It's for SVQ to react to guest's notifications, registering it on its
> main event loop [1]. So if I perform these changes the way I
> understand them, SVQ would still rely on this borrowed EventNotifier,
> and it would send to the vDPA device the newly created kick_notifier
> of VhostShadowVirtqueue.


The point is that the vhost code should be loosely coupled with virtio. If 
you try to "borrow" an EventNotifier from virtio, you need to deal with a 
lot of synchronization. An example is the masking code.


>
>> Then the layer is still isolated and you have a much simpler context to
>> work that you don't need to care a lot of synchornization:
>>
>> 1) vq masking
> This EventNotifier is not used for masking, it does not change from
> the start of the shadow virtqueue operation through its end. Call fd
> sent to vhost/vdpa device does not change either in shadow virtqueue
> mode operation with masking/unmasking. I will try to document it
> better.
>
> I think that we will need to handle synchronization with
> masking/unmasking from the guest and dynamically enabling SVQ
> operation mode, since they can happen at the same time as long as we
> let the guest run. There may be better ways of synchronizing them of
> course, but I don't see how moving to the vhost-vdpa backend helps
> with this. Please expand if I've missed it.
>
> Or do you mean to forbid regular <-> SVQ operation mode transitions and delay it
> to future patchsets?


So my idea is to do all the shadow virtqueue work in the vhost-vDPA code and 
hide it from the upper layers like virtio. This means it works at the 
vhost level, which only sees vhost_vring_file. When enabled, what it 
needs is just:

1) switch to the svq kickfd and relay the guest's ioeventfd to the svq kickfd
2) switch to the svq callfd and relay the svq callfd to the irqfd

It will still behave like a vhost backend: the switching is done 
internally in vhost-vDPA and is totally transparent to the virtio 
code of QEMU.

E.g.:

1) in the case of guest notifier masking, we don't need to do anything 
since the virtio code will install another irqfd for us.
2) it is easy to deal with vhost dev start and stop

The advantages are obvious: it is simple and easy to implement.
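
A rough sketch of point 1) on top of the existing
vhost_vdpa_set_vring_kick() (the shadow_vqs fields and the
vhost_svq_*() helpers are hypothetical names, only meant to illustrate
the idea):

static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;

    if (v->shadow_vqs_enabled) {
        /* Hypothetical per-device SVQ array (indexing simplified) */
        VhostShadowVirtqueue *svq = v->shadow_vqs[file->index];
        struct vhost_vring_file device_file = {
            .index = file->index,
            /* The device is kicked by SVQ, not directly by the guest */
            .fd = vhost_svq_get_dev_kick_fd(svq), /* hypothetical */
        };

        /* Remember the guest's ioeventfd so SVQ can poll it and relay
         * kicks into the fd programmed above (hypothetical helper) */
        vhost_svq_set_guest_kick_fd(svq, file->fd);

        return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, &device_file);
    }

    return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
}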


>
>> 2) vhost dev start and stop
>>
>> ?
>>
>>
>>> +     *
>>> +     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
>>> +     */
>>> +    EventNotifier host_notifier;
>>> +
>>> +    /* Virtio queue shadowing */
>>> +    VirtQueue *vq;
>>>    } VhostShadowVirtqueue;
>>>
>>> +/* Forward guest notifications */
>>> +static void vhost_handle_guest_kick(EventNotifier *n)
>>> +{
>>> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
>>> +                                             host_notifier);
>>> +
>>> +    if (unlikely(!event_notifier_test_and_clear(n))) {
>>> +        return;
>>> +    }
>>> +
>>> +    event_notifier_set(&svq->kick_notifier);
>>> +}
>>> +
>>> +/*
>>> + * Restore the vhost guest to host notifier, i.e., disables svq effect.
>>> + */
>>> +static int vhost_shadow_vq_restore_vdev_host_notifier(struct vhost_dev *dev,
>>> +                                                     unsigned vhost_index,
>>> +                                                     VhostShadowVirtqueue *svq)
>>> +{
>>> +    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
>>> +    struct vhost_vring_file file = {
>>> +        .index = vhost_index,
>>> +        .fd = event_notifier_get_fd(vq_host_notifier),
>>> +    };
>>> +    int r;
>>> +
>>> +    /* Restore vhost kick */
>>> +    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
>>> +    return r ? -errno : 0;
>>> +}
>>> +
>>> +/*
>>> + * Start shadow virtqueue operation.
>>> + * @dev vhost device
>>> + * @hidx vhost virtqueue index
>>> + * @svq Shadow Virtqueue
>>> + */
>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
>>> +                           unsigned idx,
>>> +                           VhostShadowVirtqueue *svq)
>>
>> It looks to me this assumes the vhost_dev is started before
>> vhost_shadow_vq_start()?
>>
> Right.


This might not be true. The guest may enable and disable virtio drivers after 
the shadow virtqueue is started. You need to deal with that.

Thanks
Eugenio Perez Martin March 17, 2021, 4:47 p.m. UTC | #4
On Wed, Mar 17, 2021 at 3:05 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2021/3/16 6:31 PM, Eugenio Perez Martin wrote:
> > On Tue, Mar 16, 2021 at 8:18 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> On 2021/3/16 3:48 AM, Eugenio Pérez wrote:
> >>> Shadow virtqueue notifications forwarding is disabled when vhost_dev
> >>> stops, so code flow follows usual cleanup.
> >>>
> >>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> >>> ---
> >>>    hw/virtio/vhost-shadow-virtqueue.h |   7 ++
> >>>    include/hw/virtio/vhost.h          |   4 +
> >>>    hw/virtio/vhost-shadow-virtqueue.c | 113 ++++++++++++++++++++++-
> >>>    hw/virtio/vhost.c                  | 143 ++++++++++++++++++++++++++++-
> >>>    4 files changed, 265 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> >>> index 6cc18d6acb..c891c6510d 100644
> >>> --- a/hw/virtio/vhost-shadow-virtqueue.h
> >>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> >>> @@ -17,6 +17,13 @@
> >>>
> >>>    typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> >>>
> >>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
> >>> +                           unsigned idx,
> >>> +                           VhostShadowVirtqueue *svq);
> >>> +void vhost_shadow_vq_stop(struct vhost_dev *dev,
> >>> +                          unsigned idx,
> >>> +                          VhostShadowVirtqueue *svq);
> >>> +
> >>>    VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx);
> >>>
> >>>    void vhost_shadow_vq_free(VhostShadowVirtqueue *vq);
> >>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> >>> index ac963bf23d..7ffdf9aea0 100644
> >>> --- a/include/hw/virtio/vhost.h
> >>> +++ b/include/hw/virtio/vhost.h
> >>> @@ -55,6 +55,8 @@ struct vhost_iommu {
> >>>        QLIST_ENTRY(vhost_iommu) iommu_next;
> >>>    };
> >>>
> >>> +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> >>> +
> >>>    typedef struct VhostDevConfigOps {
> >>>        /* Vhost device config space changed callback
> >>>         */
> >>> @@ -83,7 +85,9 @@ struct vhost_dev {
> >>>        uint64_t backend_cap;
> >>>        bool started;
> >>>        bool log_enabled;
> >>> +    bool shadow_vqs_enabled;
> >>>        uint64_t log_size;
> >>> +    VhostShadowVirtqueue **shadow_vqs;
> >>
> >> Any reason that you don't embed the shadow virtqueue into
> >> vhost_virtqueue structure?
> >>
> > Not really, it could be relatively big and I would prefer SVQ
> > members/methods to remain hidden from any other part that includes
> > vhost.h. But it could be changed, for sure.
> >
> >> (Note that there's a masked_notifier in struct vhost_virtqueue).
> >>
> > They are used differently: in SVQ the masked notifier is a pointer,
> > and if it's NULL the SVQ code knows that device is not masked. The
> > vhost_virtqueue is the real owner.
>
>
> Yes, but it's an example for embedding auxciliary data structures in the
> vhost_virtqueue.
>
>
> >
> > It could be replaced by a boolean in SVQ or something like that, I
> > experimented with a tri-state too (UNMASKED, MASKED, MASKED_NOTIFIED)
> > and let vhost.c code to manage all the transitions. But I find clearer
> > the pointer use, since it's the more natural for the
> > vhost_virtqueue_mask, vhost_virtqueue_pending existing functions.
> >
> > This masking/unmasking is the part I dislike the most from this
> > series, so I'm very open to alternatives.
>
>
> See below. I think we don't even need to care about that.
>
>
> >
> >>>        Error *migration_blocker;
> >>>        const VhostOps *vhost_ops;
> >>>        void *opaque;
> >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> >>> index 4512e5b058..3e43399e9c 100644
> >>> --- a/hw/virtio/vhost-shadow-virtqueue.c
> >>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> >>> @@ -8,9 +8,12 @@
> >>>     */
> >>>
> >>>    #include "hw/virtio/vhost-shadow-virtqueue.h"
> >>> +#include "hw/virtio/vhost.h"
> >>> +
> >>> +#include "standard-headers/linux/vhost_types.h"
> >>>
> >>>    #include "qemu/error-report.h"
> >>> -#include "qemu/event_notifier.h"
> >>> +#include "qemu/main-loop.h"
> >>>
> >>>    /* Shadow virtqueue to relay notifications */
> >>>    typedef struct VhostShadowVirtqueue {
> >>> @@ -18,14 +21,121 @@ typedef struct VhostShadowVirtqueue {
> >>>        EventNotifier kick_notifier;
> >>>        /* Shadow call notifier, sent to vhost */
> >>>        EventNotifier call_notifier;
> >>> +
> >>> +    /*
> >>> +     * Borrowed virtqueue's guest to host notifier.
> >>> +     * To borrow it in this event notifier allows to register on the event
> >>> +     * loop and access the associated shadow virtqueue easily. If we use the
> >>> +     * VirtQueue, we don't have an easy way to retrieve it.
> >>
> >> So this is something that worries me. It looks like a layer violation
> >> that makes the codes harder to work correctly.
> >>
> > I don't follow you here.
> >
> > The vhost code already depends on virtqueue in the same sense:
> > virtio_queue_get_host_notifier is called on vhost_virtqueue_start. So
> > if this behavior ever changes it is unlikely for vhost to keep working
> > without changes. vhost_virtqueue has a kick/call int where I think it
> > should be stored actually, but they are never used as far as I see.
> >
> > Previous RFC did rely on vhost_dev_disable_notifiers. From its documentation:
> > /* Stop processing guest IO notifications in vhost.
> >   * Start processing them in qemu.
> >   ...
> > But it was easier for this mode to miss a notification, since they
> > create a new host_notifier in virtio_bus_set_host_notifier right away.
> > So I decided to use the file descriptor already sent to vhost in
> > regular operation mode, so guest-related resources change less.
> >
> > Having said that, maybe it's useful to assert that
> > vhost_dev_{enable,disable}_notifiers are never called on shadow
> > virtqueue mode. Also, it could be useful to retrieve it from
> > virtio_bus, not raw shadow virtqueue, so all get/set are performed
> > from it. Would that make more sense?
> >
> >> I wonder if it would be simpler to start from a vDPA dedicated shadow
> >> virtqueue implementation:
> >>
> >> 1) have the above fields embeded in vhost_vdpa structure
> >> 2) Work at the level of
> >> vhost_vdpa_set_vring_kick()/vhost_vdpa_set_vring_call()
> >>
> > This notifier is never sent to the device in shadow virtqueue mode.
> > It's for SVQ to react to guest's notifications, registering it on its
> > main event loop [1]. So if I perform these changes the way I
> > understand them, SVQ would still rely on this borrowed EventNotifier,
> > and it would send to the vDPA device the newly created kick_notifier
> > of VhostShadowVirtqueue.
>
>
> The point is that vhost code should be coupled loosely with virtio. If
> you try to "borrow" EventNotifier from virtio, you need to deal with a
> lot of synchrization. An exampleis the masking stuffs.
>

I still don't follow this, sorry.

The svq->host_notifier event notifier is not affected by the masking
issue; it is completely private to SVQ. This commit creates and uses
it, and nothing related to masking is touched until the next commit.

>
> >
> >> Then the layer is still isolated and you have a much simpler context to
> >> work that you don't need to care a lot of synchornization:
> >>
> >> 1) vq masking
> > This EventNotifier is not used for masking, it does not change from
> > the start of the shadow virtqueue operation through its end. Call fd
> > sent to vhost/vdpa device does not change either in shadow virtqueue
> > mode operation with masking/unmasking. I will try to document it
> > better.
> >
> > I think that we will need to handle synchronization with
> > masking/unmasking from the guest and dynamically enabling SVQ
> > operation mode, since they can happen at the same time as long as we
> > let the guest run. There may be better ways of synchronizing them of
> > course, but I don't see how moving to the vhost-vdpa backend helps
> > with this. Please expand if I've missed it.
> >
> > Or do you mean to forbid regular <-> SVQ operation mode transitions and delay it
> > to future patchsets?
>
>
> So my idea is to do all the shadow virtqueue in the vhost-vDPA codes and
> hide them from the upper layers like virtio. This means it works at
> vhost level which can see vhost_vring_file only. When enalbed, what it
> needs is just:
>
> 1) switch to use svq kickfd and relay ioeventfd to svq kickfd
> 2) switch to use svq callfd and relay svq callfd to irqfd
>
> It will still behave like a vhost-backend that the switching is done
> internally in vhost-vDPA which is totally transparent to the virtio
> codes of Qemu.
>
> E.g:
>
> 1) in the case of guest notifier masking, we don't need to do anything
> since virtio codes will replace another irqfd for us.

Assuming that we don't modify the vhost masking code, but send the shadow
virtqueue call descriptor to the vhost device:

If the guest virtio code masks the virtqueue and replaces the vhost-vdpa
device call fd (VhostShadowVirtqueue.call_notifier in the next commit,
or the descriptor in your previous second point, the svq callfd) with the
masked notifier, vhost_shadow_vq_handle_call will not be called
anymore, and no more used descriptors will be forwarded. They will be
stuck in the shadow virtqueue forever. The guest itself cannot recover
from this situation, since masking sets the irqfd, not the SVQ call fd.
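
Put another way: for SVQ mode to survive masking, the mask path must
leave the device call fd alone (it keeps pointing at SVQ) and only
change where SVQ forwards used buffers. A hedged sketch of that idea,
with hypothetical function and field names:

/*
 * Masking in SVQ mode must not reprogram the device call fd, which keeps
 * pointing at SVQ's call notifier. It only swaps the notifier that SVQ
 * signals when forwarding used buffers to the guest; NULL means unmasked.
 */
void vhost_shadow_vq_mask(VhostShadowVirtqueue *svq, EventNotifier *masked)
{
    qatomic_rcu_set(&svq->masked_notifier, masked);
}

void vhost_shadow_vq_unmask(VhostShadowVirtqueue *svq)
{
    qatomic_rcu_set(&svq->masked_notifier, NULL);
}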

> 2) easily to deal with vhost dev start and stop
>
> The advantages are obvious, simple and easy to implement.
>

I still don't see how performing this step from the backend code avoids
the synchronization problem, since they will still be done from different
threads anyway. Not sure what piece I'm missing.

I can see / have tested a few solutions, but I don't like any of them a lot:

* Forbid hot-swapping from/to shadow virtqueue mode, and set it from the
cmdline: we will have to deal with setting the SVQ mode dynamically
sooner or later if we want to use it for live migration.
* Forbid coming back to regular mode after switching to shadow
virtqueue mode: the heavy part of the synchronization comes from the svq
stopping code, since we need to serialize the setting of the device call
fd. This could be acceptable, but I'm not sure about the implications:
what happens if live migration fails and we need to step back? A mutex
is not needed in this scenario; atomics and RCU code are enough.
* Replace KVM_IRQFD instead and let SVQ poll the old one and the masked
notifier: I haven't thought a lot about this one; I think it's better
not to touch the guest notifiers.
* Also monitor the masked notifier from SVQ: I think this could be
promising, but SVQ needs to be notified about masking/unmasking
anyway, and there is code that depends on checking the masked notifier
for the pending notification.

>
> >
> >> 2) vhost dev start and stop
> >>
> >> ?
> >>
> >>
> >>> +     *
> >>> +     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
> >>> +     */
> >>> +    EventNotifier host_notifier;
> >>> +
> >>> +    /* Virtio queue shadowing */
> >>> +    VirtQueue *vq;
> >>>    } VhostShadowVirtqueue;
> >>>
> >>> +/* Forward guest notifications */
> >>> +static void vhost_handle_guest_kick(EventNotifier *n)
> >>> +{
> >>> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> >>> +                                             host_notifier);
> >>> +
> >>> +    if (unlikely(!event_notifier_test_and_clear(n))) {
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    event_notifier_set(&svq->kick_notifier);
> >>> +}
> >>> +
> >>> +/*
> >>> + * Restore the vhost guest to host notifier, i.e., disables svq effect.
> >>> + */
> >>> +static int vhost_shadow_vq_restore_vdev_host_notifier(struct vhost_dev *dev,
> >>> +                                                     unsigned vhost_index,
> >>> +                                                     VhostShadowVirtqueue *svq)
> >>> +{
> >>> +    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
> >>> +    struct vhost_vring_file file = {
> >>> +        .index = vhost_index,
> >>> +        .fd = event_notifier_get_fd(vq_host_notifier),
> >>> +    };
> >>> +    int r;
> >>> +
> >>> +    /* Restore vhost kick */
> >>> +    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
> >>> +    return r ? -errno : 0;
> >>> +}
> >>> +
> >>> +/*
> >>> + * Start shadow virtqueue operation.
> >>> + * @dev vhost device
> >>> + * @hidx vhost virtqueue index
> >>> + * @svq Shadow Virtqueue
> >>> + */
> >>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
> >>> +                           unsigned idx,
> >>> +                           VhostShadowVirtqueue *svq)
> >>
> >> It looks to me this assumes the vhost_dev is started before
> >> vhost_shadow_vq_start()?
> >>
> > Right.
>
>
> This might not true. Guest may enable and disable virtio drivers after
> the shadow virtqueue is started. You need to deal with that.
>

Right, I will test this scenario.

> Thanks
>
Jason Wang March 18, 2021, 3:10 a.m. UTC | #5
On 2021/3/18 12:47 AM, Eugenio Perez Martin wrote:
> On Wed, Mar 17, 2021 at 3:05 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2021/3/16 下午6:31, Eugenio Perez Martin 写道:
>>> On Tue, Mar 16, 2021 at 8:18 AM Jason Wang <jasowang@redhat.com> wrote:
>>>> 在 2021/3/16 上午3:48, Eugenio Pérez 写道:
>>>>> Shadow virtqueue notifications forwarding is disabled when vhost_dev
>>>>> stops, so code flow follows usual cleanup.
>>>>>
>>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
>>>>> ---
>>>>>     hw/virtio/vhost-shadow-virtqueue.h |   7 ++
>>>>>     include/hw/virtio/vhost.h          |   4 +
>>>>>     hw/virtio/vhost-shadow-virtqueue.c | 113 ++++++++++++++++++++++-
>>>>>     hw/virtio/vhost.c                  | 143 ++++++++++++++++++++++++++++-
>>>>>     4 files changed, 265 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
>>>>> index 6cc18d6acb..c891c6510d 100644
>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
>>>>> @@ -17,6 +17,13 @@
>>>>>
>>>>>     typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
>>>>>
>>>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
>>>>> +                           unsigned idx,
>>>>> +                           VhostShadowVirtqueue *svq);
>>>>> +void vhost_shadow_vq_stop(struct vhost_dev *dev,
>>>>> +                          unsigned idx,
>>>>> +                          VhostShadowVirtqueue *svq);
>>>>> +
>>>>>     VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx);
>>>>>
>>>>>     void vhost_shadow_vq_free(VhostShadowVirtqueue *vq);
>>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>>>>> index ac963bf23d..7ffdf9aea0 100644
>>>>> --- a/include/hw/virtio/vhost.h
>>>>> +++ b/include/hw/virtio/vhost.h
>>>>> @@ -55,6 +55,8 @@ struct vhost_iommu {
>>>>>         QLIST_ENTRY(vhost_iommu) iommu_next;
>>>>>     };
>>>>>
>>>>> +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
>>>>> +
>>>>>     typedef struct VhostDevConfigOps {
>>>>>         /* Vhost device config space changed callback
>>>>>          */
>>>>> @@ -83,7 +85,9 @@ struct vhost_dev {
>>>>>         uint64_t backend_cap;
>>>>>         bool started;
>>>>>         bool log_enabled;
>>>>> +    bool shadow_vqs_enabled;
>>>>>         uint64_t log_size;
>>>>> +    VhostShadowVirtqueue **shadow_vqs;
>>>> Any reason that you don't embed the shadow virtqueue into
>>>> vhost_virtqueue structure?
>>>>
>>> Not really, it could be relatively big and I would prefer SVQ
>>> members/methods to remain hidden from any other part that includes
>>> vhost.h. But it could be changed, for sure.
>>>
>>>> (Note that there's a masked_notifier in struct vhost_virtqueue).
>>>>
>>> They are used differently: in SVQ the masked notifier is a pointer,
>>> and if it's NULL the SVQ code knows that device is not masked. The
>>> vhost_virtqueue is the real owner.
>>
>> Yes, but it's an example for embedding auxciliary data structures in the
>> vhost_virtqueue.
>>
>>
>>> It could be replaced by a boolean in SVQ or something like that, I
>>> experimented with a tri-state too (UNMASKED, MASKED, MASKED_NOTIFIED)
>>> and let vhost.c code to manage all the transitions. But I find clearer
>>> the pointer use, since it's the more natural for the
>>> vhost_virtqueue_mask, vhost_virtqueue_pending existing functions.
>>>
>>> This masking/unmasking is the part I dislike the most from this
>>> series, so I'm very open to alternatives.
>>
>> See below. I think we don't even need to care about that.
>>
>>
>>>>>         Error *migration_blocker;
>>>>>         const VhostOps *vhost_ops;
>>>>>         void *opaque;
>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
>>>>> index 4512e5b058..3e43399e9c 100644
>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
>>>>> @@ -8,9 +8,12 @@
>>>>>      */
>>>>>
>>>>>     #include "hw/virtio/vhost-shadow-virtqueue.h"
>>>>> +#include "hw/virtio/vhost.h"
>>>>> +
>>>>> +#include "standard-headers/linux/vhost_types.h"
>>>>>
>>>>>     #include "qemu/error-report.h"
>>>>> -#include "qemu/event_notifier.h"
>>>>> +#include "qemu/main-loop.h"
>>>>>
>>>>>     /* Shadow virtqueue to relay notifications */
>>>>>     typedef struct VhostShadowVirtqueue {
>>>>> @@ -18,14 +21,121 @@ typedef struct VhostShadowVirtqueue {
>>>>>         EventNotifier kick_notifier;
>>>>>         /* Shadow call notifier, sent to vhost */
>>>>>         EventNotifier call_notifier;
>>>>> +
>>>>> +    /*
>>>>> +     * Borrowed virtqueue's guest to host notifier.
>>>>> +     * To borrow it in this event notifier allows to register on the event
>>>>> +     * loop and access the associated shadow virtqueue easily. If we use the
>>>>> +     * VirtQueue, we don't have an easy way to retrieve it.
>>>> So this is something that worries me. It looks like a layer violation
>>>> that makes the codes harder to work correctly.
>>>>
>>> I don't follow you here.
>>>
>>> The vhost code already depends on virtqueue in the same sense:
>>> virtio_queue_get_host_notifier is called on vhost_virtqueue_start. So
>>> if this behavior ever changes it is unlikely for vhost to keep working
>>> without changes. vhost_virtqueue has a kick/call int where I think it
>>> should be stored actually, but they are never used as far as I see.
>>>
>>> Previous RFC did rely on vhost_dev_disable_notifiers. From its documentation:
>>> /* Stop processing guest IO notifications in vhost.
>>>    * Start processing them in qemu.
>>>    ...
>>> But it was easier for this mode to miss a notification, since they
>>> create a new host_notifier in virtio_bus_set_host_notifier right away.
>>> So I decided to use the file descriptor already sent to vhost in
>>> regular operation mode, so guest-related resources change less.
>>>
>>> Having said that, maybe it's useful to assert that
>>> vhost_dev_{enable,disable}_notifiers are never called on shadow
>>> virtqueue mode. Also, it could be useful to retrieve it from
>>> virtio_bus, not raw shadow virtqueue, so all get/set are performed
>>> from it. Would that make more sense?
>>>
>>>> I wonder if it would be simpler to start from a vDPA dedicated shadow
>>>> virtqueue implementation:
>>>>
>>>> 1) have the above fields embeded in vhost_vdpa structure
>>>> 2) Work at the level of
>>>> vhost_vdpa_set_vring_kick()/vhost_vdpa_set_vring_call()
>>>>
>>> This notifier is never sent to the device in shadow virtqueue mode.
>>> It's for SVQ to react to guest's notifications, registering it on its
>>> main event loop [1]. So if I perform these changes the way I
>>> understand them, SVQ would still rely on this borrowed EventNotifier,
>>> and it would send to the vDPA device the newly created kick_notifier
>>> of VhostShadowVirtqueue.
>>
>> The point is that vhost code should be coupled loosely with virtio. If
>> you try to "borrow" EventNotifier from virtio, you need to deal with a
>> lot of synchrization. An exampleis the masking stuffs.
>>
> I still don't follow this, sorry.
>
> The svq->host_notifier event notifier is not affected by the masking
> issue, it is completely private to SVQ. This commit creates and uses
> it, and nothing related to masking is touched until the next commit.
>
>>>> Then the layer is still isolated and you have a much simpler context to
>>>> work that you don't need to care a lot of synchornization:
>>>>
>>>> 1) vq masking
>>> This EventNotifier is not used for masking, it does not change from
>>> the start of the shadow virtqueue operation through its end. Call fd
>>> sent to vhost/vdpa device does not change either in shadow virtqueue
>>> mode operation with masking/unmasking. I will try to document it
>>> better.
>>>
>>> I think that we will need to handle synchronization with
>>> masking/unmasking from the guest and dynamically enabling SVQ
>>> operation mode, since they can happen at the same time as long as we
>>> let the guest run. There may be better ways of synchronizing them of
>>> course, but I don't see how moving to the vhost-vdpa backend helps
>>> with this. Please expand if I've missed it.
>>>
>>> Or do you mean to forbid regular <-> SVQ operation mode transitions and delay it
>>> to future patchsets?
>>
>> So my idea is to do all the shadow virtqueue in the vhost-vDPA codes and
>> hide them from the upper layers like virtio. This means it works at
>> vhost level which can see vhost_vring_file only. When enalbed, what it
>> needs is just:
>>
>> 1) switch to use svq kickfd and relay ioeventfd to svq kickfd
>> 2) switch to use svq callfd and relay svq callfd to irqfd
>>
>> It will still behave like a vhost-backend that the switching is done
>> internally in vhost-vDPA which is totally transparent to the virtio
>> codes of Qemu.
>>
>> E.g:
>>
>> 1) in the case of guest notifier masking, we don't need to do anything
>> since virtio codes will replace another irqfd for us.
> Assuming that we don't modify vhost masking code, but send shadow
> virtqueue call descriptor to the vhost device:
>
> If guest virtio code mask the virtqueue and replaces the vhost-vdpa
> device call fd (VhostShadowVirtqueue.call_notifier in the next commit,
> or the descriptor in your previous second point, svq callfd) with the
> masked notifier, vhost_shadow_vq_handle_call will not be called
> anymore, and no more used descriptors will be forwarded. They will be
> stuck if the shadow virtqueue forever. Guest itself cannot recover
> from this situation, since a masking will set irqfd, not SVQ call fd.


Just to make sure we're on the same page. During vq masking, the virtio
code actually uses the masked_notifier as the callfd in vhost_virtqueue_mask():

    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
    } else {
        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
    }

    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);

So consider the shadow virtqueue is done at vhost-vDPA. We just need to
make sure to

1) update the callfd which is passed by the virtio layer via set_vring_call()
2) always write to that callfd during vhost_shadow_vq_handle_call()

Then

3) When the shadow vq is enabled, we just set the callfd of the shadow virtqueue
to vDPA via VHOST_SET_VRING_CALL, and poll the svq callfd
4) When the shadow vq is disabled, we just set the callfd that is passed by
virtio via VHOST_SET_VRING_CALL, and stop polling the svq callfd

So you can see that in steps 2 and 4, we don't need to know whether or not the
vq is masked, since we follow the vhost protocol "VhostOps" and do
everything transparently in the vhost-(vDPA) layer.
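
A rough sketch of how that relay could look at the vhost-vDPA level (an
illustration of the four steps above, not code from this series;
shadow_vqs_enabled, shadow_vqs and guest_call_notifier are assumed fields):

    static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
    {
        struct vhost_vdpa *v = dev->opaque;

        if (v->shadow_vqs_enabled) {
            VhostShadowVirtqueue *svq = v->shadow_vqs[file->index];

            /* Step 1: remember whatever callfd virtio handed us (guest
             * notifier or masked notifier, it does not matter which). */
            event_notifier_init_fd(&svq->guest_call_notifier, file->fd);
            /* Step 3: the device always kicks the SVQ call notifier. */
            file->fd = event_notifier_get_fd(&svq->call_notifier);
        }
        /* Step 4: with SVQ disabled, this passes the virtio callfd through
         * unchanged. */
        return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
    }

Step 2 is then vhost_shadow_vq_handle_call() writing to
svq->guest_call_notifier on every device interrupt, masked or not.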


>
>> 2) easily to deal with vhost dev start and stop
>>
>> The advantages are obvious, simple and easy to implement.
>>
> I still don't see how performing this step from backend code avoids
> the synchronization problem, since they will be done from different
> threads anyway. Not sure what piece I'm missing.


See my reply in another thread. If you enable the shadow virtqueue via an
OOB monitor command, that's a real issue.

But I don't think we need to do that since

1) SVQ should be transparent to management
2) it would create an unnecessary synchronization issue

We can enable the shadow virtqueue through the cli, probably with a new
vhost-vdpa parameter. Then we don't need to care about threads. And in
the final version with full live migration support, the shadow virtqueue
should be enabled automatically, e.g. for a device without
VHOST_F_LOG_ALL, or we can have a dedicated vDPA capability via
VHOST_GET_BACKEND_FEATURES.
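
As a sketch of that automatic decision (the helper name and the exact policy
are assumptions, not part of this series):

    #include "standard-headers/linux/vhost_types.h"

    /* Shadow the virtqueues only when the device cannot log dirty memory
     * by itself. */
    static bool vhost_vdpa_needs_shadow_vqs(const struct vhost_dev *dev)
    {
        return !(dev->features & (1ULL << VHOST_F_LOG_ALL));
    }

A dedicated backend feature bit returned by VHOST_GET_BACKEND_FEATURES could
replace the VHOST_F_LOG_ALL check.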

Thanks


>
> I can see / tested a few solutions but I don't like them a lot:
>
> * Forbid hot-swapping from/to shadow virtqueue mode, and set it from
> cmdline: We will have to deal with setting the SVQ mode dynamically
> sooner or later if we want to use it for live migration.
> * Forbid coming back to regular mode after switching to shadow
> virtqueue mode: The heavy part of the synchronization comes from svq
> stopping code, since we need to serialize the setting of device call
> fd. This could be acceptable, but I'm not sure about the implications:
> What happens if live migration fails and we need to step back? A mutex
> is not needed in this scenario, it's ok with atomics and RCU code.
>
> * Replace KVM_IRQFD instead and let SVQ poll the old one and masked
> notifier: I haven't thought a lot of this one, I think it's better to
> not touch guest notifiers.
> * Monitor also masked notifier from SVQ: I think this could be
> promising, but SVQ needs to be notified about masking/unmasking
> anyway, and there is code that depends on checking the masked notifier
> for the pending notification.
>
>>>> 2) vhost dev start and stop
>>>>
>>>> ?
>>>>
>>>>
>>>>> +     *
>>>>> +     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
>>>>> +     */
>>>>> +    EventNotifier host_notifier;
>>>>> +
>>>>> +    /* Virtio queue shadowing */
>>>>> +    VirtQueue *vq;
>>>>>     } VhostShadowVirtqueue;
>>>>>
>>>>> +/* Forward guest notifications */
>>>>> +static void vhost_handle_guest_kick(EventNotifier *n)
>>>>> +{
>>>>> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
>>>>> +                                             host_notifier);
>>>>> +
>>>>> +    if (unlikely(!event_notifier_test_and_clear(n))) {
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    event_notifier_set(&svq->kick_notifier);
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * Restore the vhost guest to host notifier, i.e., disables svq effect.
>>>>> + */
>>>>> +static int vhost_shadow_vq_restore_vdev_host_notifier(struct vhost_dev *dev,
>>>>> +                                                     unsigned vhost_index,
>>>>> +                                                     VhostShadowVirtqueue *svq)
>>>>> +{
>>>>> +    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
>>>>> +    struct vhost_vring_file file = {
>>>>> +        .index = vhost_index,
>>>>> +        .fd = event_notifier_get_fd(vq_host_notifier),
>>>>> +    };
>>>>> +    int r;
>>>>> +
>>>>> +    /* Restore vhost kick */
>>>>> +    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
>>>>> +    return r ? -errno : 0;
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * Start shadow virtqueue operation.
>>>>> + * @dev vhost device
>>>>> + * @hidx vhost virtqueue index
>>>>> + * @svq Shadow Virtqueue
>>>>> + */
>>>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
>>>>> +                           unsigned idx,
>>>>> +                           VhostShadowVirtqueue *svq)
>>>> It looks to me this assumes the vhost_dev is started before
>>>> vhost_shadow_vq_start()?
>>>>
>>> Right.
>>
>> This might not true. Guest may enable and disable virtio drivers after
>> the shadow virtqueue is started. You need to deal with that.
>>
> Right, I will test this scenario.
>
>> Thanks
>>
Eugenio Perez Martin March 18, 2021, 9:18 a.m. UTC | #6
On Thu, Mar 18, 2021 at 4:11 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/3/18 上午12:47, Eugenio Perez Martin 写道:
> > On Wed, Mar 17, 2021 at 3:05 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> 在 2021/3/16 下午6:31, Eugenio Perez Martin 写道:
> >>> On Tue, Mar 16, 2021 at 8:18 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>> 在 2021/3/16 上午3:48, Eugenio Pérez 写道:
> >>>>> Shadow virtqueue notifications forwarding is disabled when vhost_dev
> >>>>> stops, so code flow follows usual cleanup.
> >>>>>
> >>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> >>>>> ---
> >>>>>     hw/virtio/vhost-shadow-virtqueue.h |   7 ++
> >>>>>     include/hw/virtio/vhost.h          |   4 +
> >>>>>     hw/virtio/vhost-shadow-virtqueue.c | 113 ++++++++++++++++++++++-
> >>>>>     hw/virtio/vhost.c                  | 143 ++++++++++++++++++++++++++++-
> >>>>>     4 files changed, 265 insertions(+), 2 deletions(-)
> >>>>>
> >>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> >>>>> index 6cc18d6acb..c891c6510d 100644
> >>>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
> >>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> >>>>> @@ -17,6 +17,13 @@
> >>>>>
> >>>>>     typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> >>>>>
> >>>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
> >>>>> +                           unsigned idx,
> >>>>> +                           VhostShadowVirtqueue *svq);
> >>>>> +void vhost_shadow_vq_stop(struct vhost_dev *dev,
> >>>>> +                          unsigned idx,
> >>>>> +                          VhostShadowVirtqueue *svq);
> >>>>> +
> >>>>>     VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx);
> >>>>>
> >>>>>     void vhost_shadow_vq_free(VhostShadowVirtqueue *vq);
> >>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> >>>>> index ac963bf23d..7ffdf9aea0 100644
> >>>>> --- a/include/hw/virtio/vhost.h
> >>>>> +++ b/include/hw/virtio/vhost.h
> >>>>> @@ -55,6 +55,8 @@ struct vhost_iommu {
> >>>>>         QLIST_ENTRY(vhost_iommu) iommu_next;
> >>>>>     };
> >>>>>
> >>>>> +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> >>>>> +
> >>>>>     typedef struct VhostDevConfigOps {
> >>>>>         /* Vhost device config space changed callback
> >>>>>          */
> >>>>> @@ -83,7 +85,9 @@ struct vhost_dev {
> >>>>>         uint64_t backend_cap;
> >>>>>         bool started;
> >>>>>         bool log_enabled;
> >>>>> +    bool shadow_vqs_enabled;
> >>>>>         uint64_t log_size;
> >>>>> +    VhostShadowVirtqueue **shadow_vqs;
> >>>> Any reason that you don't embed the shadow virtqueue into
> >>>> vhost_virtqueue structure?
> >>>>
> >>> Not really, it could be relatively big and I would prefer SVQ
> >>> members/methods to remain hidden from any other part that includes
> >>> vhost.h. But it could be changed, for sure.
> >>>
> >>>> (Note that there's a masked_notifier in struct vhost_virtqueue).
> >>>>
> >>> They are used differently: in SVQ the masked notifier is a pointer,
> >>> and if it's NULL the SVQ code knows that device is not masked. The
> >>> vhost_virtqueue is the real owner.
> >>
> >> Yes, but it's an example for embedding auxciliary data structures in the
> >> vhost_virtqueue.
> >>
> >>
> >>> It could be replaced by a boolean in SVQ or something like that, I
> >>> experimented with a tri-state too (UNMASKED, MASKED, MASKED_NOTIFIED)
> >>> and let vhost.c code to manage all the transitions. But I find clearer
> >>> the pointer use, since it's the more natural for the
> >>> vhost_virtqueue_mask, vhost_virtqueue_pending existing functions.
> >>>
> >>> This masking/unmasking is the part I dislike the most from this
> >>> series, so I'm very open to alternatives.
> >>
> >> See below. I think we don't even need to care about that.
> >>
> >>
> >>>>>         Error *migration_blocker;
> >>>>>         const VhostOps *vhost_ops;
> >>>>>         void *opaque;
> >>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> >>>>> index 4512e5b058..3e43399e9c 100644
> >>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
> >>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> >>>>> @@ -8,9 +8,12 @@
> >>>>>      */
> >>>>>
> >>>>>     #include "hw/virtio/vhost-shadow-virtqueue.h"
> >>>>> +#include "hw/virtio/vhost.h"
> >>>>> +
> >>>>> +#include "standard-headers/linux/vhost_types.h"
> >>>>>
> >>>>>     #include "qemu/error-report.h"
> >>>>> -#include "qemu/event_notifier.h"
> >>>>> +#include "qemu/main-loop.h"
> >>>>>
> >>>>>     /* Shadow virtqueue to relay notifications */
> >>>>>     typedef struct VhostShadowVirtqueue {
> >>>>> @@ -18,14 +21,121 @@ typedef struct VhostShadowVirtqueue {
> >>>>>         EventNotifier kick_notifier;
> >>>>>         /* Shadow call notifier, sent to vhost */
> >>>>>         EventNotifier call_notifier;
> >>>>> +
> >>>>> +    /*
> >>>>> +     * Borrowed virtqueue's guest to host notifier.
> >>>>> +     * To borrow it in this event notifier allows to register on the event
> >>>>> +     * loop and access the associated shadow virtqueue easily. If we use the
> >>>>> +     * VirtQueue, we don't have an easy way to retrieve it.
> >>>> So this is something that worries me. It looks like a layer violation
> >>>> that makes the codes harder to work correctly.
> >>>>
> >>> I don't follow you here.
> >>>
> >>> The vhost code already depends on virtqueue in the same sense:
> >>> virtio_queue_get_host_notifier is called on vhost_virtqueue_start. So
> >>> if this behavior ever changes it is unlikely for vhost to keep working
> >>> without changes. vhost_virtqueue has a kick/call int where I think it
> >>> should be stored actually, but they are never used as far as I see.
> >>>
> >>> Previous RFC did rely on vhost_dev_disable_notifiers. From its documentation:
> >>> /* Stop processing guest IO notifications in vhost.
> >>>    * Start processing them in qemu.
> >>>    ...
> >>> But it was easier for this mode to miss a notification, since they
> >>> create a new host_notifier in virtio_bus_set_host_notifier right away.
> >>> So I decided to use the file descriptor already sent to vhost in
> >>> regular operation mode, so guest-related resources change less.
> >>>
> >>> Having said that, maybe it's useful to assert that
> >>> vhost_dev_{enable,disable}_notifiers are never called on shadow
> >>> virtqueue mode. Also, it could be useful to retrieve it from
> >>> virtio_bus, not raw shadow virtqueue, so all get/set are performed
> >>> from it. Would that make more sense?
> >>>
> >>>> I wonder if it would be simpler to start from a vDPA dedicated shadow
> >>>> virtqueue implementation:
> >>>>
> >>>> 1) have the above fields embeded in vhost_vdpa structure
> >>>> 2) Work at the level of
> >>>> vhost_vdpa_set_vring_kick()/vhost_vdpa_set_vring_call()
> >>>>
> >>> This notifier is never sent to the device in shadow virtqueue mode.
> >>> It's for SVQ to react to guest's notifications, registering it on its
> >>> main event loop [1]. So if I perform these changes the way I
> >>> understand them, SVQ would still rely on this borrowed EventNotifier,
> >>> and it would send to the vDPA device the newly created kick_notifier
> >>> of VhostShadowVirtqueue.
> >>
> >> The point is that vhost code should be coupled loosely with virtio. If
> >> you try to "borrow" EventNotifier from virtio, you need to deal with a
> >> lot of synchrization. An exampleis the masking stuffs.
> >>
> > I still don't follow this, sorry.
> >
> > The svq->host_notifier event notifier is not affected by the masking
> > issue, it is completely private to SVQ. This commit creates and uses
> > it, and nothing related to masking is touched until the next commit.
> >
> >>>> Then the layer is still isolated and you have a much simpler context to
> >>>> work that you don't need to care a lot of synchornization:
> >>>>
> >>>> 1) vq masking
> >>> This EventNotifier is not used for masking, it does not change from
> >>> the start of the shadow virtqueue operation through its end. Call fd
> >>> sent to vhost/vdpa device does not change either in shadow virtqueue
> >>> mode operation with masking/unmasking. I will try to document it
> >>> better.
> >>>
> >>> I think that we will need to handle synchronization with
> >>> masking/unmasking from the guest and dynamically enabling SVQ
> >>> operation mode, since they can happen at the same time as long as we
> >>> let the guest run. There may be better ways of synchronizing them of
> >>> course, but I don't see how moving to the vhost-vdpa backend helps
> >>> with this. Please expand if I've missed it.
> >>>
> >>> Or do you mean to forbid regular <-> SVQ operation mode transitions and delay it
> >>> to future patchsets?
> >>
> >> So my idea is to do all the shadow virtqueue in the vhost-vDPA codes and
> >> hide them from the upper layers like virtio. This means it works at
> >> vhost level which can see vhost_vring_file only. When enalbed, what it
> >> needs is just:
> >>
> >> 1) switch to use svq kickfd and relay ioeventfd to svq kickfd
> >> 2) switch to use svq callfd and relay svq callfd to irqfd
> >>
> >> It will still behave like a vhost-backend that the switching is done
> >> internally in vhost-vDPA which is totally transparent to the virtio
> >> codes of Qemu.
> >>
> >> E.g:
> >>
> >> 1) in the case of guest notifier masking, we don't need to do anything
> >> since virtio codes will replace another irqfd for us.
> > Assuming that we don't modify vhost masking code, but send shadow
> > virtqueue call descriptor to the vhost device:
> >
> > If guest virtio code mask the virtqueue and replaces the vhost-vdpa
> > device call fd (VhostShadowVirtqueue.call_notifier in the next commit,
> > or the descriptor in your previous second point, svq callfd) with the
> > masked notifier, vhost_shadow_vq_handle_call will not be called
> > anymore, and no more used descriptors will be forwarded. They will be
> > stuck if the shadow virtqueue forever. Guest itself cannot recover
> > from this situation, since a masking will set irqfd, not SVQ call fd.
>
>
> Just to make sure we're in the same page. During vq masking, the virtio
> codes actually use the masked_notifier as callfd in vhost_virtqueue_mask():
>
>      if (mask) {
>          assert(vdev->use_guest_notifier_mask);
>          file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
>      } else {
>      file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
>      }
>
>      file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
>      r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
>
> So consider the shadow virtqueue in done at vhost-vDPA. We just need to
> make sure
>
> 1) update the callfd which passed by virtio layer via set_vring_kick()
> 2) always write to the callfd during vhost_shadow_vq_handle_call()
>
> Then
>
> 3) When shadow vq is enabled, we just set the callfd of shadow virtqueue
> to vDPA via VHOST_SET_VRING_CALL, and poll the svq callfd
> 4) When shadow vq is disabled, we just set the callfd that is passed by
> virtio via VHOST_SET_VRING_CALL, stop poll the svq callfd
>
> So you can see in step 2 and 4, we don't need to know whether or not the
> vq is masked since we follow the vhost protocol "VhostOps" and do
> everyhing transparently in the vhost-(vDPA) layer.
>

All of this assumes that we can enable/disable SVQ dynamically while
the device is running. If that's not the case, there is no need for the
mutex in either the vhost.c code or vdpa_backend.

As I see it, the issue is that steps (2) and (4) happen in different
threads: (2) is in the vCPU vmexit path, and (4) is in the main event loop.
Consider unmasking and disabling SVQ at the same time with no mutex:

vCPU vmexit thread                     aio thread
(unmask)                               (stops SVQ)
|                                      |
|                                      // Last callfd set was masked_notifier
|                                      vdpa_backend.callfd = \
|                                              atomic_read(masked_notifier).
|                                      |
vhost_set_vring_call(vq.guest_notifier)|
-> vdpa_backend.callfd = \             |
           vq.guest_notifier           |
|                                      |
|                                      ioctl(vdpa, VHOST_SET_VRING_CALL,
|                                            vdpa_backend.callfd)
|
// guest expects more interrupts, but the
// device callfd was just set to the masked notifier

And vhost_set_vring_call could even run entirely while the ioctl is
being executed.

So that is the reason for the mutex: the update of vdpa_backend.callfd and
the VHOST_SET_VRING_CALL ioctl must be serialized. I'm ok with moving this
to the vdpa backend, but it's the same code, just in vdpa_backend.c instead
of vhost.c, so it becomes less generic in my opinion.
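
A minimal sketch of that serialization, assuming a QemuMutex protects both the
cached callfd and the ioctl (the callfd_mutex and callfd fields are
illustrative, not existing code):

    static int vdpa_backend_set_vring_call(struct vhost_dev *dev,
                                           struct vhost_vring_file *file)
    {
        struct vhost_vdpa *v = dev->opaque;
        int r;

        /* Both the vCPU mask/unmask path and the main-loop SVQ stop path go
         * through here, so the cached fd and the fd the device ends up with
         * can never diverge. */
        qemu_mutex_lock(&v->callfd_mutex);
        v->callfd[file->index] = file->fd;
        r = vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
        qemu_mutex_unlock(&v->callfd_mutex);
        return r;
    }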

>
> >
> >> 2) easily to deal with vhost dev start and stop
> >>
> >> The advantages are obvious, simple and easy to implement.
> >>
> > I still don't see how performing this step from backend code avoids
> > the synchronization problem, since they will be done from different
> > threads anyway. Not sure what piece I'm missing.
>
>
> See my reply in another thread. If you enable the shadow virtqueue via a
> OOB monitor that's a real issue.
>
> But I don't think we need to do that since
>
> 1) SVQ should be transparnet to management
> 2) unncessary synchronization issue
>
> We can enable the shadow virtqueue through cli, new parameter with
> vhost-vdpa probably. Then we don't need to care about threads. And in
> the final version with full live migration support, the shadow virtqueue
> should be enabled automatically. E.g for the device without
> VHOST_F_LOG_ALL or we can have a dedicated capability of vDPA via
> VHOST_GET_BACKEND_FEATURES.
>

It should be enabled automatically in those conditions, but it also
needs to be dynamic, and only be active during migration. Otherwise,
the guest should use regular vdpa operation. The problem with masking is
the same whether we enable it with QMP or because of a live migration event.

So we will have the previous synchronization problem sooner or later.
If we omit the rollback to regular vdpa operation (in other words,
disabling SVQ), the code can be simplified, but I'm not sure that is
desirable.

Thanks!

> Thanks
>
>
> >
> > I can see / tested a few solutions but I don't like them a lot:
> >
> > * Forbid hot-swapping from/to shadow virtqueue mode, and set it from
> > cmdline: We will have to deal with setting the SVQ mode dynamically
> > sooner or later if we want to use it for live migration.
> > * Forbid coming back to regular mode after switching to shadow
> > virtqueue mode: The heavy part of the synchronization comes from svq
> > stopping code, since we need to serialize the setting of device call
> > fd. This could be acceptable, but I'm not sure about the implications:
> > What happens if live migration fails and we need to step back? A mutex
> > is not needed in this scenario, it's ok with atomics and RCU code.
> >
> > * Replace KVM_IRQFD instead and let SVQ poll the old one and masked
> > notifier: I haven't thought a lot of this one, I think it's better to
> > not touch guest notifiers.
> > * Monitor also masked notifier from SVQ: I think this could be
> > promising, but SVQ needs to be notified about masking/unmasking
> > anyway, and there is code that depends on checking the masked notifier
> > for the pending notification.
> >
> >>>> 2) vhost dev start and stop
> >>>>
> >>>> ?
> >>>>
> >>>>
> >>>>> +     *
> >>>>> +     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
> >>>>> +     */
> >>>>> +    EventNotifier host_notifier;
> >>>>> +
> >>>>> +    /* Virtio queue shadowing */
> >>>>> +    VirtQueue *vq;
> >>>>>     } VhostShadowVirtqueue;
> >>>>>
> >>>>> +/* Forward guest notifications */
> >>>>> +static void vhost_handle_guest_kick(EventNotifier *n)
> >>>>> +{
> >>>>> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> >>>>> +                                             host_notifier);
> >>>>> +
> >>>>> +    if (unlikely(!event_notifier_test_and_clear(n))) {
> >>>>> +        return;
> >>>>> +    }
> >>>>> +
> >>>>> +    event_notifier_set(&svq->kick_notifier);
> >>>>> +}
> >>>>> +
> >>>>> +/*
> >>>>> + * Restore the vhost guest to host notifier, i.e., disables svq effect.
> >>>>> + */
> >>>>> +static int vhost_shadow_vq_restore_vdev_host_notifier(struct vhost_dev *dev,
> >>>>> +                                                     unsigned vhost_index,
> >>>>> +                                                     VhostShadowVirtqueue *svq)
> >>>>> +{
> >>>>> +    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
> >>>>> +    struct vhost_vring_file file = {
> >>>>> +        .index = vhost_index,
> >>>>> +        .fd = event_notifier_get_fd(vq_host_notifier),
> >>>>> +    };
> >>>>> +    int r;
> >>>>> +
> >>>>> +    /* Restore vhost kick */
> >>>>> +    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
> >>>>> +    return r ? -errno : 0;
> >>>>> +}
> >>>>> +
> >>>>> +/*
> >>>>> + * Start shadow virtqueue operation.
> >>>>> + * @dev vhost device
> >>>>> + * @hidx vhost virtqueue index
> >>>>> + * @svq Shadow Virtqueue
> >>>>> + */
> >>>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
> >>>>> +                           unsigned idx,
> >>>>> +                           VhostShadowVirtqueue *svq)
> >>>> It looks to me this assumes the vhost_dev is started before
> >>>> vhost_shadow_vq_start()?
> >>>>
> >>> Right.
> >>
> >> This might not true. Guest may enable and disable virtio drivers after
> >> the shadow virtqueue is started. You need to deal with that.
> >>
> > Right, I will test this scenario.
> >
> >> Thanks
> >>
>
Jason Wang March 18, 2021, 9:29 a.m. UTC | #7
On 2021/3/18 5:18 PM, Eugenio Perez Martin wrote:
> On Thu, Mar 18, 2021 at 4:11 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2021/3/18 上午12:47, Eugenio Perez Martin 写道:
>>> On Wed, Mar 17, 2021 at 3:05 AM Jason Wang <jasowang@redhat.com> wrote:
>>>> 在 2021/3/16 下午6:31, Eugenio Perez Martin 写道:
>>>>> On Tue, Mar 16, 2021 at 8:18 AM Jason Wang <jasowang@redhat.com> wrote:
>>>>>> 在 2021/3/16 上午3:48, Eugenio Pérez 写道:
>>>>>>> Shadow virtqueue notifications forwarding is disabled when vhost_dev
>>>>>>> stops, so code flow follows usual cleanup.
>>>>>>>
>>>>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
>>>>>>> ---
>>>>>>>      hw/virtio/vhost-shadow-virtqueue.h |   7 ++
>>>>>>>      include/hw/virtio/vhost.h          |   4 +
>>>>>>>      hw/virtio/vhost-shadow-virtqueue.c | 113 ++++++++++++++++++++++-
>>>>>>>      hw/virtio/vhost.c                  | 143 ++++++++++++++++++++++++++++-
>>>>>>>      4 files changed, 265 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
>>>>>>> index 6cc18d6acb..c891c6510d 100644
>>>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
>>>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
>>>>>>> @@ -17,6 +17,13 @@
>>>>>>>
>>>>>>>      typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
>>>>>>>
>>>>>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
>>>>>>> +                           unsigned idx,
>>>>>>> +                           VhostShadowVirtqueue *svq);
>>>>>>> +void vhost_shadow_vq_stop(struct vhost_dev *dev,
>>>>>>> +                          unsigned idx,
>>>>>>> +                          VhostShadowVirtqueue *svq);
>>>>>>> +
>>>>>>>      VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx);
>>>>>>>
>>>>>>>      void vhost_shadow_vq_free(VhostShadowVirtqueue *vq);
>>>>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>>>>>>> index ac963bf23d..7ffdf9aea0 100644
>>>>>>> --- a/include/hw/virtio/vhost.h
>>>>>>> +++ b/include/hw/virtio/vhost.h
>>>>>>> @@ -55,6 +55,8 @@ struct vhost_iommu {
>>>>>>>          QLIST_ENTRY(vhost_iommu) iommu_next;
>>>>>>>      };
>>>>>>>
>>>>>>> +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
>>>>>>> +
>>>>>>>      typedef struct VhostDevConfigOps {
>>>>>>>          /* Vhost device config space changed callback
>>>>>>>           */
>>>>>>> @@ -83,7 +85,9 @@ struct vhost_dev {
>>>>>>>          uint64_t backend_cap;
>>>>>>>          bool started;
>>>>>>>          bool log_enabled;
>>>>>>> +    bool shadow_vqs_enabled;
>>>>>>>          uint64_t log_size;
>>>>>>> +    VhostShadowVirtqueue **shadow_vqs;
>>>>>> Any reason that you don't embed the shadow virtqueue into
>>>>>> vhost_virtqueue structure?
>>>>>>
>>>>> Not really, it could be relatively big and I would prefer SVQ
>>>>> members/methods to remain hidden from any other part that includes
>>>>> vhost.h. But it could be changed, for sure.
>>>>>
>>>>>> (Note that there's a masked_notifier in struct vhost_virtqueue).
>>>>>>
>>>>> They are used differently: in SVQ the masked notifier is a pointer,
>>>>> and if it's NULL the SVQ code knows that device is not masked. The
>>>>> vhost_virtqueue is the real owner.
>>>> Yes, but it's an example for embedding auxciliary data structures in the
>>>> vhost_virtqueue.
>>>>
>>>>
>>>>> It could be replaced by a boolean in SVQ or something like that, I
>>>>> experimented with a tri-state too (UNMASKED, MASKED, MASKED_NOTIFIED)
>>>>> and let vhost.c code to manage all the transitions. But I find clearer
>>>>> the pointer use, since it's the more natural for the
>>>>> vhost_virtqueue_mask, vhost_virtqueue_pending existing functions.
>>>>>
>>>>> This masking/unmasking is the part I dislike the most from this
>>>>> series, so I'm very open to alternatives.
>>>> See below. I think we don't even need to care about that.
>>>>
>>>>
>>>>>>>          Error *migration_blocker;
>>>>>>>          const VhostOps *vhost_ops;
>>>>>>>          void *opaque;
>>>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
>>>>>>> index 4512e5b058..3e43399e9c 100644
>>>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
>>>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
>>>>>>> @@ -8,9 +8,12 @@
>>>>>>>       */
>>>>>>>
>>>>>>>      #include "hw/virtio/vhost-shadow-virtqueue.h"
>>>>>>> +#include "hw/virtio/vhost.h"
>>>>>>> +
>>>>>>> +#include "standard-headers/linux/vhost_types.h"
>>>>>>>
>>>>>>>      #include "qemu/error-report.h"
>>>>>>> -#include "qemu/event_notifier.h"
>>>>>>> +#include "qemu/main-loop.h"
>>>>>>>
>>>>>>>      /* Shadow virtqueue to relay notifications */
>>>>>>>      typedef struct VhostShadowVirtqueue {
>>>>>>> @@ -18,14 +21,121 @@ typedef struct VhostShadowVirtqueue {
>>>>>>>          EventNotifier kick_notifier;
>>>>>>>          /* Shadow call notifier, sent to vhost */
>>>>>>>          EventNotifier call_notifier;
>>>>>>> +
>>>>>>> +    /*
>>>>>>> +     * Borrowed virtqueue's guest to host notifier.
>>>>>>> +     * To borrow it in this event notifier allows to register on the event
>>>>>>> +     * loop and access the associated shadow virtqueue easily. If we use the
>>>>>>> +     * VirtQueue, we don't have an easy way to retrieve it.
>>>>>> So this is something that worries me. It looks like a layer violation
>>>>>> that makes the codes harder to work correctly.
>>>>>>
>>>>> I don't follow you here.
>>>>>
>>>>> The vhost code already depends on virtqueue in the same sense:
>>>>> virtio_queue_get_host_notifier is called on vhost_virtqueue_start. So
>>>>> if this behavior ever changes it is unlikely for vhost to keep working
>>>>> without changes. vhost_virtqueue has a kick/call int where I think it
>>>>> should be stored actually, but they are never used as far as I see.
>>>>>
>>>>> Previous RFC did rely on vhost_dev_disable_notifiers. From its documentation:
>>>>> /* Stop processing guest IO notifications in vhost.
>>>>>     * Start processing them in qemu.
>>>>>     ...
>>>>> But it was easier for this mode to miss a notification, since they
>>>>> create a new host_notifier in virtio_bus_set_host_notifier right away.
>>>>> So I decided to use the file descriptor already sent to vhost in
>>>>> regular operation mode, so guest-related resources change less.
>>>>>
>>>>> Having said that, maybe it's useful to assert that
>>>>> vhost_dev_{enable,disable}_notifiers are never called on shadow
>>>>> virtqueue mode. Also, it could be useful to retrieve it from
>>>>> virtio_bus, not raw shadow virtqueue, so all get/set are performed
>>>>> from it. Would that make more sense?
>>>>>
>>>>>> I wonder if it would be simpler to start from a vDPA dedicated shadow
>>>>>> virtqueue implementation:
>>>>>>
>>>>>> 1) have the above fields embeded in vhost_vdpa structure
>>>>>> 2) Work at the level of
>>>>>> vhost_vdpa_set_vring_kick()/vhost_vdpa_set_vring_call()
>>>>>>
>>>>> This notifier is never sent to the device in shadow virtqueue mode.
>>>>> It's for SVQ to react to guest's notifications, registering it on its
>>>>> main event loop [1]. So if I perform these changes the way I
>>>>> understand them, SVQ would still rely on this borrowed EventNotifier,
>>>>> and it would send to the vDPA device the newly created kick_notifier
>>>>> of VhostShadowVirtqueue.
>>>> The point is that vhost code should be coupled loosely with virtio. If
>>>> you try to "borrow" EventNotifier from virtio, you need to deal with a
>>>> lot of synchrization. An exampleis the masking stuffs.
>>>>
>>> I still don't follow this, sorry.
>>>
>>> The svq->host_notifier event notifier is not affected by the masking
>>> issue, it is completely private to SVQ. This commit creates and uses
>>> it, and nothing related to masking is touched until the next commit.
>>>
>>>>>> Then the layer is still isolated and you have a much simpler context to
>>>>>> work that you don't need to care a lot of synchornization:
>>>>>>
>>>>>> 1) vq masking
>>>>> This EventNotifier is not used for masking, it does not change from
>>>>> the start of the shadow virtqueue operation through its end. Call fd
>>>>> sent to vhost/vdpa device does not change either in shadow virtqueue
>>>>> mode operation with masking/unmasking. I will try to document it
>>>>> better.
>>>>>
>>>>> I think that we will need to handle synchronization with
>>>>> masking/unmasking from the guest and dynamically enabling SVQ
>>>>> operation mode, since they can happen at the same time as long as we
>>>>> let the guest run. There may be better ways of synchronizing them of
>>>>> course, but I don't see how moving to the vhost-vdpa backend helps
>>>>> with this. Please expand if I've missed it.
>>>>>
>>>>> Or do you mean to forbid regular <-> SVQ operation mode transitions and delay it
>>>>> to future patchsets?
>>>> So my idea is to do all the shadow virtqueue in the vhost-vDPA codes and
>>>> hide them from the upper layers like virtio. This means it works at
>>>> vhost level which can see vhost_vring_file only. When enalbed, what it
>>>> needs is just:
>>>>
>>>> 1) switch to use svq kickfd and relay ioeventfd to svq kickfd
>>>> 2) switch to use svq callfd and relay svq callfd to irqfd
>>>>
>>>> It will still behave like a vhost-backend that the switching is done
>>>> internally in vhost-vDPA which is totally transparent to the virtio
>>>> codes of Qemu.
>>>>
>>>> E.g:
>>>>
>>>> 1) in the case of guest notifier masking, we don't need to do anything
>>>> since virtio codes will replace another irqfd for us.
>>> Assuming that we don't modify vhost masking code, but send shadow
>>> virtqueue call descriptor to the vhost device:
>>>
>>> If guest virtio code mask the virtqueue and replaces the vhost-vdpa
>>> device call fd (VhostShadowVirtqueue.call_notifier in the next commit,
>>> or the descriptor in your previous second point, svq callfd) with the
>>> masked notifier, vhost_shadow_vq_handle_call will not be called
>>> anymore, and no more used descriptors will be forwarded. They will be
>>> stuck if the shadow virtqueue forever. Guest itself cannot recover
>>> from this situation, since a masking will set irqfd, not SVQ call fd.
>>
>> Just to make sure we're in the same page. During vq masking, the virtio
>> codes actually use the masked_notifier as callfd in vhost_virtqueue_mask():
>>
>>       if (mask) {
>>           assert(vdev->use_guest_notifier_mask);
>>           file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
>>       } else {
>>       file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
>>       }
>>
>>       file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
>>       r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
>>
>> So consider the shadow virtqueue in done at vhost-vDPA. We just need to
>> make sure
>>
>> 1) update the callfd which passed by virtio layer via set_vring_kick()
>> 2) always write to the callfd during vhost_shadow_vq_handle_call()
>>
>> Then
>>
>> 3) When shadow vq is enabled, we just set the callfd of shadow virtqueue
>> to vDPA via VHOST_SET_VRING_CALL, and poll the svq callfd
>> 4) When shadow vq is disabled, we just set the callfd that is passed by
>> virtio via VHOST_SET_VRING_CALL, stop poll the svq callfd
>>
>> So you can see in step 2 and 4, we don't need to know whether or not the
>> vq is masked since we follow the vhost protocol "VhostOps" and do
>> everyhing transparently in the vhost-(vDPA) layer.
>>
> All of this assumes that we can enable/disable SVQ dynamically while
> the device is running. If it's not the case, there is no need for the
> mutex neither in vhost.c code nor vdpa_backend.
>
> As I see it, the issue is that step (2) and (4) happens in different
> threads: (2) is in vCPU vmexit, and (4) is in main event loop.
> Consider unmasking and disabling SVQ at the same time with no mutex:
>
> vCPU vmexit thread                     aio thread
> (unmask)                               (stops SVQ)
> |                                      |
> |                                      // Last callfd set was masked_notifier
> |                                      vdpa_backend.callfd = \
> |                                              atomic_read(masked_notifier).
> |                                      |
> vhost_set_vring_call(vq.guest_notifier)|
> -> vdpa_backend.callfd = \             |
>             vq.guest_notifier           |
> |                                      |
> |                                      ioctl(vdpa,
> VHOST_SET_VRING_CALL, vdpa_backend.callfd)
> |
> // guest expects more interrupts, but
> // device just set masked
>
> And vhost_set_vring_call could happen entirely even while ioctl is
> being executed.
>
> So that is the reason for the mutex: vdpa_backend.call_fd and the
> ioctl VHOST_SET_VRING_CALL must be serialized. I'm ok with moving to
> vdpa backend, but it's the same code, just in vdpa_backend.c instead
> of vhost.c, so it becomes less generic in my opinion.


You are right. But let's consider if we can avoid the dedicated mutex.

E.g. can we use the BQL? Basically we need to synchronize with the iothread.

Or is it possible to schedule a bh so that things are serialized automatically?
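
For instance, a sketch of the bh idea (svq->dev and svq->idx are assumed
fields; the point is only that the callfd switch always runs in the main
loop, which already holds the BQL):

    static void vhost_svq_stop_bh(void *opaque)
    {
        VhostShadowVirtqueue *svq = opaque;

        /* Runs in the main event loop under the BQL, so it cannot race with
         * vhost_virtqueue_mask(), which also runs with the BQL held. */
        vhost_shadow_vq_stop(svq->dev, svq->idx, svq);
    }

    /* Whoever decides to disable SVQ just schedules the switch: */
    aio_bh_schedule_oneshot(qemu_get_aio_context(), vhost_svq_stop_bh, svq);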


>
>>>> 2) easily to deal with vhost dev start and stop
>>>>
>>>> The advantages are obvious, simple and easy to implement.
>>>>
>>> I still don't see how performing this step from backend code avoids
>>> the synchronization problem, since they will be done from different
>>> threads anyway. Not sure what piece I'm missing.
>>
>> See my reply in another thread. If you enable the shadow virtqueue via a
>> OOB monitor that's a real issue.
>>
>> But I don't think we need to do that since
>>
>> 1) SVQ should be transparnet to management
>> 2) unncessary synchronization issue
>>
>> We can enable the shadow virtqueue through cli, new parameter with
>> vhost-vdpa probably. Then we don't need to care about threads. And in
>> the final version with full live migration support, the shadow virtqueue
>> should be enabled automatically. E.g for the device without
>> VHOST_F_LOG_ALL or we can have a dedicated capability of vDPA via
>> VHOST_GET_BACKEND_FEATURES.
>>
> It should be enabled automatically in those condition, but it also
> needs to be dynamic, and only be active during migration. Otherwise,
> guest should use regular vdpa operation. The problem with masking is
> the same if we enable with QMP or because live migration event.
>
> So we will have the previous synchronization problem sooner or later.
> If we omit the rollback to regular vdpa operation (in other words,
> disabling SVQ), code can be simplified, but I'm not sure if that is
> desirable.


Right, so I'm ok to have the synchronization from the start if you wish.

But we need to figure out what to synchronize and how to do the synchronization.

Thanks


>
> Thanks!
>
>> Thanks
>>
>>
>>> I can see / tested a few solutions but I don't like them a lot:
>>>
>>> * Forbid hot-swapping from/to shadow virtqueue mode, and set it from
>>> cmdline: We will have to deal with setting the SVQ mode dynamically
>>> sooner or later if we want to use it for live migration.
>>> * Forbid coming back to regular mode after switching to shadow
>>> virtqueue mode: The heavy part of the synchronization comes from svq
>>> stopping code, since we need to serialize the setting of device call
>>> fd. This could be acceptable, but I'm not sure about the implications:
>>> What happens if live migration fails and we need to step back? A mutex
>>> is not needed in this scenario, it's ok with atomics and RCU code.
>>>
>>> * Replace KVM_IRQFD instead and let SVQ poll the old one and masked
>>> notifier: I haven't thought a lot of this one, I think it's better to
>>> not touch guest notifiers.
>>> * Monitor also masked notifier from SVQ: I think this could be
>>> promising, but SVQ needs to be notified about masking/unmasking
>>> anyway, and there is code that depends on checking the masked notifier
>>> for the pending notification.
>>>
>>>>>> 2) vhost dev start and stop
>>>>>>
>>>>>> ?
>>>>>>
>>>>>>
>>>>>>> +     *
>>>>>>> +     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
>>>>>>> +     */
>>>>>>> +    EventNotifier host_notifier;
>>>>>>> +
>>>>>>> +    /* Virtio queue shadowing */
>>>>>>> +    VirtQueue *vq;
>>>>>>>      } VhostShadowVirtqueue;
>>>>>>>
>>>>>>> +/* Forward guest notifications */
>>>>>>> +static void vhost_handle_guest_kick(EventNotifier *n)
>>>>>>> +{
>>>>>>> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
>>>>>>> +                                             host_notifier);
>>>>>>> +
>>>>>>> +    if (unlikely(!event_notifier_test_and_clear(n))) {
>>>>>>> +        return;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    event_notifier_set(&svq->kick_notifier);
>>>>>>> +}
>>>>>>> +
>>>>>>> +/*
>>>>>>> + * Restore the vhost guest to host notifier, i.e., disables svq effect.
>>>>>>> + */
>>>>>>> +static int vhost_shadow_vq_restore_vdev_host_notifier(struct vhost_dev *dev,
>>>>>>> +                                                     unsigned vhost_index,
>>>>>>> +                                                     VhostShadowVirtqueue *svq)
>>>>>>> +{
>>>>>>> +    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
>>>>>>> +    struct vhost_vring_file file = {
>>>>>>> +        .index = vhost_index,
>>>>>>> +        .fd = event_notifier_get_fd(vq_host_notifier),
>>>>>>> +    };
>>>>>>> +    int r;
>>>>>>> +
>>>>>>> +    /* Restore vhost kick */
>>>>>>> +    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
>>>>>>> +    return r ? -errno : 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +/*
>>>>>>> + * Start shadow virtqueue operation.
>>>>>>> + * @dev vhost device
>>>>>>> + * @hidx vhost virtqueue index
>>>>>>> + * @svq Shadow Virtqueue
>>>>>>> + */
>>>>>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
>>>>>>> +                           unsigned idx,
>>>>>>> +                           VhostShadowVirtqueue *svq)
>>>>>> It looks to me this assumes the vhost_dev is started before
>>>>>> vhost_shadow_vq_start()?
>>>>>>
>>>>> Right.
>>>> This might not true. Guest may enable and disable virtio drivers after
>>>> the shadow virtqueue is started. You need to deal with that.
>>>>
>>> Right, I will test this scenario.
>>>
>>>> Thanks
>>>>
Eugenio Perez Martin March 18, 2021, 10:48 a.m. UTC | #8
On Thu, Mar 18, 2021 at 10:29 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/3/18 下午5:18, Eugenio Perez Martin 写道:
> > On Thu, Mar 18, 2021 at 4:11 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> 在 2021/3/18 上午12:47, Eugenio Perez Martin 写道:
> >>> On Wed, Mar 17, 2021 at 3:05 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>> 在 2021/3/16 下午6:31, Eugenio Perez Martin 写道:
> >>>>> On Tue, Mar 16, 2021 at 8:18 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>> 在 2021/3/16 上午3:48, Eugenio Pérez 写道:
> >>>>>>> Shadow virtqueue notifications forwarding is disabled when vhost_dev
> >>>>>>> stops, so code flow follows usual cleanup.
> >>>>>>>
> >>>>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> >>>>>>> ---
> >>>>>>>      hw/virtio/vhost-shadow-virtqueue.h |   7 ++
> >>>>>>>      include/hw/virtio/vhost.h          |   4 +
> >>>>>>>      hw/virtio/vhost-shadow-virtqueue.c | 113 ++++++++++++++++++++++-
> >>>>>>>      hw/virtio/vhost.c                  | 143 ++++++++++++++++++++++++++++-
> >>>>>>>      4 files changed, 265 insertions(+), 2 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> >>>>>>> index 6cc18d6acb..c891c6510d 100644
> >>>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
> >>>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> >>>>>>> @@ -17,6 +17,13 @@
> >>>>>>>
> >>>>>>>      typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> >>>>>>>
> >>>>>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
> >>>>>>> +                           unsigned idx,
> >>>>>>> +                           VhostShadowVirtqueue *svq);
> >>>>>>> +void vhost_shadow_vq_stop(struct vhost_dev *dev,
> >>>>>>> +                          unsigned idx,
> >>>>>>> +                          VhostShadowVirtqueue *svq);
> >>>>>>> +
> >>>>>>>      VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx);
> >>>>>>>
> >>>>>>>      void vhost_shadow_vq_free(VhostShadowVirtqueue *vq);
> >>>>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> >>>>>>> index ac963bf23d..7ffdf9aea0 100644
> >>>>>>> --- a/include/hw/virtio/vhost.h
> >>>>>>> +++ b/include/hw/virtio/vhost.h
> >>>>>>> @@ -55,6 +55,8 @@ struct vhost_iommu {
> >>>>>>>          QLIST_ENTRY(vhost_iommu) iommu_next;
> >>>>>>>      };
> >>>>>>>
> >>>>>>> +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> >>>>>>> +
> >>>>>>>      typedef struct VhostDevConfigOps {
> >>>>>>>          /* Vhost device config space changed callback
> >>>>>>>           */
> >>>>>>> @@ -83,7 +85,9 @@ struct vhost_dev {
> >>>>>>>          uint64_t backend_cap;
> >>>>>>>          bool started;
> >>>>>>>          bool log_enabled;
> >>>>>>> +    bool shadow_vqs_enabled;
> >>>>>>>          uint64_t log_size;
> >>>>>>> +    VhostShadowVirtqueue **shadow_vqs;
> >>>>>> Any reason that you don't embed the shadow virtqueue into
> >>>>>> vhost_virtqueue structure?
> >>>>>>
> >>>>> Not really, it could be relatively big and I would prefer SVQ
> >>>>> members/methods to remain hidden from any other part that includes
> >>>>> vhost.h. But it could be changed, for sure.
> >>>>>
> >>>>>> (Note that there's a masked_notifier in struct vhost_virtqueue).
> >>>>>>
> >>>>> They are used differently: in SVQ the masked notifier is a pointer,
> >>>>> and if it's NULL the SVQ code knows that device is not masked. The
> >>>>> vhost_virtqueue is the real owner.
> >>>> Yes, but it's an example for embedding auxciliary data structures in the
> >>>> vhost_virtqueue.
> >>>>
> >>>>
> >>>>> It could be replaced by a boolean in SVQ or something like that, I
> >>>>> experimented with a tri-state too (UNMASKED, MASKED, MASKED_NOTIFIED)
> >>>>> and let vhost.c code to manage all the transitions. But I find clearer
> >>>>> the pointer use, since it's the more natural for the
> >>>>> vhost_virtqueue_mask, vhost_virtqueue_pending existing functions.
> >>>>>
> >>>>> This masking/unmasking is the part I dislike the most from this
> >>>>> series, so I'm very open to alternatives.
> >>>> See below. I think we don't even need to care about that.
> >>>>
> >>>>
> >>>>>>>          Error *migration_blocker;
> >>>>>>>          const VhostOps *vhost_ops;
> >>>>>>>          void *opaque;
> >>>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> >>>>>>> index 4512e5b058..3e43399e9c 100644
> >>>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
> >>>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> >>>>>>> @@ -8,9 +8,12 @@
> >>>>>>>       */
> >>>>>>>
> >>>>>>>      #include "hw/virtio/vhost-shadow-virtqueue.h"
> >>>>>>> +#include "hw/virtio/vhost.h"
> >>>>>>> +
> >>>>>>> +#include "standard-headers/linux/vhost_types.h"
> >>>>>>>
> >>>>>>>      #include "qemu/error-report.h"
> >>>>>>> -#include "qemu/event_notifier.h"
> >>>>>>> +#include "qemu/main-loop.h"
> >>>>>>>
> >>>>>>>      /* Shadow virtqueue to relay notifications */
> >>>>>>>      typedef struct VhostShadowVirtqueue {
> >>>>>>> @@ -18,14 +21,121 @@ typedef struct VhostShadowVirtqueue {
> >>>>>>>          EventNotifier kick_notifier;
> >>>>>>>          /* Shadow call notifier, sent to vhost */
> >>>>>>>          EventNotifier call_notifier;
> >>>>>>> +
> >>>>>>> +    /*
> >>>>>>> +     * Borrowed virtqueue's guest to host notifier.
> >>>>>>> +     * To borrow it in this event notifier allows to register on the event
> >>>>>>> +     * loop and access the associated shadow virtqueue easily. If we use the
> >>>>>>> +     * VirtQueue, we don't have an easy way to retrieve it.
> >>>>>> So this is something that worries me. It looks like a layer violation
> >>>>>> that makes the codes harder to work correctly.
> >>>>>>
> >>>>> I don't follow you here.
> >>>>>
> >>>>> The vhost code already depends on virtqueue in the same sense:
> >>>>> virtio_queue_get_host_notifier is called on vhost_virtqueue_start. So
> >>>>> if this behavior ever changes it is unlikely for vhost to keep working
> >>>>> without changes. vhost_virtqueue has a kick/call int where I think it
> >>>>> should be stored actually, but they are never used as far as I see.
> >>>>>
> >>>>> Previous RFC did rely on vhost_dev_disable_notifiers. From its documentation:
> >>>>> /* Stop processing guest IO notifications in vhost.
> >>>>>     * Start processing them in qemu.
> >>>>>     ...
> >>>>> But it was easier for this mode to miss a notification, since they
> >>>>> create a new host_notifier in virtio_bus_set_host_notifier right away.
> >>>>> So I decided to use the file descriptor already sent to vhost in
> >>>>> regular operation mode, so guest-related resources change less.
> >>>>>
> >>>>> Having said that, maybe it's useful to assert that
> >>>>> vhost_dev_{enable,disable}_notifiers are never called on shadow
> >>>>> virtqueue mode. Also, it could be useful to retrieve it from
> >>>>> virtio_bus, not raw shadow virtqueue, so all get/set are performed
> >>>>> from it. Would that make more sense?
> >>>>>
> >>>>>> I wonder if it would be simpler to start from a vDPA dedicated shadow
> >>>>>> virtqueue implementation:
> >>>>>>
> >>>>>> 1) have the above fields embeded in vhost_vdpa structure
> >>>>>> 2) Work at the level of
> >>>>>> vhost_vdpa_set_vring_kick()/vhost_vdpa_set_vring_call()
> >>>>>>
> >>>>> This notifier is never sent to the device in shadow virtqueue mode.
> >>>>> It's for SVQ to react to guest's notifications, registering it on its
> >>>>> main event loop [1]. So if I perform these changes the way I
> >>>>> understand them, SVQ would still rely on this borrowed EventNotifier,
> >>>>> and it would send to the vDPA device the newly created kick_notifier
> >>>>> of VhostShadowVirtqueue.
> >>>> The point is that vhost code should be coupled loosely with virtio. If
> >>>> you try to "borrow" EventNotifier from virtio, you need to deal with a
> >>>> lot of synchrization. An exampleis the masking stuffs.
> >>>>
> >>> I still don't follow this, sorry.
> >>>
> >>> The svq->host_notifier event notifier is not affected by the masking
> >>> issue, it is completely private to SVQ. This commit creates and uses
> >>> it, and nothing related to masking is touched until the next commit.
> >>>
> >>>>>> Then the layer is still isolated and you have a much simpler context to
> >>>>>> work that you don't need to care a lot of synchornization:
> >>>>>>
> >>>>>> 1) vq masking
> >>>>> This EventNotifier is not used for masking, it does not change from
> >>>>> the start of the shadow virtqueue operation through its end. Call fd
> >>>>> sent to vhost/vdpa device does not change either in shadow virtqueue
> >>>>> mode operation with masking/unmasking. I will try to document it
> >>>>> better.
> >>>>>
> >>>>> I think that we will need to handle synchronization with
> >>>>> masking/unmasking from the guest and dynamically enabling SVQ
> >>>>> operation mode, since they can happen at the same time as long as we
> >>>>> let the guest run. There may be better ways of synchronizing them of
> >>>>> course, but I don't see how moving to the vhost-vdpa backend helps
> >>>>> with this. Please expand if I've missed it.
> >>>>>
> >>>>> Or do you mean to forbid regular <-> SVQ operation mode transitions and delay it
> >>>>> to future patchsets?
> >>>> So my idea is to do all the shadow virtqueue in the vhost-vDPA codes and
> >>>> hide them from the upper layers like virtio. This means it works at
> >>>> vhost level which can see vhost_vring_file only. When enalbed, what it
> >>>> needs is just:
> >>>>
> >>>> 1) switch to use svq kickfd and relay ioeventfd to svq kickfd
> >>>> 2) switch to use svq callfd and relay svq callfd to irqfd
> >>>>
> >>>> It will still behave like a vhost-backend that the switching is done
> >>>> internally in vhost-vDPA which is totally transparent to the virtio
> >>>> codes of Qemu.
> >>>>
> >>>> E.g:
> >>>>
> >>>> 1) in the case of guest notifier masking, we don't need to do anything
> >>>> since virtio codes will replace another irqfd for us.
> >>> Assuming that we don't modify vhost masking code, but send shadow
> >>> virtqueue call descriptor to the vhost device:
> >>>
> >>> If guest virtio code mask the virtqueue and replaces the vhost-vdpa
> >>> device call fd (VhostShadowVirtqueue.call_notifier in the next commit,
> >>> or the descriptor in your previous second point, svq callfd) with the
> >>> masked notifier, vhost_shadow_vq_handle_call will not be called
> >>> anymore, and no more used descriptors will be forwarded. They will be
> >>> stuck if the shadow virtqueue forever. Guest itself cannot recover
> >>> from this situation, since a masking will set irqfd, not SVQ call fd.
> >>
> >> Just to make sure we're in the same page. During vq masking, the virtio
> >> codes actually use the masked_notifier as callfd in vhost_virtqueue_mask():
> >>
> >>       if (mask) {
> >>           assert(vdev->use_guest_notifier_mask);
> >>           file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
> >>       } else {
> >>       file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
> >>       }
> >>
> >>       file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
> >>       r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
> >>
> >> So consider the shadow virtqueue in done at vhost-vDPA. We just need to
> >> make sure
> >>
> >> 1) update the callfd which passed by virtio layer via set_vring_kick()
> >> 2) always write to the callfd during vhost_shadow_vq_handle_call()
> >>
> >> Then
> >>
> >> 3) When shadow vq is enabled, we just set the callfd of shadow virtqueue
> >> to vDPA via VHOST_SET_VRING_CALL, and poll the svq callfd
> >> 4) When shadow vq is disabled, we just set the callfd that is passed by
> >> virtio via VHOST_SET_VRING_CALL, stop poll the svq callfd
> >>
> >> So you can see in step 2 and 4, we don't need to know whether or not the
> >> vq is masked since we follow the vhost protocol "VhostOps" and do
> >> everyhing transparently in the vhost-(vDPA) layer.
> >>
> > All of this assumes that we can enable/disable SVQ dynamically while
> > the device is running. If it's not the case, there is no need for the
> > mutex neither in vhost.c code nor vdpa_backend.
> >
> > As I see it, the issue is that step (2) and (4) happens in different
> > threads: (2) is in vCPU vmexit, and (4) is in main event loop.
> > Consider unmasking and disabling SVQ at the same time with no mutex:
> >
> > vCPU vmexit thread                     aio thread
> > (unmask)                               (stops SVQ)
> > |                                      |
> > |                                      // Last callfd set was masked_notifier
> > |                                      vdpa_backend.callfd = \
> > |                                              atomic_read(masked_notifier).
> > |                                      |
> > vhost_set_vring_call(vq.guest_notifier)|
> > -> vdpa_backend.callfd = \             |
> >             vq.guest_notifier           |
> > |                                      |
> > |                                      ioctl(vdpa,
> > VHOST_SET_VRING_CALL, vdpa_backend.callfd)
> > |
> > // guest expects more interrupts, but
> > // device just set masked
> >
> > And vhost_set_vring_call could happen entirely even while ioctl is
> > being executed.
> >
> > So that is the reason for the mutex: vdpa_backend.call_fd and the
> > ioctl VHOST_SET_VRING_CALL must be serialized. I'm ok with moving to
> > vdpa backend, but it's the same code, just in vdpa_backend.c instead
> > of vhost.c, so it becomes less generic in my opinion.
>
>
> You are right. But let's consider if we can avoid the dedicated mutex.
>
> E.g can we use the BQL, bascially we need to synchronizae with iothread.
>
> Or is it possible to schedule bh then things are serailzied automatically?
>

I tried RCU with no success, and I think the same issues apply to bh.
I will try to explain as best I can what I achieved in the past and
why I discarded it. I will explore BQL approaches; that could actually
be simpler.

The hard part to achieve is that no notification can be forwarded to
the guest once the masking vmexit returns (isn't it?). The unmasking
scenario is easy with RCU, since a pending notification can reach the
guest asynchronously if one exists.

On the other hand, whatever the guest sets should take priority over
whatever shadow_vq_stop sets.

With RCU, the problem is that the synchronization point should be the
vmexit thread. The function vhost_virtqueue_mask is already called
within an RCU read section, so the RCU lock can still be held after the
function returns, and the effective masking could then happen after the
vmexit returns. I see no way to express something like "call_rcu, but
only in this thread" or, in other words, "rcu_synchronize after this
thread's rcu_unlock and then run this".

I also tried to synchronize that situation in the event loop, but the
guest is able to call unmask/mask again, recreating the race condition.
If I mark a range in which unmask/mask cannot return, I'm just creating
an artificial mutex. If I retry in the main event loop, there is a
window where a notification can reach the guest after masking.

In order to reduce this window, shadow_virtqueue_stop could set the
masked notifier fd unconditionally and then, under the mutex, check
whether it should unmask. I'm not sure this is worth it, however, since
enabling/disabling already involves a system call. Roughly like the
sketch below.
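
A minimal sketch of that ordering, just to show the idea (not a patch:
"svq_lock" stands for the mutex this series introduces and
"notifier_is_masked" for the field it adds, neither exists upstream;
error handling elided, meant to live in hw/virtio/vhost.c):

/*
 * Sketch: point the device at the masked notifier first, then decide
 * under the lock whether the guest unmasked meanwhile.
 */
static void vhost_shadow_vq_stop_set_call_fd(struct vhost_dev *dev, int idx)
{
    struct vhost_virtqueue *vq = &dev->vqs[idx];
    struct vhost_vring_file file = {
        .index = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + idx),
        .fd = event_notifier_get_fd(&vq->masked_notifier),
    };

    /* 1) Unconditionally set the device call fd to the masked notifier */
    dev->vhost_ops->vhost_set_vring_call(dev, &file);

    /* 2) Only then, under the lock, honor a concurrent unmask */
    qemu_mutex_lock(&dev->svq_lock);
    if (!vq->notifier_is_masked) {
        VirtQueue *vvq = virtio_get_queue(dev->vdev, dev->vq_index + idx);
        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
        dev->vhost_ops->vhost_set_vring_call(dev, &file);
    }
    qemu_mutex_unlock(&dev->svq_lock);
}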

I think it would be way more natural to at least protect
vhost_virtqueue.notifier_is_masked with the BQL, however, so I will
check that possibility. It would be great to be able to do it in a bh,
but I think that opens a window for the host to send notifications
after the guest has masked the vq, since masking/unmasking could happen
while the bh is running, as far as I can see.

Actually, in my tests I override vhost_virtqueue_mask, so it ends up
looking closer to your VhostOps proposal.

Thanks!

>
> >
> >>>> 2) easily to deal with vhost dev start and stop
> >>>>
> >>>> The advantages are obvious, simple and easy to implement.
> >>>>
> >>> I still don't see how performing this step from backend code avoids
> >>> the synchronization problem, since they will be done from different
> >>> threads anyway. Not sure what piece I'm missing.
> >>
> >> See my reply in another thread. If you enable the shadow virtqueue via a
> >> OOB monitor that's a real issue.
> >>
> >> But I don't think we need to do that since
> >>
> >> 1) SVQ should be transparnet to management
> >> 2) unncessary synchronization issue
> >>
> >> We can enable the shadow virtqueue through cli, new parameter with
> >> vhost-vdpa probably. Then we don't need to care about threads. And in
> >> the final version with full live migration support, the shadow virtqueue
> >> should be enabled automatically. E.g for the device without
> >> VHOST_F_LOG_ALL or we can have a dedicated capability of vDPA via
> >> VHOST_GET_BACKEND_FEATURES.
> >>
> > It should be enabled automatically in those condition, but it also
> > needs to be dynamic, and only be active during migration. Otherwise,
> > guest should use regular vdpa operation. The problem with masking is
> > the same if we enable with QMP or because live migration event.
> >
> > So we will have the previous synchronization problem sooner or later.
> > If we omit the rollback to regular vdpa operation (in other words,
> > disabling SVQ), code can be simplified, but I'm not sure if that is
> > desirable.
>
>
> Rgiht, so I'm ok to have the synchronziation from the start if you wish.
>
> But we need to figure out what to synchronize and how to do synchronize.
>
> THanks
>
>
> >
> > Thanks!
> >
> >> Thanks
> >>
> >>
> >>> I can see / tested a few solutions but I don't like them a lot:
> >>>
> >>> * Forbid hot-swapping from/to shadow virtqueue mode, and set it from
> >>> cmdline: We will have to deal with setting the SVQ mode dynamically
> >>> sooner or later if we want to use it for live migration.
> >>> * Forbid coming back to regular mode after switching to shadow
> >>> virtqueue mode: The heavy part of the synchronization comes from svq
> >>> stopping code, since we need to serialize the setting of device call
> >>> fd. This could be acceptable, but I'm not sure about the implications:
> >>> What happens if live migration fails and we need to step back? A mutex
> >>> is not needed in this scenario, it's ok with atomics and RCU code.
> >>>
> >>> * Replace KVM_IRQFD instead and let SVQ poll the old one and masked
> >>> notifier: I haven't thought a lot of this one, I think it's better to
> >>> not touch guest notifiers.
> >>> * Monitor also masked notifier from SVQ: I think this could be
> >>> promising, but SVQ needs to be notified about masking/unmasking
> >>> anyway, and there is code that depends on checking the masked notifier
> >>> for the pending notification.
> >>>
> >>>>>> 2) vhost dev start and stop
> >>>>>>
> >>>>>> ?
> >>>>>>
> >>>>>>
> >>>>>>> +     *
> >>>>>>> +     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
> >>>>>>> +     */
> >>>>>>> +    EventNotifier host_notifier;
> >>>>>>> +
> >>>>>>> +    /* Virtio queue shadowing */
> >>>>>>> +    VirtQueue *vq;
> >>>>>>>      } VhostShadowVirtqueue;
> >>>>>>>
> >>>>>>> +/* Forward guest notifications */
> >>>>>>> +static void vhost_handle_guest_kick(EventNotifier *n)
> >>>>>>> +{
> >>>>>>> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> >>>>>>> +                                             host_notifier);
> >>>>>>> +
> >>>>>>> +    if (unlikely(!event_notifier_test_and_clear(n))) {
> >>>>>>> +        return;
> >>>>>>> +    }
> >>>>>>> +
> >>>>>>> +    event_notifier_set(&svq->kick_notifier);
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +/*
> >>>>>>> + * Restore the vhost guest to host notifier, i.e., disables svq effect.
> >>>>>>> + */
> >>>>>>> +static int vhost_shadow_vq_restore_vdev_host_notifier(struct vhost_dev *dev,
> >>>>>>> +                                                     unsigned vhost_index,
> >>>>>>> +                                                     VhostShadowVirtqueue *svq)
> >>>>>>> +{
> >>>>>>> +    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
> >>>>>>> +    struct vhost_vring_file file = {
> >>>>>>> +        .index = vhost_index,
> >>>>>>> +        .fd = event_notifier_get_fd(vq_host_notifier),
> >>>>>>> +    };
> >>>>>>> +    int r;
> >>>>>>> +
> >>>>>>> +    /* Restore vhost kick */
> >>>>>>> +    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
> >>>>>>> +    return r ? -errno : 0;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +/*
> >>>>>>> + * Start shadow virtqueue operation.
> >>>>>>> + * @dev vhost device
> >>>>>>> + * @hidx vhost virtqueue index
> >>>>>>> + * @svq Shadow Virtqueue
> >>>>>>> + */
> >>>>>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
> >>>>>>> +                           unsigned idx,
> >>>>>>> +                           VhostShadowVirtqueue *svq)
> >>>>>> It looks to me this assumes the vhost_dev is started before
> >>>>>> vhost_shadow_vq_start()?
> >>>>>>
> >>>>> Right.
> >>>> This might not true. Guest may enable and disable virtio drivers after
> >>>> the shadow virtqueue is started. You need to deal with that.
> >>>>
> >>> Right, I will test this scenario.
> >>>
> >>>> Thanks
> >>>>
>
Eugenio Perez Martin March 18, 2021, 12:04 p.m. UTC | #9
On Thu, Mar 18, 2021 at 11:48 AM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Thu, Mar 18, 2021 at 10:29 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> >
> > 在 2021/3/18 下午5:18, Eugenio Perez Martin 写道:
> > > On Thu, Mar 18, 2021 at 4:11 AM Jason Wang <jasowang@redhat.com> wrote:
> > >>
> > >> 在 2021/3/18 上午12:47, Eugenio Perez Martin 写道:
> > >>> On Wed, Mar 17, 2021 at 3:05 AM Jason Wang <jasowang@redhat.com> wrote:
> > >>>> 在 2021/3/16 下午6:31, Eugenio Perez Martin 写道:
> > >>>>> On Tue, Mar 16, 2021 at 8:18 AM Jason Wang <jasowang@redhat.com> wrote:
> > >>>>>> 在 2021/3/16 上午3:48, Eugenio Pérez 写道:
> > >>>>>>> Shadow virtqueue notifications forwarding is disabled when vhost_dev
> > >>>>>>> stops, so code flow follows usual cleanup.
> > >>>>>>>
> > >>>>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > >>>>>>> ---
> > >>>>>>>      hw/virtio/vhost-shadow-virtqueue.h |   7 ++
> > >>>>>>>      include/hw/virtio/vhost.h          |   4 +
> > >>>>>>>      hw/virtio/vhost-shadow-virtqueue.c | 113 ++++++++++++++++++++++-
> > >>>>>>>      hw/virtio/vhost.c                  | 143 ++++++++++++++++++++++++++++-
> > >>>>>>>      4 files changed, 265 insertions(+), 2 deletions(-)
> > >>>>>>>
> > >>>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > >>>>>>> index 6cc18d6acb..c891c6510d 100644
> > >>>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
> > >>>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > >>>>>>> @@ -17,6 +17,13 @@
> > >>>>>>>
> > >>>>>>>      typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> > >>>>>>>
> > >>>>>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
> > >>>>>>> +                           unsigned idx,
> > >>>>>>> +                           VhostShadowVirtqueue *svq);
> > >>>>>>> +void vhost_shadow_vq_stop(struct vhost_dev *dev,
> > >>>>>>> +                          unsigned idx,
> > >>>>>>> +                          VhostShadowVirtqueue *svq);
> > >>>>>>> +
> > >>>>>>>      VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx);
> > >>>>>>>
> > >>>>>>>      void vhost_shadow_vq_free(VhostShadowVirtqueue *vq);
> > >>>>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> > >>>>>>> index ac963bf23d..7ffdf9aea0 100644
> > >>>>>>> --- a/include/hw/virtio/vhost.h
> > >>>>>>> +++ b/include/hw/virtio/vhost.h
> > >>>>>>> @@ -55,6 +55,8 @@ struct vhost_iommu {
> > >>>>>>>          QLIST_ENTRY(vhost_iommu) iommu_next;
> > >>>>>>>      };
> > >>>>>>>
> > >>>>>>> +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> > >>>>>>> +
> > >>>>>>>      typedef struct VhostDevConfigOps {
> > >>>>>>>          /* Vhost device config space changed callback
> > >>>>>>>           */
> > >>>>>>> @@ -83,7 +85,9 @@ struct vhost_dev {
> > >>>>>>>          uint64_t backend_cap;
> > >>>>>>>          bool started;
> > >>>>>>>          bool log_enabled;
> > >>>>>>> +    bool shadow_vqs_enabled;
> > >>>>>>>          uint64_t log_size;
> > >>>>>>> +    VhostShadowVirtqueue **shadow_vqs;
> > >>>>>> Any reason that you don't embed the shadow virtqueue into
> > >>>>>> vhost_virtqueue structure?
> > >>>>>>
> > >>>>> Not really, it could be relatively big and I would prefer SVQ
> > >>>>> members/methods to remain hidden from any other part that includes
> > >>>>> vhost.h. But it could be changed, for sure.
> > >>>>>
> > >>>>>> (Note that there's a masked_notifier in struct vhost_virtqueue).
> > >>>>>>
> > >>>>> They are used differently: in SVQ the masked notifier is a pointer,
> > >>>>> and if it's NULL the SVQ code knows that device is not masked. The
> > >>>>> vhost_virtqueue is the real owner.
> > >>>> Yes, but it's an example for embedding auxciliary data structures in the
> > >>>> vhost_virtqueue.
> > >>>>
> > >>>>
> > >>>>> It could be replaced by a boolean in SVQ or something like that, I
> > >>>>> experimented with a tri-state too (UNMASKED, MASKED, MASKED_NOTIFIED)
> > >>>>> and let vhost.c code to manage all the transitions. But I find clearer
> > >>>>> the pointer use, since it's the more natural for the
> > >>>>> vhost_virtqueue_mask, vhost_virtqueue_pending existing functions.
> > >>>>>
> > >>>>> This masking/unmasking is the part I dislike the most from this
> > >>>>> series, so I'm very open to alternatives.
> > >>>> See below. I think we don't even need to care about that.
> > >>>>
> > >>>>
> > >>>>>>>          Error *migration_blocker;
> > >>>>>>>          const VhostOps *vhost_ops;
> > >>>>>>>          void *opaque;
> > >>>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > >>>>>>> index 4512e5b058..3e43399e9c 100644
> > >>>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
> > >>>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > >>>>>>> @@ -8,9 +8,12 @@
> > >>>>>>>       */
> > >>>>>>>
> > >>>>>>>      #include "hw/virtio/vhost-shadow-virtqueue.h"
> > >>>>>>> +#include "hw/virtio/vhost.h"
> > >>>>>>> +
> > >>>>>>> +#include "standard-headers/linux/vhost_types.h"
> > >>>>>>>
> > >>>>>>>      #include "qemu/error-report.h"
> > >>>>>>> -#include "qemu/event_notifier.h"
> > >>>>>>> +#include "qemu/main-loop.h"
> > >>>>>>>
> > >>>>>>>      /* Shadow virtqueue to relay notifications */
> > >>>>>>>      typedef struct VhostShadowVirtqueue {
> > >>>>>>> @@ -18,14 +21,121 @@ typedef struct VhostShadowVirtqueue {
> > >>>>>>>          EventNotifier kick_notifier;
> > >>>>>>>          /* Shadow call notifier, sent to vhost */
> > >>>>>>>          EventNotifier call_notifier;
> > >>>>>>> +
> > >>>>>>> +    /*
> > >>>>>>> +     * Borrowed virtqueue's guest to host notifier.
> > >>>>>>> +     * To borrow it in this event notifier allows to register on the event
> > >>>>>>> +     * loop and access the associated shadow virtqueue easily. If we use the
> > >>>>>>> +     * VirtQueue, we don't have an easy way to retrieve it.
> > >>>>>> So this is something that worries me. It looks like a layer violation
> > >>>>>> that makes the codes harder to work correctly.
> > >>>>>>
> > >>>>> I don't follow you here.
> > >>>>>
> > >>>>> The vhost code already depends on virtqueue in the same sense:
> > >>>>> virtio_queue_get_host_notifier is called on vhost_virtqueue_start. So
> > >>>>> if this behavior ever changes it is unlikely for vhost to keep working
> > >>>>> without changes. vhost_virtqueue has a kick/call int where I think it
> > >>>>> should be stored actually, but they are never used as far as I see.
> > >>>>>
> > >>>>> Previous RFC did rely on vhost_dev_disable_notifiers. From its documentation:
> > >>>>> /* Stop processing guest IO notifications in vhost.
> > >>>>>     * Start processing them in qemu.
> > >>>>>     ...
> > >>>>> But it was easier for this mode to miss a notification, since they
> > >>>>> create a new host_notifier in virtio_bus_set_host_notifier right away.
> > >>>>> So I decided to use the file descriptor already sent to vhost in
> > >>>>> regular operation mode, so guest-related resources change less.
> > >>>>>
> > >>>>> Having said that, maybe it's useful to assert that
> > >>>>> vhost_dev_{enable,disable}_notifiers are never called on shadow
> > >>>>> virtqueue mode. Also, it could be useful to retrieve it from
> > >>>>> virtio_bus, not raw shadow virtqueue, so all get/set are performed
> > >>>>> from it. Would that make more sense?
> > >>>>>
> > >>>>>> I wonder if it would be simpler to start from a vDPA dedicated shadow
> > >>>>>> virtqueue implementation:
> > >>>>>>
> > >>>>>> 1) have the above fields embeded in vhost_vdpa structure
> > >>>>>> 2) Work at the level of
> > >>>>>> vhost_vdpa_set_vring_kick()/vhost_vdpa_set_vring_call()
> > >>>>>>
> > >>>>> This notifier is never sent to the device in shadow virtqueue mode.
> > >>>>> It's for SVQ to react to guest's notifications, registering it on its
> > >>>>> main event loop [1]. So if I perform these changes the way I
> > >>>>> understand them, SVQ would still rely on this borrowed EventNotifier,
> > >>>>> and it would send to the vDPA device the newly created kick_notifier
> > >>>>> of VhostShadowVirtqueue.
> > >>>> The point is that vhost code should be coupled loosely with virtio. If
> > >>>> you try to "borrow" EventNotifier from virtio, you need to deal with a
> > >>>> lot of synchrization. An exampleis the masking stuffs.
> > >>>>
> > >>> I still don't follow this, sorry.
> > >>>
> > >>> The svq->host_notifier event notifier is not affected by the masking
> > >>> issue, it is completely private to SVQ. This commit creates and uses
> > >>> it, and nothing related to masking is touched until the next commit.
> > >>>
> > >>>>>> Then the layer is still isolated and you have a much simpler context to
> > >>>>>> work that you don't need to care a lot of synchornization:
> > >>>>>>
> > >>>>>> 1) vq masking
> > >>>>> This EventNotifier is not used for masking, it does not change from
> > >>>>> the start of the shadow virtqueue operation through its end. Call fd
> > >>>>> sent to vhost/vdpa device does not change either in shadow virtqueue
> > >>>>> mode operation with masking/unmasking. I will try to document it
> > >>>>> better.
> > >>>>>
> > >>>>> I think that we will need to handle synchronization with
> > >>>>> masking/unmasking from the guest and dynamically enabling SVQ
> > >>>>> operation mode, since they can happen at the same time as long as we
> > >>>>> let the guest run. There may be better ways of synchronizing them of
> > >>>>> course, but I don't see how moving to the vhost-vdpa backend helps
> > >>>>> with this. Please expand if I've missed it.
> > >>>>>
> > >>>>> Or do you mean to forbid regular <-> SVQ operation mode transitions and delay it
> > >>>>> to future patchsets?
> > >>>> So my idea is to do all the shadow virtqueue in the vhost-vDPA codes and
> > >>>> hide them from the upper layers like virtio. This means it works at
> > >>>> vhost level which can see vhost_vring_file only. When enalbed, what it
> > >>>> needs is just:
> > >>>>
> > >>>> 1) switch to use svq kickfd and relay ioeventfd to svq kickfd
> > >>>> 2) switch to use svq callfd and relay svq callfd to irqfd
> > >>>>
> > >>>> It will still behave like a vhost-backend that the switching is done
> > >>>> internally in vhost-vDPA which is totally transparent to the virtio
> > >>>> codes of Qemu.
> > >>>>
> > >>>> E.g:
> > >>>>
> > >>>> 1) in the case of guest notifier masking, we don't need to do anything
> > >>>> since virtio codes will replace another irqfd for us.
> > >>> Assuming that we don't modify vhost masking code, but send shadow
> > >>> virtqueue call descriptor to the vhost device:
> > >>>
> > >>> If guest virtio code mask the virtqueue and replaces the vhost-vdpa
> > >>> device call fd (VhostShadowVirtqueue.call_notifier in the next commit,
> > >>> or the descriptor in your previous second point, svq callfd) with the
> > >>> masked notifier, vhost_shadow_vq_handle_call will not be called
> > >>> anymore, and no more used descriptors will be forwarded. They will be
> > >>> stuck if the shadow virtqueue forever. Guest itself cannot recover
> > >>> from this situation, since a masking will set irqfd, not SVQ call fd.
> > >>
> > >> Just to make sure we're in the same page. During vq masking, the virtio
> > >> codes actually use the masked_notifier as callfd in vhost_virtqueue_mask():
> > >>
> > >>       if (mask) {
> > >>           assert(vdev->use_guest_notifier_mask);
> > >>           file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
> > >>       } else {
> > >>       file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
> > >>       }
> > >>
> > >>       file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
> > >>       r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
> > >>
> > >> So consider the shadow virtqueue in done at vhost-vDPA. We just need to
> > >> make sure
> > >>
> > >> 1) update the callfd which passed by virtio layer via set_vring_kick()
> > >> 2) always write to the callfd during vhost_shadow_vq_handle_call()
> > >>
> > >> Then
> > >>
> > >> 3) When shadow vq is enabled, we just set the callfd of shadow virtqueue
> > >> to vDPA via VHOST_SET_VRING_CALL, and poll the svq callfd
> > >> 4) When shadow vq is disabled, we just set the callfd that is passed by
> > >> virtio via VHOST_SET_VRING_CALL, stop poll the svq callfd
> > >>
> > >> So you can see in step 2 and 4, we don't need to know whether or not the
> > >> vq is masked since we follow the vhost protocol "VhostOps" and do
> > >> everyhing transparently in the vhost-(vDPA) layer.
> > >>
> > > All of this assumes that we can enable/disable SVQ dynamically while
> > > the device is running. If it's not the case, there is no need for the
> > > mutex neither in vhost.c code nor vdpa_backend.
> > >
> > > As I see it, the issue is that step (2) and (4) happens in different
> > > threads: (2) is in vCPU vmexit, and (4) is in main event loop.
> > > Consider unmasking and disabling SVQ at the same time with no mutex:
> > >
> > > vCPU vmexit thread                     aio thread
> > > (unmask)                               (stops SVQ)
> > > |                                      |
> > > |                                      // Last callfd set was masked_notifier
> > > |                                      vdpa_backend.callfd = \
> > > |                                              atomic_read(masked_notifier).
> > > |                                      |
> > > vhost_set_vring_call(vq.guest_notifier)|
> > > -> vdpa_backend.callfd = \             |
> > >             vq.guest_notifier           |
> > > |                                      |
> > > |                                      ioctl(vdpa,
> > > VHOST_SET_VRING_CALL, vdpa_backend.callfd)
> > > |
> > > // guest expects more interrupts, but
> > > // device just set masked
> > >
> > > And vhost_set_vring_call could happen entirely even while ioctl is
> > > being executed.
> > >
> > > So that is the reason for the mutex: vdpa_backend.call_fd and the
> > > ioctl VHOST_SET_VRING_CALL must be serialized. I'm ok with moving to
> > > vdpa backend, but it's the same code, just in vdpa_backend.c instead
> > > of vhost.c, so it becomes less generic in my opinion.
> >
> >
> > You are right. But let's consider if we can avoid the dedicated mutex.
> >
> > E.g can we use the BQL, bascially we need to synchronizae with iothread.
> >
> > Or is it possible to schedule bh then things are serailzied automatically?
> >
>
> I tried RCU with no success, and I think the same issues apply to bh.
> I will try to explain the best I can what I achieved in the past, and
> why I discarded it. I will explore BQL approaches, it could be simpler
> that way actually.
>
> The hard part to achieve if that no notification can be forwarded to
> the guest once masking vmexit returns (isn't it?). Unmasking scenario
> is easy with RCU, since the pending notification could reach the guest
> asynchronously if it exists.
>
> On the other hand, whatever guest set should take priority over
> whatever shadow_vq_stop sets.
>
> With RCU, the problem is that the synchronization point should be the
> vmexit thread. The function vhost_virtqueue_mask is already called
> within RCU, so it could happen that RCU lock is held longer than it
> returns, so the effective masking could happen after vmexit returns. I
> see no way to make something like "call_rcu but only in this thread"
> or, in other words, " rcu_synchronize after the rcu_unlock of this
> thread and then run this".
>
> I tried to explore to synchronize that situation in the event loop,
> but the guest is able to call unmask/mask again, making the race
> condition. If I can mark a range where unmask/mask cannot return, I'm
> creating an artificial mutex. If I retry in the main event loop, there
> is a window where notification can reach the guest after masking.
>
> In order to reduce this window, shadow_virtqueue_stop could set the
> masked notifier fd unconditionally, and then check if it should unmask
> under the mutex. I'm not sure if this is worth however, since
> enabling/disabling already involves a system call.
>
> I think it would be way more natural to at least protect
> vhost_virtqueue.notifier_is_masked with the BQL, however, so I will
> check that possibility. It would be great to be able to do it on bh,
> but I think this opens a window for the host to send notifications
> when the guest has masked the vq, since mask/unmasking could happen
> while bh is running as far as I see.
>

So actually vhost_shadow_virtqueue_start/stop (being a QMP command)
and vhost_virtqueue_mask (vmexit) already run under the BQL.

The ideal scenario would be to run the kick/call handlers in their own
aio context and not take the BQL in them, but I think that is doable
with just atomics and maybe events. For this series I think we can
delete the introduced mutex (and maybe replace it with an assertion?
Something like the sketch below).
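
To make that concrete, this is roughly what I mean (a sketch only, not
tested; only the invariant is shown and the existing function bodies are
left untouched):

/*
 * Sketch: if every path that changes the device call fd already runs
 * under the BQL (the QMP start/stop command, vhost_virtqueue_mask from
 * the vCPU thread, and the main-loop handlers), the mutex can become a
 * plain assertion.  qemu_mutex_iothread_locked() is declared in
 * qemu/main-loop.h.
 */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev,
                          int n, bool mask)
{
    assert(qemu_mutex_iothread_locked());
    /* ... existing masking code, unchanged ... */
}

static void vhost_shadow_vq_stop_set_call_fd(struct vhost_dev *dev, int idx)
{
    assert(qemu_mutex_iothread_locked());
    /* ... set masked/guest call fd as in the earlier sketch, no mutex ... */
}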

Thanks!

> Actually, in my test I override vhost_virtqueue_mask, so it looks more
> similar to your proposal with VhostOps.
>
> Thanks!
>
> >
> > >
> > >>>> 2) easily to deal with vhost dev start and stop
> > >>>>
> > >>>> The advantages are obvious, simple and easy to implement.
> > >>>>
> > >>> I still don't see how performing this step from backend code avoids
> > >>> the synchronization problem, since they will be done from different
> > >>> threads anyway. Not sure what piece I'm missing.
> > >>
> > >> See my reply in another thread. If you enable the shadow virtqueue via a
> > >> OOB monitor that's a real issue.
> > >>
> > >> But I don't think we need to do that since
> > >>
> > >> 1) SVQ should be transparnet to management
> > >> 2) unncessary synchronization issue
> > >>
> > >> We can enable the shadow virtqueue through cli, new parameter with
> > >> vhost-vdpa probably. Then we don't need to care about threads. And in
> > >> the final version with full live migration support, the shadow virtqueue
> > >> should be enabled automatically. E.g for the device without
> > >> VHOST_F_LOG_ALL or we can have a dedicated capability of vDPA via
> > >> VHOST_GET_BACKEND_FEATURES.
> > >>
> > > It should be enabled automatically in those condition, but it also
> > > needs to be dynamic, and only be active during migration. Otherwise,
> > > guest should use regular vdpa operation. The problem with masking is
> > > the same if we enable with QMP or because live migration event.
> > >
> > > So we will have the previous synchronization problem sooner or later.
> > > If we omit the rollback to regular vdpa operation (in other words,
> > > disabling SVQ), code can be simplified, but I'm not sure if that is
> > > desirable.
> >
> >
> > Rgiht, so I'm ok to have the synchronziation from the start if you wish.
> >
> > But we need to figure out what to synchronize and how to do synchronize.
> >
> > THanks
> >
> >
> > >
> > > Thanks!
> > >
> > >> Thanks
> > >>
> > >>
> > >>> I can see / tested a few solutions but I don't like them a lot:
> > >>>
> > >>> * Forbid hot-swapping from/to shadow virtqueue mode, and set it from
> > >>> cmdline: We will have to deal with setting the SVQ mode dynamically
> > >>> sooner or later if we want to use it for live migration.
> > >>> * Forbid coming back to regular mode after switching to shadow
> > >>> virtqueue mode: The heavy part of the synchronization comes from svq
> > >>> stopping code, since we need to serialize the setting of device call
> > >>> fd. This could be acceptable, but I'm not sure about the implications:
> > >>> What happens if live migration fails and we need to step back? A mutex
> > >>> is not needed in this scenario, it's ok with atomics and RCU code.
> > >>>
> > >>> * Replace KVM_IRQFD instead and let SVQ poll the old one and masked
> > >>> notifier: I haven't thought a lot of this one, I think it's better to
> > >>> not touch guest notifiers.
> > >>> * Monitor also masked notifier from SVQ: I think this could be
> > >>> promising, but SVQ needs to be notified about masking/unmasking
> > >>> anyway, and there is code that depends on checking the masked notifier
> > >>> for the pending notification.
> > >>>
> > >>>>>> 2) vhost dev start and stop
> > >>>>>>
> > >>>>>> ?
> > >>>>>>
> > >>>>>>
> > >>>>>>> +     *
> > >>>>>>> +     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
> > >>>>>>> +     */
> > >>>>>>> +    EventNotifier host_notifier;
> > >>>>>>> +
> > >>>>>>> +    /* Virtio queue shadowing */
> > >>>>>>> +    VirtQueue *vq;
> > >>>>>>>      } VhostShadowVirtqueue;
> > >>>>>>>
> > >>>>>>> +/* Forward guest notifications */
> > >>>>>>> +static void vhost_handle_guest_kick(EventNotifier *n)
> > >>>>>>> +{
> > >>>>>>> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
> > >>>>>>> +                                             host_notifier);
> > >>>>>>> +
> > >>>>>>> +    if (unlikely(!event_notifier_test_and_clear(n))) {
> > >>>>>>> +        return;
> > >>>>>>> +    }
> > >>>>>>> +
> > >>>>>>> +    event_notifier_set(&svq->kick_notifier);
> > >>>>>>> +}
> > >>>>>>> +
> > >>>>>>> +/*
> > >>>>>>> + * Restore the vhost guest to host notifier, i.e., disables svq effect.
> > >>>>>>> + */
> > >>>>>>> +static int vhost_shadow_vq_restore_vdev_host_notifier(struct vhost_dev *dev,
> > >>>>>>> +                                                     unsigned vhost_index,
> > >>>>>>> +                                                     VhostShadowVirtqueue *svq)
> > >>>>>>> +{
> > >>>>>>> +    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
> > >>>>>>> +    struct vhost_vring_file file = {
> > >>>>>>> +        .index = vhost_index,
> > >>>>>>> +        .fd = event_notifier_get_fd(vq_host_notifier),
> > >>>>>>> +    };
> > >>>>>>> +    int r;
> > >>>>>>> +
> > >>>>>>> +    /* Restore vhost kick */
> > >>>>>>> +    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
> > >>>>>>> +    return r ? -errno : 0;
> > >>>>>>> +}
> > >>>>>>> +
> > >>>>>>> +/*
> > >>>>>>> + * Start shadow virtqueue operation.
> > >>>>>>> + * @dev vhost device
> > >>>>>>> + * @hidx vhost virtqueue index
> > >>>>>>> + * @svq Shadow Virtqueue
> > >>>>>>> + */
> > >>>>>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
> > >>>>>>> +                           unsigned idx,
> > >>>>>>> +                           VhostShadowVirtqueue *svq)
> > >>>>>> It looks to me this assumes the vhost_dev is started before
> > >>>>>> vhost_shadow_vq_start()?
> > >>>>>>
> > >>>>> Right.
> > >>>> This might not true. Guest may enable and disable virtio drivers after
> > >>>> the shadow virtqueue is started. You need to deal with that.
> > >>>>
> > >>> Right, I will test this scenario.
> > >>>
> > >>>> Thanks
> > >>>>
> >
Jason Wang March 19, 2021, 6:55 a.m. UTC | #10
On 2021/3/18 8:04 PM, Eugenio Perez Martin wrote:
> On Thu, Mar 18, 2021 at 11:48 AM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
>> On Thu, Mar 18, 2021 at 10:29 AM Jason Wang <jasowang@redhat.com> wrote:
>>>
>>> 在 2021/3/18 下午5:18, Eugenio Perez Martin 写道:
>>>> On Thu, Mar 18, 2021 at 4:11 AM Jason Wang <jasowang@redhat.com> wrote:
>>>>> 在 2021/3/18 上午12:47, Eugenio Perez Martin 写道:
>>>>>> On Wed, Mar 17, 2021 at 3:05 AM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>> 在 2021/3/16 下午6:31, Eugenio Perez Martin 写道:
>>>>>>>> On Tue, Mar 16, 2021 at 8:18 AM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>>> 在 2021/3/16 上午3:48, Eugenio Pérez 写道:
>>>>>>>>>> Shadow virtqueue notifications forwarding is disabled when vhost_dev
>>>>>>>>>> stops, so code flow follows usual cleanup.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
>>>>>>>>>> ---
>>>>>>>>>>       hw/virtio/vhost-shadow-virtqueue.h |   7 ++
>>>>>>>>>>       include/hw/virtio/vhost.h          |   4 +
>>>>>>>>>>       hw/virtio/vhost-shadow-virtqueue.c | 113 ++++++++++++++++++++++-
>>>>>>>>>>       hw/virtio/vhost.c                  | 143 ++++++++++++++++++++++++++++-
>>>>>>>>>>       4 files changed, 265 insertions(+), 2 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
>>>>>>>>>> index 6cc18d6acb..c891c6510d 100644
>>>>>>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
>>>>>>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
>>>>>>>>>> @@ -17,6 +17,13 @@
>>>>>>>>>>
>>>>>>>>>>       typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
>>>>>>>>>>
>>>>>>>>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
>>>>>>>>>> +                           unsigned idx,
>>>>>>>>>> +                           VhostShadowVirtqueue *svq);
>>>>>>>>>> +void vhost_shadow_vq_stop(struct vhost_dev *dev,
>>>>>>>>>> +                          unsigned idx,
>>>>>>>>>> +                          VhostShadowVirtqueue *svq);
>>>>>>>>>> +
>>>>>>>>>>       VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx);
>>>>>>>>>>
>>>>>>>>>>       void vhost_shadow_vq_free(VhostShadowVirtqueue *vq);
>>>>>>>>>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>>>>>>>>>> index ac963bf23d..7ffdf9aea0 100644
>>>>>>>>>> --- a/include/hw/virtio/vhost.h
>>>>>>>>>> +++ b/include/hw/virtio/vhost.h
>>>>>>>>>> @@ -55,6 +55,8 @@ struct vhost_iommu {
>>>>>>>>>>           QLIST_ENTRY(vhost_iommu) iommu_next;
>>>>>>>>>>       };
>>>>>>>>>>
>>>>>>>>>> +typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
>>>>>>>>>> +
>>>>>>>>>>       typedef struct VhostDevConfigOps {
>>>>>>>>>>           /* Vhost device config space changed callback
>>>>>>>>>>            */
>>>>>>>>>> @@ -83,7 +85,9 @@ struct vhost_dev {
>>>>>>>>>>           uint64_t backend_cap;
>>>>>>>>>>           bool started;
>>>>>>>>>>           bool log_enabled;
>>>>>>>>>> +    bool shadow_vqs_enabled;
>>>>>>>>>>           uint64_t log_size;
>>>>>>>>>> +    VhostShadowVirtqueue **shadow_vqs;
>>>>>>>>> Any reason that you don't embed the shadow virtqueue into
>>>>>>>>> vhost_virtqueue structure?
>>>>>>>>>
>>>>>>>> Not really, it could be relatively big and I would prefer SVQ
>>>>>>>> members/methods to remain hidden from any other part that includes
>>>>>>>> vhost.h. But it could be changed, for sure.
>>>>>>>>
>>>>>>>>> (Note that there's a masked_notifier in struct vhost_virtqueue).
>>>>>>>>>
>>>>>>>> They are used differently: in SVQ the masked notifier is a pointer,
>>>>>>>> and if it's NULL the SVQ code knows that device is not masked. The
>>>>>>>> vhost_virtqueue is the real owner.
>>>>>>> Yes, but it's an example for embedding auxciliary data structures in the
>>>>>>> vhost_virtqueue.
>>>>>>>
>>>>>>>
>>>>>>>> It could be replaced by a boolean in SVQ or something like that, I
>>>>>>>> experimented with a tri-state too (UNMASKED, MASKED, MASKED_NOTIFIED)
>>>>>>>> and let vhost.c code to manage all the transitions. But I find clearer
>>>>>>>> the pointer use, since it's the more natural for the
>>>>>>>> vhost_virtqueue_mask, vhost_virtqueue_pending existing functions.
>>>>>>>>
>>>>>>>> This masking/unmasking is the part I dislike the most from this
>>>>>>>> series, so I'm very open to alternatives.
>>>>>>> See below. I think we don't even need to care about that.
>>>>>>>
>>>>>>>
>>>>>>>>>>           Error *migration_blocker;
>>>>>>>>>>           const VhostOps *vhost_ops;
>>>>>>>>>>           void *opaque;
>>>>>>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
>>>>>>>>>> index 4512e5b058..3e43399e9c 100644
>>>>>>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
>>>>>>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
>>>>>>>>>> @@ -8,9 +8,12 @@
>>>>>>>>>>        */
>>>>>>>>>>
>>>>>>>>>>       #include "hw/virtio/vhost-shadow-virtqueue.h"
>>>>>>>>>> +#include "hw/virtio/vhost.h"
>>>>>>>>>> +
>>>>>>>>>> +#include "standard-headers/linux/vhost_types.h"
>>>>>>>>>>
>>>>>>>>>>       #include "qemu/error-report.h"
>>>>>>>>>> -#include "qemu/event_notifier.h"
>>>>>>>>>> +#include "qemu/main-loop.h"
>>>>>>>>>>
>>>>>>>>>>       /* Shadow virtqueue to relay notifications */
>>>>>>>>>>       typedef struct VhostShadowVirtqueue {
>>>>>>>>>> @@ -18,14 +21,121 @@ typedef struct VhostShadowVirtqueue {
>>>>>>>>>>           EventNotifier kick_notifier;
>>>>>>>>>>           /* Shadow call notifier, sent to vhost */
>>>>>>>>>>           EventNotifier call_notifier;
>>>>>>>>>> +
>>>>>>>>>> +    /*
>>>>>>>>>> +     * Borrowed virtqueue's guest to host notifier.
>>>>>>>>>> +     * To borrow it in this event notifier allows to register on the event
>>>>>>>>>> +     * loop and access the associated shadow virtqueue easily. If we use the
>>>>>>>>>> +     * VirtQueue, we don't have an easy way to retrieve it.
>>>>>>>>> So this is something that worries me. It looks like a layer violation
>>>>>>>>> that makes the codes harder to work correctly.
>>>>>>>>>
>>>>>>>> I don't follow you here.
>>>>>>>>
>>>>>>>> The vhost code already depends on virtqueue in the same sense:
>>>>>>>> virtio_queue_get_host_notifier is called on vhost_virtqueue_start. So
>>>>>>>> if this behavior ever changes it is unlikely for vhost to keep working
>>>>>>>> without changes. vhost_virtqueue has a kick/call int where I think it
>>>>>>>> should be stored actually, but they are never used as far as I see.
>>>>>>>>
>>>>>>>> Previous RFC did rely on vhost_dev_disable_notifiers. From its documentation:
>>>>>>>> /* Stop processing guest IO notifications in vhost.
>>>>>>>>      * Start processing them in qemu.
>>>>>>>>      ...
>>>>>>>> But it was easier for this mode to miss a notification, since they
>>>>>>>> create a new host_notifier in virtio_bus_set_host_notifier right away.
>>>>>>>> So I decided to use the file descriptor already sent to vhost in
>>>>>>>> regular operation mode, so guest-related resources change less.
>>>>>>>>
>>>>>>>> Having said that, maybe it's useful to assert that
>>>>>>>> vhost_dev_{enable,disable}_notifiers are never called on shadow
>>>>>>>> virtqueue mode. Also, it could be useful to retrieve it from
>>>>>>>> virtio_bus, not raw shadow virtqueue, so all get/set are performed
>>>>>>>> from it. Would that make more sense?
>>>>>>>>
>>>>>>>>> I wonder if it would be simpler to start from a vDPA dedicated shadow
>>>>>>>>> virtqueue implementation:
>>>>>>>>>
>>>>>>>>> 1) have the above fields embeded in vhost_vdpa structure
>>>>>>>>> 2) Work at the level of
>>>>>>>>> vhost_vdpa_set_vring_kick()/vhost_vdpa_set_vring_call()
>>>>>>>>>
>>>>>>>> This notifier is never sent to the device in shadow virtqueue mode.
>>>>>>>> It's for SVQ to react to guest's notifications, registering it on its
>>>>>>>> main event loop [1]. So if I perform these changes the way I
>>>>>>>> understand them, SVQ would still rely on this borrowed EventNotifier,
>>>>>>>> and it would send to the vDPA device the newly created kick_notifier
>>>>>>>> of VhostShadowVirtqueue.
>>>>>>> The point is that vhost code should be coupled loosely with virtio. If
>>>>>>> you try to "borrow" EventNotifier from virtio, you need to deal with a
>>>>>>> lot of synchrization. An exampleis the masking stuffs.
>>>>>>>
>>>>>> I still don't follow this, sorry.
>>>>>>
>>>>>> The svq->host_notifier event notifier is not affected by the masking
>>>>>> issue, it is completely private to SVQ. This commit creates and uses
>>>>>> it, and nothing related to masking is touched until the next commit.
>>>>>>
>>>>>>>>> Then the layer is still isolated and you have a much simpler context to
>>>>>>>>> work that you don't need to care a lot of synchornization:
>>>>>>>>>
>>>>>>>>> 1) vq masking
>>>>>>>> This EventNotifier is not used for masking, it does not change from
>>>>>>>> the start of the shadow virtqueue operation through its end. Call fd
>>>>>>>> sent to vhost/vdpa device does not change either in shadow virtqueue
>>>>>>>> mode operation with masking/unmasking. I will try to document it
>>>>>>>> better.
>>>>>>>>
>>>>>>>> I think that we will need to handle synchronization with
>>>>>>>> masking/unmasking from the guest and dynamically enabling SVQ
>>>>>>>> operation mode, since they can happen at the same time as long as we
>>>>>>>> let the guest run. There may be better ways of synchronizing them of
>>>>>>>> course, but I don't see how moving to the vhost-vdpa backend helps
>>>>>>>> with this. Please expand if I've missed it.
>>>>>>>>
>>>>>>>> Or do you mean to forbid regular <-> SVQ operation mode transitions and delay it
>>>>>>>> to future patchsets?
>>>>>>> So my idea is to do all the shadow virtqueue in the vhost-vDPA codes and
>>>>>>> hide them from the upper layers like virtio. This means it works at
>>>>>>> vhost level which can see vhost_vring_file only. When enalbed, what it
>>>>>>> needs is just:
>>>>>>>
>>>>>>> 1) switch to use svq kickfd and relay ioeventfd to svq kickfd
>>>>>>> 2) switch to use svq callfd and relay svq callfd to irqfd
>>>>>>>
>>>>>>> It will still behave like a vhost-backend that the switching is done
>>>>>>> internally in vhost-vDPA which is totally transparent to the virtio
>>>>>>> codes of Qemu.
>>>>>>>
>>>>>>> E.g:
>>>>>>>
>>>>>>> 1) in the case of guest notifier masking, we don't need to do anything
>>>>>>> since virtio codes will replace another irqfd for us.
>>>>>> Assuming that we don't modify vhost masking code, but send shadow
>>>>>> virtqueue call descriptor to the vhost device:
>>>>>>
>>>>>> If guest virtio code mask the virtqueue and replaces the vhost-vdpa
>>>>>> device call fd (VhostShadowVirtqueue.call_notifier in the next commit,
>>>>>> or the descriptor in your previous second point, svq callfd) with the
>>>>>> masked notifier, vhost_shadow_vq_handle_call will not be called
>>>>>> anymore, and no more used descriptors will be forwarded. They will be
>>>>>> stuck if the shadow virtqueue forever. Guest itself cannot recover
>>>>>> from this situation, since a masking will set irqfd, not SVQ call fd.
>>>>> Just to make sure we're in the same page. During vq masking, the virtio
>>>>> codes actually use the masked_notifier as callfd in vhost_virtqueue_mask():
>>>>>
>>>>>        if (mask) {
>>>>>            assert(vdev->use_guest_notifier_mask);
>>>>>            file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
>>>>>        } else {
>>>>>        file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
>>>>>        }
>>>>>
>>>>>        file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
>>>>>        r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
>>>>>
>>>>> So consider the shadow virtqueue in done at vhost-vDPA. We just need to
>>>>> make sure
>>>>>
>>>>> 1) update the callfd which passed by virtio layer via set_vring_kick()
>>>>> 2) always write to the callfd during vhost_shadow_vq_handle_call()
>>>>>
>>>>> Then
>>>>>
>>>>> 3) When shadow vq is enabled, we just set the callfd of shadow virtqueue
>>>>> to vDPA via VHOST_SET_VRING_CALL, and poll the svq callfd
>>>>> 4) When shadow vq is disabled, we just set the callfd that is passed by
>>>>> virtio via VHOST_SET_VRING_CALL, stop poll the svq callfd
>>>>>
>>>>> So you can see in step 2 and 4, we don't need to know whether or not the
>>>>> vq is masked since we follow the vhost protocol "VhostOps" and do
>>>>> everyhing transparently in the vhost-(vDPA) layer.
>>>>>
>>>> All of this assumes that we can enable/disable SVQ dynamically while
>>>> the device is running. If it's not the case, there is no need for the
>>>> mutex neither in vhost.c code nor vdpa_backend.
>>>>
>>>> As I see it, the issue is that step (2) and (4) happens in different
>>>> threads: (2) is in vCPU vmexit, and (4) is in main event loop.
>>>> Consider unmasking and disabling SVQ at the same time with no mutex:
>>>>
>>>> vCPU vmexit thread                     aio thread
>>>> (unmask)                               (stops SVQ)
>>>> |                                      |
>>>> |                                      // Last callfd set was masked_notifier
>>>> |                                      vdpa_backend.callfd = \
>>>> |                                              atomic_read(masked_notifier).
>>>> |                                      |
>>>> vhost_set_vring_call(vq.guest_notifier)|
>>>> -> vdpa_backend.callfd = \             |
>>>>              vq.guest_notifier           |
>>>> |                                      |
>>>> |                                      ioctl(vdpa,
>>>> VHOST_SET_VRING_CALL, vdpa_backend.callfd)
>>>> |
>>>> // guest expects more interrupts, but
>>>> // device just set masked
>>>>
>>>> And vhost_set_vring_call could happen entirely even while ioctl is
>>>> being executed.
>>>>
>>>> So that is the reason for the mutex: vdpa_backend.call_fd and the
>>>> ioctl VHOST_SET_VRING_CALL must be serialized. I'm ok with moving to
>>>> vdpa backend, but it's the same code, just in vdpa_backend.c instead
>>>> of vhost.c, so it becomes less generic in my opinion.
>>>
>>> You are right. But let's consider if we can avoid the dedicated mutex.
>>>
>>> E.g can we use the BQL, bascially we need to synchronizae with iothread.
>>>
>>> Or is it possible to schedule bh then things are serailzied automatically?
>>>
>> I tried RCU with no success, and I think the same issues apply to bh.
>> I will try to explain the best I can what I achieved in the past, and
>> why I discarded it. I will explore BQL approaches, it could be simpler
>> that way actually.
>>
>> The hard part to achieve if that no notification can be forwarded to
>> the guest once masking vmexit returns (isn't it?). Unmasking scenario
>> is easy with RCU, since the pending notification could reach the guest
>> asynchronously if it exists.
>>
>> On the other hand, whatever guest set should take priority over
>> whatever shadow_vq_stop sets.
>>
>> With RCU, the problem is that the synchronization point should be the
>> vmexit thread. The function vhost_virtqueue_mask is already called
>> within RCU, so it could happen that RCU lock is held longer than it
>> returns, so the effective masking could happen after vmexit returns. I
>> see no way to make something like "call_rcu but only in this thread"
>> or, in other words, " rcu_synchronize after the rcu_unlock of this
>> thread and then run this".
>>
>> I tried to explore to synchronize that situation in the event loop,
>> but the guest is able to call unmask/mask again, making the race
>> condition. If I can mark a range where unmask/mask cannot return, I'm
>> creating an artificial mutex. If I retry in the main event loop, there
>> is a window where notification can reach the guest after masking.
>>
>> In order to reduce this window, shadow_virtqueue_stop could set the
>> masked notifier fd unconditionally, and then check if it should unmask
>> under the mutex. I'm not sure if this is worth however, since
>> enabling/disabling already involves a system call.
>>
>> I think it would be way more natural to at least protect
>> vhost_virtqueue.notifier_is_masked with the BQL, however, so I will
>> check that possibility. It would be great to be able to do it on bh,
>> but I think this opens a window for the host to send notifications
>> when the guest has masked the vq, since mask/unmasking could happen
>> while bh is running as far as I see.
>>
> So actually vhost_shadow_virtqueue_start/stop (being a QMP command)
> and vhost_virtqueue_mask (vmexit) already runs under BQL.


So actually everything is already synchronized?

1) MSI-X MMIO handlers from the vCPU thread with the BQL
2) Poll for ioeventfd and vhost callfd from the iothread with the BQL
3) The monitor command with the BQL held
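
So at the vhost-vDPA level the switching could look roughly like this
(just a sketch of the shape, not tested; the "guest_call_fd" and
"shadow_vqs_enabled" fields are made up here, and error handling is
elided):

/*
 * Sketch of doing the callfd switch inside vhost-vDPA: remember the fd
 * that virtio passes via vhost_set_vring_call() and route either it or
 * the shadow virtqueue callfd to the device.  All the callers above run
 * under the BQL, so no extra lock is needed.
 */
static int vhost_vdpa_set_vring_call_sketch(struct vhost_dev *dev,
                                            struct vhost_vring_file *file)
{
    struct vhost_vdpa *v = dev->opaque;

    v->guest_call_fd[file->index] = file->fd;    /* illustrative field */

    if (v->shadow_vqs_enabled) {
        /* Device keeps using the SVQ callfd; SVQ relays to guest_call_fd */
        return 0;
    }

    return ioctl(v->device_fd, VHOST_SET_VRING_CALL, file);
}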


>
> The ideal scenario would be to run kick/call handlers in its own aio
> context and do not take BQL on them,


Well, then you would still need to synchronize with QMP and MSI-X MMIO.


> but I think this is doable with
> just atomics and maybe events. For this series I think we can delete
> the introduced mutex (and maybe replace it with an assertion?)


Let's try to avoid an assertion here.

Thanks


>
> Thanks!
>
>> Actually, in my test I override vhost_virtqueue_mask, so it looks more
>> similar to your proposal with VhostOps.
>>
>> Thanks!
>>
>>>>>>> 2) easily to deal with vhost dev start and stop
>>>>>>>
>>>>>>> The advantages are obvious, simple and easy to implement.
>>>>>>>
>>>>>> I still don't see how performing this step from backend code avoids
>>>>>> the synchronization problem, since they will be done from different
>>>>>> threads anyway. Not sure what piece I'm missing.
>>>>> See my reply in another thread. If you enable the shadow virtqueue via a
>>>>> OOB monitor that's a real issue.
>>>>>
>>>>> But I don't think we need to do that since
>>>>>
>>>>> 1) SVQ should be transparnet to management
>>>>> 2) unncessary synchronization issue
>>>>>
>>>>> We can enable the shadow virtqueue through cli, new parameter with
>>>>> vhost-vdpa probably. Then we don't need to care about threads. And in
>>>>> the final version with full live migration support, the shadow virtqueue
>>>>> should be enabled automatically. E.g for the device without
>>>>> VHOST_F_LOG_ALL or we can have a dedicated capability of vDPA via
>>>>> VHOST_GET_BACKEND_FEATURES.
>>>>>
>>>> It should be enabled automatically under those conditions, but it also
>>>> needs to be dynamic and only be active during migration. Otherwise, the
>>>> guest should use regular vdpa operation. The problem with masking is
>>>> the same whether we enable it with QMP or because of a live migration
>>>> event.
>>>>
>>>> So we will have the previous synchronization problem sooner or later.
>>>> If we omit the rollback to regular vdpa operation (in other words,
>>>> disabling SVQ), the code can be simplified, but I'm not sure that is
>>>> desirable.
>>>
>>> Right, so I'm OK with having the synchronization from the start if you wish.
>>>
>>> But we need to figure out what to synchronize and how to synchronize it.
>>>
>>> Thanks
>>>
>>>
>>>> Thanks!
>>>>
>>>>> Thanks
>>>>>
>>>>>
>>>>>> I can see / have tested a few solutions, but I don't like any of them much:
>>>>>>
>>>>>> * Forbid hot-swapping from/to shadow virtqueue mode, and set it from the
>>>>>> cmdline: we will have to deal with setting the SVQ mode dynamically
>>>>>> sooner or later if we want to use it for live migration.
>>>>>> * Forbid coming back to regular mode after switching to shadow
>>>>>> virtqueue mode: the heavy part of the synchronization comes from the SVQ
>>>>>> stopping code, since we need to serialize the setting of the device call
>>>>>> fd. This could be acceptable, but I'm not sure about the implications:
>>>>>> what happens if live migration fails and we need to step back? A mutex
>>>>>> is not needed in this scenario; it's fine with atomics and RCU code.
>>>>>>
>>>>>> * Replace KVM_IRQFD instead and let SVQ poll the old one and the masked
>>>>>> notifier: I haven't thought much about this one; I think it's better
>>>>>> not to touch the guest notifiers.
>>>>>> * Monitor the masked notifier also from SVQ: I think this could be
>>>>>> promising, but SVQ needs to be notified about masking/unmasking
>>>>>> anyway, and there is code that depends on checking the masked notifier
>>>>>> for the pending notification (see the sketch after this list).
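
A rough sketch of that last option. The masked_notifier field inside
VhostShadowVirtqueue and the masked_pending flag are both assumed for
illustration; they are not part of this patch:

/*
 * Poll the virtqueue's masked notifier from SVQ as well, so a device
 * call that lands there while the guest has the vq masked is not lost.
 */
static void vhost_handle_masked_call(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                             masked_notifier);

    if (unlikely(!event_notifier_test_and_clear(n))) {
        return;
    }

    /* Remember the notification; forward it to the guest on unmask */
    qatomic_set(&svq->masked_pending, true);
}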
>>>>>>
>>>>>>>>> 2) vhost dev start and stop
>>>>>>>>>
>>>>>>>>> ?
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> +     *
>>>>>>>>>> +     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
>>>>>>>>>> +     */
>>>>>>>>>> +    EventNotifier host_notifier;
>>>>>>>>>> +
>>>>>>>>>> +    /* Virtio queue shadowing */
>>>>>>>>>> +    VirtQueue *vq;
>>>>>>>>>>       } VhostShadowVirtqueue;
>>>>>>>>>>
>>>>>>>>>> +/* Forward guest notifications */
>>>>>>>>>> +static void vhost_handle_guest_kick(EventNotifier *n)
>>>>>>>>>> +{
>>>>>>>>>> +    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
>>>>>>>>>> +                                             host_notifier);
>>>>>>>>>> +
>>>>>>>>>> +    if (unlikely(!event_notifier_test_and_clear(n))) {
>>>>>>>>>> +        return;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    event_notifier_set(&svq->kick_notifier);
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +/*
>>>>>>>>>> + * Restore the vhost guest to host notifier, i.e., disables svq effect.
>>>>>>>>>> + */
>>>>>>>>>> +static int vhost_shadow_vq_restore_vdev_host_notifier(struct vhost_dev *dev,
>>>>>>>>>> +                                                     unsigned vhost_index,
>>>>>>>>>> +                                                     VhostShadowVirtqueue *svq)
>>>>>>>>>> +{
>>>>>>>>>> +    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
>>>>>>>>>> +    struct vhost_vring_file file = {
>>>>>>>>>> +        .index = vhost_index,
>>>>>>>>>> +        .fd = event_notifier_get_fd(vq_host_notifier),
>>>>>>>>>> +    };
>>>>>>>>>> +    int r;
>>>>>>>>>> +
>>>>>>>>>> +    /* Restore vhost kick */
>>>>>>>>>> +    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
>>>>>>>>>> +    return r ? -errno : 0;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +/*
>>>>>>>>>> + * Start shadow virtqueue operation.
>>>>>>>>>> + * @dev vhost device
>>>>>>>>>> + * @hidx vhost virtqueue index
>>>>>>>>>> + * @svq Shadow Virtqueue
>>>>>>>>>> + */
>>>>>>>>>> +bool vhost_shadow_vq_start(struct vhost_dev *dev,
>>>>>>>>>> +                           unsigned idx,
>>>>>>>>>> +                           VhostShadowVirtqueue *svq)
>>>>>>>>> It looks to me that this assumes the vhost_dev is started before
>>>>>>>>> vhost_shadow_vq_start()?
>>>>>>>>>
>>>>>>>> Right.
>>>>>>> This might not be true. The guest may enable and disable virtio drivers
>>>>>>> after the shadow virtqueue is started. You need to deal with that.
>>>>>>>
>>>>>> Right, I will test this scenario.
>>>>>>
>>>>>>> Thanks
>>>>>>>
diff mbox series

Patch

diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index 6cc18d6acb..c891c6510d 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -17,6 +17,13 @@ 
 
 typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
 
+bool vhost_shadow_vq_start(struct vhost_dev *dev,
+                           unsigned idx,
+                           VhostShadowVirtqueue *svq);
+void vhost_shadow_vq_stop(struct vhost_dev *dev,
+                          unsigned idx,
+                          VhostShadowVirtqueue *svq);
+
 VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx);
 
 void vhost_shadow_vq_free(VhostShadowVirtqueue *vq);
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index ac963bf23d..7ffdf9aea0 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -55,6 +55,8 @@  struct vhost_iommu {
     QLIST_ENTRY(vhost_iommu) iommu_next;
 };
 
+typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
+
 typedef struct VhostDevConfigOps {
     /* Vhost device config space changed callback
      */
@@ -83,7 +85,9 @@  struct vhost_dev {
     uint64_t backend_cap;
     bool started;
     bool log_enabled;
+    bool shadow_vqs_enabled;
     uint64_t log_size;
+    VhostShadowVirtqueue **shadow_vqs;
     Error *migration_blocker;
     const VhostOps *vhost_ops;
     void *opaque;
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index 4512e5b058..3e43399e9c 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -8,9 +8,12 @@ 
  */
 
 #include "hw/virtio/vhost-shadow-virtqueue.h"
+#include "hw/virtio/vhost.h"
+
+#include "standard-headers/linux/vhost_types.h"
 
 #include "qemu/error-report.h"
-#include "qemu/event_notifier.h"
+#include "qemu/main-loop.h"
 
 /* Shadow virtqueue to relay notifications */
 typedef struct VhostShadowVirtqueue {
@@ -18,14 +21,121 @@  typedef struct VhostShadowVirtqueue {
     EventNotifier kick_notifier;
     /* Shadow call notifier, sent to vhost */
     EventNotifier call_notifier;
+
+    /*
+     * Borrowed virtqueue's guest to host notifier.
+     * To borrow it in this event notifier allows to register on the event
+     * loop and access the associated shadow virtqueue easily. If we use the
+     * VirtQueue, we don't have an easy way to retrieve it.
+     *
+     * So shadow virtqueue must not clean it, or we would lose VirtQueue one.
+     */
+    EventNotifier host_notifier;
+
+    /* Virtio queue shadowing */
+    VirtQueue *vq;
 } VhostShadowVirtqueue;
 
+/* Forward guest notifications */
+static void vhost_handle_guest_kick(EventNotifier *n)
+{
+    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
+                                             host_notifier);
+
+    if (unlikely(!event_notifier_test_and_clear(n))) {
+        return;
+    }
+
+    event_notifier_set(&svq->kick_notifier);
+}
+
+/*
+ * Restore the vhost guest to host notifier, i.e., disables svq effect.
+ */
+static int vhost_shadow_vq_restore_vdev_host_notifier(struct vhost_dev *dev,
+                                                     unsigned vhost_index,
+                                                     VhostShadowVirtqueue *svq)
+{
+    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
+    struct vhost_vring_file file = {
+        .index = vhost_index,
+        .fd = event_notifier_get_fd(vq_host_notifier),
+    };
+    int r;
+
+    /* Restore vhost kick */
+    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
+    return r ? -errno : 0;
+}
+
+/*
+ * Start shadow virtqueue operation.
+ * @dev vhost device
+ * @hidx vhost virtqueue index
+ * @svq Shadow Virtqueue
+ */
+bool vhost_shadow_vq_start(struct vhost_dev *dev,
+                           unsigned idx,
+                           VhostShadowVirtqueue *svq)
+{
+    EventNotifier *vq_host_notifier = virtio_queue_get_host_notifier(svq->vq);
+    struct vhost_vring_file file = {
+        .index = idx,
+        .fd = event_notifier_get_fd(&svq->kick_notifier),
+    };
+    int r;
+
+    /* Check that notifications are still going directly to vhost dev */
+    assert(virtio_queue_is_host_notifier_enabled(svq->vq));
+
+    /*
+     * event_notifier_set_handler already checks for guest's notifications if
+     * they arrive in the switch, so there is no need to explicitly check for
+     * them.
+     */
+    event_notifier_init_fd(&svq->host_notifier,
+                           event_notifier_get_fd(vq_host_notifier));
+    event_notifier_set_handler(&svq->host_notifier, vhost_handle_guest_kick);
+
+    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
+    if (unlikely(r != 0)) {
+        error_report("Couldn't set kick fd: %s", strerror(errno));
+        goto err_set_vring_kick;
+    }
+
+    return true;
+
+err_set_vring_kick:
+    event_notifier_set_handler(&svq->host_notifier, NULL);
+
+    return false;
+}
+
+/*
+ * Stop shadow virtqueue operation.
+ * @dev vhost device
+ * @idx vhost queue index
+ * @svq Shadow Virtqueue
+ */
+void vhost_shadow_vq_stop(struct vhost_dev *dev,
+                          unsigned idx,
+                          VhostShadowVirtqueue *svq)
+{
+    int r = vhost_shadow_vq_restore_vdev_host_notifier(dev, idx, svq);
+    if (unlikely(r < 0)) {
+        error_report("Couldn't restore vq kick fd: %s", strerror(-r));
+    }
+
+    event_notifier_set_handler(&svq->host_notifier, NULL);
+}
+
 /*
  * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
  * methods and file descriptors.
  */
 VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx)
 {
+    int vq_idx = dev->vq_index + idx;
     g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
     int r;
 
@@ -43,6 +153,7 @@  VhostShadowVirtqueue *vhost_shadow_vq_new(struct vhost_dev *dev, int idx)
         goto err_init_call_notifier;
     }
 
+    svq->vq = virtio_get_queue(dev->vdev, vq_idx);
     return g_steal_pointer(&svq);
 
 err_init_call_notifier:
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 97f1bcfa42..4858a35df6 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -25,6 +25,7 @@ 
 #include "exec/address-spaces.h"
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"
+#include "hw/virtio/vhost-shadow-virtqueue.h"
 #include "migration/blocker.h"
 #include "migration/qemu-file-types.h"
 #include "sysemu/dma.h"
@@ -1219,6 +1220,74 @@  static void vhost_virtqueue_stop(struct vhost_dev *dev,
                        0, virtio_queue_get_desc_size(vdev, idx));
 }
 
+static int vhost_sw_live_migration_stop(struct vhost_dev *dev)
+{
+    int idx;
+
+    dev->shadow_vqs_enabled = false;
+
+    for (idx = 0; idx < dev->nvqs; ++idx) {
+        vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[idx]);
+        vhost_shadow_vq_free(dev->shadow_vqs[idx]);
+    }
+
+    g_free(dev->shadow_vqs);
+    dev->shadow_vqs = NULL;
+    return 0;
+}
+
+static int vhost_sw_live_migration_start(struct vhost_dev *dev)
+{
+    int idx, stop_idx;
+
+    dev->shadow_vqs = g_new0(VhostShadowVirtqueue *, dev->nvqs);
+    for (idx = 0; idx < dev->nvqs; ++idx) {
+        dev->shadow_vqs[idx] = vhost_shadow_vq_new(dev, idx);
+        if (unlikely(dev->shadow_vqs[idx] == NULL)) {
+            goto err_new;
+        }
+    }
+
+    dev->shadow_vqs_enabled = true;
+    for (idx = 0; idx < dev->nvqs; ++idx) {
+        bool ok = vhost_shadow_vq_start(dev, idx, dev->shadow_vqs[idx]);
+        if (unlikely(!ok)) {
+            goto err_start;
+        }
+    }
+
+    return 0;
+
+err_start:
+    dev->shadow_vqs_enabled = false;
+    for (stop_idx = 0; stop_idx < idx; stop_idx++) {
+        vhost_shadow_vq_stop(dev, idx, dev->shadow_vqs[stop_idx]);
+    }
+
+err_new:
+    for (idx = 0; idx < dev->nvqs; ++idx) {
+        vhost_shadow_vq_free(dev->shadow_vqs[idx]);
+    }
+    g_free(dev->shadow_vqs);
+
+    return -1;
+}
+
+static int vhost_sw_live_migration_enable(struct vhost_dev *dev,
+                                          bool enable_lm)
+{
+    int r;
+
+    if (enable_lm == dev->shadow_vqs_enabled) {
+        return 0;
+    }
+
+    r = enable_lm ? vhost_sw_live_migration_start(dev)
+                  : vhost_sw_live_migration_stop(dev);
+
+    return r;
+}
+
 static void vhost_eventfd_add(MemoryListener *listener,
                               MemoryRegionSection *section,
                               bool match_data, uint64_t data, EventNotifier *e)
@@ -1381,6 +1450,7 @@  int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
     hdev->log = NULL;
     hdev->log_size = 0;
     hdev->log_enabled = false;
+    hdev->shadow_vqs_enabled = false;
     hdev->started = false;
     memory_listener_register(&hdev->memory_listener, &address_space_memory);
     QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
@@ -1484,6 +1554,10 @@  void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
     int i, r;
 
+    if (hdev->shadow_vqs_enabled) {
+        vhost_sw_live_migration_enable(hdev, false);
+    }
+
     for (i = 0; i < hdev->nvqs; ++i) {
         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
                                          false);
@@ -1798,6 +1872,7 @@  fail_features:
 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
 {
     int i;
+    bool is_shadow_vqs_enabled = hdev->shadow_vqs_enabled;
 
     /* should only be called after backend is connected */
     assert(hdev->vhost_ops);
@@ -1805,7 +1880,16 @@  void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
     if (hdev->vhost_ops->vhost_dev_start) {
         hdev->vhost_ops->vhost_dev_start(hdev, false);
     }
+    if (is_shadow_vqs_enabled) {
+        /* Shadow virtqueue will be stopped */
+        hdev->shadow_vqs_enabled = false;
+    }
     for (i = 0; i < hdev->nvqs; ++i) {
+        if (is_shadow_vqs_enabled) {
+            vhost_shadow_vq_stop(hdev, i, hdev->shadow_vqs[i]);
+            vhost_shadow_vq_free(hdev->shadow_vqs[i]);
+        }
+
         vhost_virtqueue_stop(hdev,
                              vdev,
                              hdev->vqs + i,
@@ -1819,6 +1903,8 @@  void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
         memory_listener_unregister(&hdev->iommu_listener);
     }
     vhost_log_put(hdev, true);
+    g_free(hdev->shadow_vqs);
+    hdev->shadow_vqs_enabled = false;
     hdev->started = false;
     hdev->vdev = NULL;
 }
@@ -1835,5 +1921,60 @@  int vhost_net_set_backend(struct vhost_dev *hdev,
 
 void qmp_x_vhost_enable_shadow_vq(const char *name, bool enable, Error **errp)
 {
-    error_setg(errp, "Shadow virtqueue still not implemented");
+    struct vhost_dev *hdev, *hdev_err;
+    VirtIODevice *vdev;
+    const char *err_cause = NULL;
+    int r;
+    ErrorClass err_class = ERROR_CLASS_GENERIC_ERROR;
+
+    QLIST_FOREACH(hdev, &vhost_devices, entry) {
+        if (hdev->vdev && 0 == strcmp(hdev->vdev->name, name)) {
+            vdev = hdev->vdev;
+            break;
+        }
+    }
+
+    if (!hdev) {
+        err_class = ERROR_CLASS_DEVICE_NOT_FOUND;
+        err_cause = "Device not found";
+        goto not_found_err;
+    }
+
+    for ( ; hdev; hdev = QLIST_NEXT(hdev, entry)) {
+        if (vdev != hdev->vdev) {
+            continue;
+        }
+
+        if (!hdev->started) {
+            err_cause = "Device is not started";
+            goto err;
+        }
+
+        r = vhost_sw_live_migration_enable(hdev, enable);
+        if (unlikely(r)) {
+            err_cause = "Error enabling (see monitor)";
+            goto err;
+        }
+    }
+
+    return;
+
+err:
+    QLIST_FOREACH(hdev_err, &vhost_devices, entry) {
+        if (hdev_err == hdev) {
+            break;
+        }
+
+        if (vdev != hdev->vdev) {
+            continue;
+        }
+
+        vhost_sw_live_migration_enable(hdev, !enable);
+    }
+
+not_found_err:
+    if (err_cause) {
+        error_set(errp, err_class,
+                  "Can't enable shadow vq on %s: %s", name, err_cause);
+    }
 }