diff mbox series

[17/31] vdpa: adapt vhost_ops callbacks to svq

Message ID 20220121202733.404989-18-eperezma@redhat.com
State New
Headers show
Series vDPA shadow virtqueue | expand

Commit Message

Eugenio Perez Martin Jan. 21, 2022, 8:27 p.m. UTC
First half of the buffers forwarding part, preparing vhost-vdpa
callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so
this is effectively dead code at the moment, but it helps to reduce
patch size.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
 hw/virtio/vhost-shadow-virtqueue.h |   2 +-
 hw/virtio/vhost-shadow-virtqueue.c |  21 ++++-
 hw/virtio/vhost-vdpa.c             | 133 ++++++++++++++++++++++++++---
 3 files changed, 143 insertions(+), 13 deletions(-)

Comments

Jason Wang Jan. 30, 2022, 4:03 a.m. UTC | #1
在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> First half of the buffers forwarding part, preparing vhost-vdpa
> callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so
> this is effectively dead code at the moment, but it helps to reduce
> patch size.
>
> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> ---
>   hw/virtio/vhost-shadow-virtqueue.h |   2 +-
>   hw/virtio/vhost-shadow-virtqueue.c |  21 ++++-
>   hw/virtio/vhost-vdpa.c             | 133 ++++++++++++++++++++++++++---
>   3 files changed, 143 insertions(+), 13 deletions(-)
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> index 035207a469..39aef5ffdf 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.h
> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> @@ -35,7 +35,7 @@ size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
>   
>   void vhost_svq_stop(VhostShadowVirtqueue *svq);
>   
> -VhostShadowVirtqueue *vhost_svq_new(void);
> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
>   
>   void vhost_svq_free(VhostShadowVirtqueue *vq);
>   
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index f129ec8395..7c168075d7 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -277,9 +277,17 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
>   /**
>    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
>    * methods and file descriptors.
> + *
> + * @qsize Shadow VirtQueue size
> + *
> + * Returns the new virtqueue or NULL.
> + *
> + * In case of error, reason is reported through error_report.
>    */
> -VhostShadowVirtqueue *vhost_svq_new(void)
> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
>   {
> +    size_t desc_size = sizeof(vring_desc_t) * qsize;
> +    size_t device_size, driver_size;
>       g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
>       int r;
>   
> @@ -300,6 +308,15 @@ VhostShadowVirtqueue *vhost_svq_new(void)
>       /* Placeholder descriptor, it should be deleted at set_kick_fd */
>       event_notifier_init_fd(&svq->svq_kick, INVALID_SVQ_KICK_FD);
>   
> +    svq->vring.num = qsize;


I wonder if this is the best. E.g. some hardware can support up to 32K
queue size. So this will probably end up with:

1) SVQ uses a 32K queue size
2) hardware queue uses 256

? Or SVQ can stick to 256, but will this cause trouble if we want
to add event index support?


> +    driver_size = vhost_svq_driver_area_size(svq);
> +    device_size = vhost_svq_device_area_size(svq);
> +    svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
> +    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> +    memset(svq->vring.desc, 0, driver_size);
> +    svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> +    memset(svq->vring.used, 0, device_size);
> +
>       event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
>       return g_steal_pointer(&svq);
>   
> @@ -318,5 +335,7 @@ void vhost_svq_free(VhostShadowVirtqueue *vq)
>       event_notifier_cleanup(&vq->hdev_kick);
>       event_notifier_set_handler(&vq->hdev_call, NULL);
>       event_notifier_cleanup(&vq->hdev_call);
> +    qemu_vfree(vq->vring.desc);
> +    qemu_vfree(vq->vring.used);
>       g_free(vq);
>   }
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 9d801cf907..53e14bafa0 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -641,20 +641,52 @@ static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
>       return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
>   }
>   
> -static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
> -                                      struct vhost_vring_state *ring)
> +static int vhost_vdpa_set_dev_vring_num(struct vhost_dev *dev,
> +                                        struct vhost_vring_state *ring)
>   {
>       trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
>       return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
>   }
>   
> -static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
> -                                       struct vhost_vring_state *ring)
> +static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
> +                                    struct vhost_vring_state *ring)
> +{
> +    struct vhost_vdpa *v = dev->opaque;
> +
> +    if (v->shadow_vqs_enabled) {
> +        /*
> +         * Vring num was set at device start. SVQ num is handled by VirtQueue
> +         * code
> +         */
> +        return 0;
> +    }
> +
> +    return vhost_vdpa_set_dev_vring_num(dev, ring);
> +}
> +
> +static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
> +                                         struct vhost_vring_state *ring)
>   {
>       trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
>       return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
>   }
>   
> +static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
> +                                     struct vhost_vring_state *ring)
> +{
> +    struct vhost_vdpa *v = dev->opaque;
> +
> +    if (v->shadow_vqs_enabled) {
> +        /*
> +         * Vring base was set at device start. SVQ base is handled by VirtQueue
> +         * code
> +         */
> +        return 0;
> +    }
> +
> +    return vhost_vdpa_set_dev_vring_base(dev, ring);
> +}
> +
>   static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
>                                          struct vhost_vring_state *ring)
>   {
> @@ -784,8 +816,8 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
>       }
>   }
>   
> -static int vhost_vdpa_get_features(struct vhost_dev *dev,
> -                                     uint64_t *features)
> +static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
> +                                       uint64_t *features)
>   {
>       int ret;
>   
> @@ -794,15 +826,64 @@ static int vhost_vdpa_get_features(struct vhost_dev *dev,
>       return ret;
>   }
>   
> +static int vhost_vdpa_get_features(struct vhost_dev *dev, uint64_t *features)
> +{
> +    struct vhost_vdpa *v = dev->opaque;
> +    int ret = vhost_vdpa_get_dev_features(dev, features);
> +
> +    if (ret == 0 && v->shadow_vqs_enabled) {
> +        /* Filter only features that SVQ can offer to guest */
> +        vhost_svq_valid_guest_features(features);
> +    }


Sorry if I've asked this before, but I think it's sufficient to filter out
the device features that we don't support during vhost initialization and
fail there. Any reason we need to do it again here?


> +
> +    return ret;
> +}
> +
>   static int vhost_vdpa_set_features(struct vhost_dev *dev,
>                                      uint64_t features)
>   {
> +    struct vhost_vdpa *v = dev->opaque;
>       int ret;
>   
>       if (vhost_vdpa_one_time_request(dev)) {
>           return 0;
>       }
>   
> +    if (v->shadow_vqs_enabled) {
> +        uint64_t dev_features, svq_features, acked_features;
> +        bool ok;
> +
> +        ret = vhost_vdpa_get_dev_features(dev, &dev_features);
> +        if (ret != 0) {
> +            error_report("Can't get vdpa device features, got (%d)", ret);
> +            return ret;
> +        }
> +
> +        svq_features = dev_features;
> +        ok = vhost_svq_valid_device_features(&svq_features);
> +        if (unlikely(!ok)) {
> +            error_report("SVQ Invalid device feature flags, offer: 0x%"
> +                         PRIx64", ok: 0x%"PRIx64, dev->features, svq_features);
> +            return -1;
> +        }
> +
> +        ok = vhost_svq_valid_guest_features(&features);
> +        if (unlikely(!ok)) {
> +            error_report(
> +                "Invalid guest acked feature flag, acked: 0x%"
> +                PRIx64", ok: 0x%"PRIx64, dev->acked_features, features);
> +            return -1;
> +        }
> +
> +        ok = vhost_svq_ack_guest_features(svq_features, features,
> +                                          &acked_features);
> +        if (unlikely(!ok)) {
> +            return -1;
> +        }
> +
> +        features = acked_features;
> +    }
> +
>       trace_vhost_vdpa_set_features(dev, features);
>       ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
>       if (ret) {
> @@ -822,13 +903,31 @@ static int vhost_vdpa_set_owner(struct vhost_dev *dev)
>       return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
>   }
>   
> -static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
> -                    struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
> +static void vhost_vdpa_vq_get_guest_addr(struct vhost_vring_addr *addr,
> +                                         struct vhost_virtqueue *vq)
>   {
> -    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
>       addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
>       addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
>       addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
> +}
> +
> +static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
> +                                  struct vhost_vring_addr *addr,
> +                                  struct vhost_virtqueue *vq)
> +{
> +    struct vhost_vdpa *v = dev->opaque;
> +
> +    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
> +
> +    if (v->shadow_vqs_enabled) {
> +        int idx = vhost_vdpa_get_vq_index(dev, addr->index);
> +        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, idx);
> +
> +        vhost_svq_get_vring_addr(svq, addr);
> +    } else {
> +        vhost_vdpa_vq_get_guest_addr(addr, vq);
> +    }
> +
>       trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
>                                    addr->avail_user_addr, addr->used_user_addr);
>       return 0;
> @@ -849,6 +948,12 @@ static void vhost_psvq_free(gpointer svq)
>       vhost_svq_free(svq);
>   }
>   
> +static int vhost_vdpa_get_max_queue_size(struct vhost_dev *dev,
> +                                         uint16_t *qsize)
> +{
> +    return vhost_vdpa_call(dev, VHOST_VDPA_GET_VRING_NUM, qsize);
> +}
> +
>   static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
>                                  Error **errp)
>   {
> @@ -857,6 +962,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
>                                                              vhost_psvq_free);
>       uint64_t dev_features;
>       uint64_t svq_features;
> +    uint16_t qsize;
>       int r;
>       bool ok;
>   
> @@ -864,7 +970,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
>           goto out;
>       }
>   
> -    r = vhost_vdpa_get_features(hdev, &dev_features);
> +    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
>       if (r != 0) {
>           error_setg(errp, "Can't get vdpa device features, got (%d)", r);
>           return r;
> @@ -879,9 +985,14 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
>           return -1;
>       }
>   
> +    r = vhost_vdpa_get_max_queue_size(hdev, &qsize);
> +    if (unlikely(r)) {
> +        qsize = 256;
> +    }


Should we fail instead of having a "default" value here?

Thanks


> +
>       shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_psvq_free);
>       for (unsigned n = 0; n < hdev->nvqs; ++n) {
> -        VhostShadowVirtqueue *svq = vhost_svq_new();
> +        VhostShadowVirtqueue *svq = vhost_svq_new(qsize);
>   
>           if (unlikely(!svq)) {
>               error_setg(errp, "Cannot create svq %u", n);
Eugenio Perez Martin Jan. 31, 2022, 6:58 p.m. UTC | #2
On Sun, Jan 30, 2022 at 5:03 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> > First half of the buffers forwarding part, preparing vhost-vdpa
> > callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so
> > this is effectively dead code at the moment, but it helps to reduce
> > patch size.
> >
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > ---
> >   hw/virtio/vhost-shadow-virtqueue.h |   2 +-
> >   hw/virtio/vhost-shadow-virtqueue.c |  21 ++++-
> >   hw/virtio/vhost-vdpa.c             | 133 ++++++++++++++++++++++++++---
> >   3 files changed, 143 insertions(+), 13 deletions(-)
> >
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > index 035207a469..39aef5ffdf 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > @@ -35,7 +35,7 @@ size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
> >
> >   void vhost_svq_stop(VhostShadowVirtqueue *svq);
> >
> > -VhostShadowVirtqueue *vhost_svq_new(void);
> > +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> >
> >   void vhost_svq_free(VhostShadowVirtqueue *vq);
> >
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > index f129ec8395..7c168075d7 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > @@ -277,9 +277,17 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> >   /**
> >    * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> >    * methods and file descriptors.
> > + *
> > + * @qsize Shadow VirtQueue size
> > + *
> > + * Returns the new virtqueue or NULL.
> > + *
> > + * In case of error, reason is reported through error_report.
> >    */
> > -VhostShadowVirtqueue *vhost_svq_new(void)
> > +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> >   {
> > +    size_t desc_size = sizeof(vring_desc_t) * qsize;
> > +    size_t device_size, driver_size;
> >       g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
> >       int r;
> >
> > @@ -300,6 +308,15 @@ VhostShadowVirtqueue *vhost_svq_new(void)
> >       /* Placeholder descriptor, it should be deleted at set_kick_fd */
> >       event_notifier_init_fd(&svq->svq_kick, INVALID_SVQ_KICK_FD);
> >
> > +    svq->vring.num = qsize;
>
>
> I wonder if this is the best. E.g some hardware can support up to 32K
> queue size. So this will probably end up with:
>
> 1) SVQ use 32K queue size
> 2) hardware queue uses 256
>

In that case SVQ vring queue size will be 32K and guest's vring can
negotiate any number with SVQ equal or less than 32K, including 256.
Is that what you mean?

If by hardware queues you mean the guest's vring, I'm not sure why it is
"probably 256". I'd say that in that case, with the virtio-net kernel
driver, the ring size will be the same as the one the device exports,
for example, won't it?

The implementation should support any combination of sizes, but the
ring size exposed to the guest is never bigger than hardware one.

> ? Or we SVQ can stick to 256 but this will this cause trouble if we want
> to add event index support?
>

I think we should not have any problem with event idx. If you mean
that the guest could mark more buffers available than the SVQ vring's
size, that should not happen, because there must be fewer entries in the
guest's vring than in the SVQ.

But if I understood you correctly, a similar situation could happen if
a guest's contiguous buffer is scattered across many of qemu's VA chunks.
Even if that happened, the situation should be ok too: SVQ knows
the guest's avail idx and, if SVQ is full, it will continue forwarding
avail buffers when the device uses more buffers.

Does that make sense to you?

>
> > +    driver_size = vhost_svq_driver_area_size(svq);
> > +    device_size = vhost_svq_device_area_size(svq);
> > +    svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
> > +    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
> > +    memset(svq->vring.desc, 0, driver_size);
> > +    svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> > +    memset(svq->vring.used, 0, device_size);
> > +
> >       event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
> >       return g_steal_pointer(&svq);
> >
> > @@ -318,5 +335,7 @@ void vhost_svq_free(VhostShadowVirtqueue *vq)
> >       event_notifier_cleanup(&vq->hdev_kick);
> >       event_notifier_set_handler(&vq->hdev_call, NULL);
> >       event_notifier_cleanup(&vq->hdev_call);
> > +    qemu_vfree(vq->vring.desc);
> > +    qemu_vfree(vq->vring.used);
> >       g_free(vq);
> >   }
> > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > index 9d801cf907..53e14bafa0 100644
> > --- a/hw/virtio/vhost-vdpa.c
> > +++ b/hw/virtio/vhost-vdpa.c
> > @@ -641,20 +641,52 @@ static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
> >       return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
> >   }
> >
> > -static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
> > -                                      struct vhost_vring_state *ring)
> > +static int vhost_vdpa_set_dev_vring_num(struct vhost_dev *dev,
> > +                                        struct vhost_vring_state *ring)
> >   {
> >       trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
> >       return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
> >   }
> >
> > -static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
> > -                                       struct vhost_vring_state *ring)
> > +static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
> > +                                    struct vhost_vring_state *ring)
> > +{
> > +    struct vhost_vdpa *v = dev->opaque;
> > +
> > +    if (v->shadow_vqs_enabled) {
> > +        /*
> > +         * Vring num was set at device start. SVQ num is handled by VirtQueue
> > +         * code
> > +         */
> > +        return 0;
> > +    }
> > +
> > +    return vhost_vdpa_set_dev_vring_num(dev, ring);
> > +}
> > +
> > +static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
> > +                                         struct vhost_vring_state *ring)
> >   {
> >       trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
> >       return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
> >   }
> >
> > +static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
> > +                                     struct vhost_vring_state *ring)
> > +{
> > +    struct vhost_vdpa *v = dev->opaque;
> > +
> > +    if (v->shadow_vqs_enabled) {
> > +        /*
> > +         * Vring base was set at device start. SVQ base is handled by VirtQueue
> > +         * code
> > +         */
> > +        return 0;
> > +    }
> > +
> > +    return vhost_vdpa_set_dev_vring_base(dev, ring);
> > +}
> > +
> >   static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
> >                                          struct vhost_vring_state *ring)
> >   {
> > @@ -784,8 +816,8 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
> >       }
> >   }
> >
> > -static int vhost_vdpa_get_features(struct vhost_dev *dev,
> > -                                     uint64_t *features)
> > +static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
> > +                                       uint64_t *features)
> >   {
> >       int ret;
> >
> > @@ -794,15 +826,64 @@ static int vhost_vdpa_get_features(struct vhost_dev *dev,
> >       return ret;
> >   }
> >
> > +static int vhost_vdpa_get_features(struct vhost_dev *dev, uint64_t *features)
> > +{
> > +    struct vhost_vdpa *v = dev->opaque;
> > +    int ret = vhost_vdpa_get_dev_features(dev, features);
> > +
> > +    if (ret == 0 && v->shadow_vqs_enabled) {
> > +        /* Filter only features that SVQ can offer to guest */
> > +        vhost_svq_valid_guest_features(features);
> > +    }
>
>
> Sorry if I've asked before, I think it's sufficient to filter out the
> device features that we don't support during and fail the vhost
> initialization. Any reason we need do it again here?
>

On the contrary: if something needs to be asked, that means it is not
clear enough :).

At initialization we validate that the device offers all the needed
features (ACCESS_PLATFORM, VERSION_1). We don't have the features
acknowledged by the guest at that point.

This is checking the features written by the guest. For example, we
accept _F_INDIRECT here, so the guest can write indirect descriptors
to SVQ, since qemu VirtQueue code is handling it for us. I've stayed
on the safe side and I've not included packed or event_idx, but I
might try to run with them. These would not have been accepted for the
device.

>
> > +
> > +    return ret;
> > +}
> > +
> >   static int vhost_vdpa_set_features(struct vhost_dev *dev,
> >                                      uint64_t features)
> >   {
> > +    struct vhost_vdpa *v = dev->opaque;
> >       int ret;
> >
> >       if (vhost_vdpa_one_time_request(dev)) {
> >           return 0;
> >       }
> >
> > +    if (v->shadow_vqs_enabled) {
> > +        uint64_t dev_features, svq_features, acked_features;
> > +        bool ok;
> > +
> > +        ret = vhost_vdpa_get_dev_features(dev, &dev_features);
> > +        if (ret != 0) {
> > +            error_report("Can't get vdpa device features, got (%d)", ret);
> > +            return ret;
> > +        }
> > +
> > +        svq_features = dev_features;
> > +        ok = vhost_svq_valid_device_features(&svq_features);
> > +        if (unlikely(!ok)) {
> > +            error_report("SVQ Invalid device feature flags, offer: 0x%"
> > +                         PRIx64", ok: 0x%"PRIx64, dev->features, svq_features);
> > +            return -1;
> > +        }
> > +
> > +        ok = vhost_svq_valid_guest_features(&features);
> > +        if (unlikely(!ok)) {
> > +            error_report(
> > +                "Invalid guest acked feature flag, acked: 0x%"
> > +                PRIx64", ok: 0x%"PRIx64, dev->acked_features, features);
> > +            return -1;
> > +        }
> > +
> > +        ok = vhost_svq_ack_guest_features(svq_features, features,
> > +                                          &acked_features);
> > +        if (unlikely(!ok)) {
> > +            return -1;
> > +        }
> > +
> > +        features = acked_features;
> > +    }
> > +
> >       trace_vhost_vdpa_set_features(dev, features);
> >       ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
> >       if (ret) {
> > @@ -822,13 +903,31 @@ static int vhost_vdpa_set_owner(struct vhost_dev *dev)
> >       return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
> >   }
> >
> > -static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
> > -                    struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
> > +static void vhost_vdpa_vq_get_guest_addr(struct vhost_vring_addr *addr,
> > +                                         struct vhost_virtqueue *vq)
> >   {
> > -    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
> >       addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
> >       addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
> >       addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
> > +}
> > +
> > +static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
> > +                                  struct vhost_vring_addr *addr,
> > +                                  struct vhost_virtqueue *vq)
> > +{
> > +    struct vhost_vdpa *v = dev->opaque;
> > +
> > +    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
> > +
> > +    if (v->shadow_vqs_enabled) {
> > +        int idx = vhost_vdpa_get_vq_index(dev, addr->index);
> > +        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, idx);
> > +
> > +        vhost_svq_get_vring_addr(svq, addr);
> > +    } else {
> > +        vhost_vdpa_vq_get_guest_addr(addr, vq);
> > +    }
> > +
> >       trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
> >                                    addr->avail_user_addr, addr->used_user_addr);
> >       return 0;
> > @@ -849,6 +948,12 @@ static void vhost_psvq_free(gpointer svq)
> >       vhost_svq_free(svq);
> >   }
> >
> > +static int vhost_vdpa_get_max_queue_size(struct vhost_dev *dev,
> > +                                         uint16_t *qsize)
> > +{
> > +    return vhost_vdpa_call(dev, VHOST_VDPA_GET_VRING_NUM, qsize);
> > +}
> > +
> >   static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
> >                                  Error **errp)
> >   {
> > @@ -857,6 +962,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
> >                                                              vhost_psvq_free);
> >       uint64_t dev_features;
> >       uint64_t svq_features;
> > +    uint16_t qsize;
> >       int r;
> >       bool ok;
> >
> > @@ -864,7 +970,7 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
> >           goto out;
> >       }
> >
> > -    r = vhost_vdpa_get_features(hdev, &dev_features);
> > +    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
> >       if (r != 0) {
> >           error_setg(errp, "Can't get vdpa device features, got (%d)", r);
> >           return r;
> > @@ -879,9 +985,14 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
> >           return -1;
> >       }
> >
> > +    r = vhost_vdpa_get_max_queue_size(hdev, &qsize);
> > +    if (unlikely(r)) {
> > +        qsize = 256;
> > +    }
>
>
> Should we fail instead of having a "default" value here?
>

Maybe it is better to fail here, yes. I guess there is no safe default value.

Thanks!

> Thanks
>
>
> > +
> >       shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_psvq_free);
> >       for (unsigned n = 0; n < hdev->nvqs; ++n) {
> > -        VhostShadowVirtqueue *svq = vhost_svq_new();
> > +        VhostShadowVirtqueue *svq = vhost_svq_new(qsize);
> >
> >           if (unlikely(!svq)) {
> >               error_setg(errp, "Cannot create svq %u", n);
>
Jason Wang Feb. 8, 2022, 3:57 a.m. UTC | #3
在 2022/2/1 上午2:58, Eugenio Perez Martin 写道:
> On Sun, Jan 30, 2022 at 5:03 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
>>> First half of the buffers forwarding part, preparing vhost-vdpa
>>> callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so
>>> this is effectively dead code at the moment, but it helps to reduce
>>> patch size.
>>>
>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
>>> ---
>>>    hw/virtio/vhost-shadow-virtqueue.h |   2 +-
>>>    hw/virtio/vhost-shadow-virtqueue.c |  21 ++++-
>>>    hw/virtio/vhost-vdpa.c             | 133 ++++++++++++++++++++++++++---
>>>    3 files changed, 143 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
>>> index 035207a469..39aef5ffdf 100644
>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
>>> @@ -35,7 +35,7 @@ size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
>>>
>>>    void vhost_svq_stop(VhostShadowVirtqueue *svq);
>>>
>>> -VhostShadowVirtqueue *vhost_svq_new(void);
>>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
>>>
>>>    void vhost_svq_free(VhostShadowVirtqueue *vq);
>>>
>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
>>> index f129ec8395..7c168075d7 100644
>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
>>> @@ -277,9 +277,17 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
>>>    /**
>>>     * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
>>>     * methods and file descriptors.
>>> + *
>>> + * @qsize Shadow VirtQueue size
>>> + *
>>> + * Returns the new virtqueue or NULL.
>>> + *
>>> + * In case of error, reason is reported through error_report.
>>>     */
>>> -VhostShadowVirtqueue *vhost_svq_new(void)
>>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
>>>    {
>>> +    size_t desc_size = sizeof(vring_desc_t) * qsize;
>>> +    size_t device_size, driver_size;
>>>        g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
>>>        int r;
>>>
>>> @@ -300,6 +308,15 @@ VhostShadowVirtqueue *vhost_svq_new(void)
>>>        /* Placeholder descriptor, it should be deleted at set_kick_fd */
>>>        event_notifier_init_fd(&svq->svq_kick, INVALID_SVQ_KICK_FD);
>>>
>>> +    svq->vring.num = qsize;
>>
>> I wonder if this is the best. E.g some hardware can support up to 32K
>> queue size. So this will probably end up with:
>>
>> 1) SVQ use 32K queue size
>> 2) hardware queue uses 256
>>
> In that case SVQ vring queue size will be 32K and guest's vring can
> negotiate any number with SVQ equal or less than 32K,


Sorry for being unclear. What I meant is actually:

1) SVQ uses 32K queue size

2) guest vq uses 256

This looks like a burden that needs extra logic and may hurt
performance.

And this can lead other interesting situation:

1) SVQ uses 256

2) guest vq uses 1024

Where a lot of more SVQ logic is needed.


> including 256.
> Is that what you mean?


I mean, it looks to me that the logic will be much simpler if we just
allocate the shadow virtqueue with the size the guest can see (the guest
vring size).

Then we don't need to think if the difference of the queue size can have 
any side effects.


>
> If with hardware queues you mean guest's vring, not sure why it is
> "probably 256". I'd say that in that case with the virtio-net kernel
> driver the ring size will be the same as the device export, for
> example, isn't it?
>
> The implementation should support any combination of sizes, but the
> ring size exposed to the guest is never bigger than hardware one.
>
>> ? Or we SVQ can stick to 256 but this will this cause trouble if we want
>> to add event index support?
>>
> I think we should not have any problem with event idx. If you mean
> that the guest could mark more buffers available than SVQ vring's
> size, that should not happen because there must be less entries in the
> guest than SVQ.
>
> But if I understood you correctly, a similar situation could happen if
> a guest's contiguous buffer is scattered across many qemu's VA chunks.
> Even if that would happen, the situation should be ok too: SVQ knows
> the guest's avail idx and, if SVQ is full, it will continue forwarding
> avail buffers when the device uses more buffers.
>
> Does that make sense to you?


Yes.

Thanks
Eugenio Perez Martin Feb. 17, 2022, 5:13 p.m. UTC | #4
On Tue, Feb 8, 2022 at 4:58 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2022/2/1 上午2:58, Eugenio Perez Martin 写道:
> > On Sun, Jan 30, 2022 at 5:03 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> >>> First half of the buffers forwarding part, preparing vhost-vdpa
> >>> callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so
> >>> this is effectively dead code at the moment, but it helps to reduce
> >>> patch size.
> >>>
> >>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> >>> ---
> >>>    hw/virtio/vhost-shadow-virtqueue.h |   2 +-
> >>>    hw/virtio/vhost-shadow-virtqueue.c |  21 ++++-
> >>>    hw/virtio/vhost-vdpa.c             | 133 ++++++++++++++++++++++++++---
> >>>    3 files changed, 143 insertions(+), 13 deletions(-)
> >>>
> >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> >>> index 035207a469..39aef5ffdf 100644
> >>> --- a/hw/virtio/vhost-shadow-virtqueue.h
> >>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> >>> @@ -35,7 +35,7 @@ size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
> >>>
> >>>    void vhost_svq_stop(VhostShadowVirtqueue *svq);
> >>>
> >>> -VhostShadowVirtqueue *vhost_svq_new(void);
> >>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> >>>
> >>>    void vhost_svq_free(VhostShadowVirtqueue *vq);
> >>>
> >>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> >>> index f129ec8395..7c168075d7 100644
> >>> --- a/hw/virtio/vhost-shadow-virtqueue.c
> >>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> >>> @@ -277,9 +277,17 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> >>>    /**
> >>>     * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> >>>     * methods and file descriptors.
> >>> + *
> >>> + * @qsize Shadow VirtQueue size
> >>> + *
> >>> + * Returns the new virtqueue or NULL.
> >>> + *
> >>> + * In case of error, reason is reported through error_report.
> >>>     */
> >>> -VhostShadowVirtqueue *vhost_svq_new(void)
> >>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> >>>    {
> >>> +    size_t desc_size = sizeof(vring_desc_t) * qsize;
> >>> +    size_t device_size, driver_size;
> >>>        g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
> >>>        int r;
> >>>
> >>> @@ -300,6 +308,15 @@ VhostShadowVirtqueue *vhost_svq_new(void)
> >>>        /* Placeholder descriptor, it should be deleted at set_kick_fd */
> >>>        event_notifier_init_fd(&svq->svq_kick, INVALID_SVQ_KICK_FD);
> >>>
> >>> +    svq->vring.num = qsize;
> >>
> >> I wonder if this is the best. E.g some hardware can support up to 32K
> >> queue size. So this will probably end up with:
> >>
> >> 1) SVQ use 32K queue size
> >> 2) hardware queue uses 256
> >>
> > In that case SVQ vring queue size will be 32K and guest's vring can
> > negotiate any number with SVQ equal or less than 32K,
>
>
> Sorry for being unclear what I meant is actually
>
> 1) SVQ uses 32K queue size
>
> 2) guest vq uses 256
>
> This looks like a burden that needs extra logic and may damage the
> performance.
>

Still not getting this point.

An available guest buffer, although contiguous in GPA/GVA, can expand
into multiple buffers if it's not contiguous in qemu's VA (by the while
loop in virtqueue_map_desc [1]). In that scenario it is better to have
"plenty" of SVQ buffers.

I'm ok if we decide to put an upper limit though, or if we decide not
to handle this situation. But we would leave out valid virtio drivers.
Maybe to set a fixed upper limit (1024?)? To add another parameter
(x-svq-size-n=N)?

If you mean we lose performance because memory gets more sparse I
think the only possibility is to limit that way.

> And this can lead other interesting situation:
>
> 1) SVQ uses 256
>
> 2) guest vq uses 1024
>
> Where a lot of more SVQ logic is needed.
>

If we agree that a guest descriptor can expand into multiple SVQ
descriptors, this should already be handled by the previous logic too.

But this should only happen in case that qemu is launched with a "bad"
cmdline, isn't it?

If I run that example with vp_vdpa, L0 qemu will happily accept 1024
as a queue size [2]. But if the vdpa device maximum queue size is
effectively 256, this will result in an error: We're not exposing it
to the guest at any moment but with qemu's cmdline.

>
> > including 256.
> > Is that what you mean?
>
>
> I mean, it looks to me the logic will be much more simplified if we just
> allocate the shadow virtqueue with the size what guest can see (guest
> vring).
>
> Then we don't need to think if the difference of the queue size can have
> any side effects.
>

I think that we cannot avoid that extra logic unless we force GPA to
be contiguous in IOVA. If we are sure the guest's buffers cannot be at
more than one descriptor in SVQ, then yes, we can simplify things. If
not, I think we are forced to carry all of it.

But if we prove it I'm not opposed to simplifying things and making
head at SVQ == head at guest.

Thanks!

[1] https://gitlab.com/qemu-project/qemu/-/blob/17e31340/hw/virtio/virtio.c#L1297
[2] But that's not the whole story: I've been running limited in tx
descriptors because of virtio_net_max_tx_queue_size, which predates
vdpa. I'll send a patch to also un-limit it.

>
> >
> > If with hardware queues you mean guest's vring, not sure why it is
> > "probably 256". I'd say that in that case with the virtio-net kernel
> > driver the ring size will be the same as the device export, for
> > example, isn't it?
> >
> > The implementation should support any combination of sizes, but the
> > ring size exposed to the guest is never bigger than hardware one.
> >
> >> ? Or we SVQ can stick to 256 but this will this cause trouble if we want
> >> to add event index support?
> >>
> > I think we should not have any problem with event idx. If you mean
> > that the guest could mark more buffers available than SVQ vring's
> > size, that should not happen because there must be less entries in the
> > guest than SVQ.
> >
> > But if I understood you correctly, a similar situation could happen if
> > a guest's contiguous buffer is scattered across many qemu's VA chunks.
> > Even if that would happen, the situation should be ok too: SVQ knows
> > the guest's avail idx and, if SVQ is full, it will continue forwarding
> > avail buffers when the device uses more buffers.
> >
> > Does that make sense to you?
>
>
> Yes.
>
> Thanks
>
Jason Wang Feb. 21, 2022, 7:15 a.m. UTC | #5
在 2022/2/18 上午1:13, Eugenio Perez Martin 写道:
> On Tue, Feb 8, 2022 at 4:58 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2022/2/1 上午2:58, Eugenio Perez Martin 写道:
>>> On Sun, Jan 30, 2022 at 5:03 AM Jason Wang <jasowang@redhat.com> wrote:
>>>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
>>>>> First half of the buffers forwarding part, preparing vhost-vdpa
>>>>> callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so
>>>>> this is effectively dead code at the moment, but it helps to reduce
>>>>> patch size.
>>>>>
>>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
>>>>> ---
>>>>>     hw/virtio/vhost-shadow-virtqueue.h |   2 +-
>>>>>     hw/virtio/vhost-shadow-virtqueue.c |  21 ++++-
>>>>>     hw/virtio/vhost-vdpa.c             | 133 ++++++++++++++++++++++++++---
>>>>>     3 files changed, 143 insertions(+), 13 deletions(-)
>>>>>
>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
>>>>> index 035207a469..39aef5ffdf 100644
>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
>>>>> @@ -35,7 +35,7 @@ size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
>>>>>
>>>>>     void vhost_svq_stop(VhostShadowVirtqueue *svq);
>>>>>
>>>>> -VhostShadowVirtqueue *vhost_svq_new(void);
>>>>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
>>>>>
>>>>>     void vhost_svq_free(VhostShadowVirtqueue *vq);
>>>>>
>>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
>>>>> index f129ec8395..7c168075d7 100644
>>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
>>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
>>>>> @@ -277,9 +277,17 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
>>>>>     /**
>>>>>      * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
>>>>>      * methods and file descriptors.
>>>>> + *
>>>>> + * @qsize Shadow VirtQueue size
>>>>> + *
>>>>> + * Returns the new virtqueue or NULL.
>>>>> + *
>>>>> + * In case of error, reason is reported through error_report.
>>>>>      */
>>>>> -VhostShadowVirtqueue *vhost_svq_new(void)
>>>>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
>>>>>     {
>>>>> +    size_t desc_size = sizeof(vring_desc_t) * qsize;
>>>>> +    size_t device_size, driver_size;
>>>>>         g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
>>>>>         int r;
>>>>>
>>>>> @@ -300,6 +308,15 @@ VhostShadowVirtqueue *vhost_svq_new(void)
>>>>>         /* Placeholder descriptor, it should be deleted at set_kick_fd */
>>>>>         event_notifier_init_fd(&svq->svq_kick, INVALID_SVQ_KICK_FD);
>>>>>
>>>>> +    svq->vring.num = qsize;
>>>> I wonder if this is the best. E.g some hardware can support up to 32K
>>>> queue size. So this will probably end up with:
>>>>
>>>> 1) SVQ use 32K queue size
>>>> 2) hardware queue uses 256
>>>>
>>> In that case SVQ vring queue size will be 32K and guest's vring can
>>> negotiate any number with SVQ equal or less than 32K,
>>
>> Sorry for being unclear what I meant is actually
>>
>> 1) SVQ uses 32K queue size
>>
>> 2) guest vq uses 256
>>
>> This looks like a burden that needs extra logic and may damage the
>> performance.
>>
> Still not getting this point.
>
> An available guest buffer, although contiguous in GPA/GVA, can expand
> in multiple buffers if it's not contiguous in qemu's VA (by the while
> loop in virtqueue_map_desc [1]). In that scenario it is better to have
> "plenty" of SVQ buffers.


Yes, but this case should be rare. So in this case we should deal with 
overrun on SVQ, that is

1) SVQ is full
2) guest VQ isn't

We need to

1) check the available buffer slots
2) disable guest kick and wait for the used buffers

But it looks to me the current code is not ready for dealing with this case?


>
> I'm ok if we decide to put an upper limit though, or if we decide not
> to handle this situation. But we would leave out valid virtio drivers.
> Maybe to set a fixed upper limit (1024?)? To add another parameter
> (x-svq-size-n=N)?
>
> If you mean we lose performance because memory gets more sparse I
> think the only possibility is to limit that way.


If the guest is not using 32K, having a 32K svq may give extra stress
on the cache since we will end up with a pretty large working set.


>
>> And this can lead other interesting situation:
>>
>> 1) SVQ uses 256
>>
>> 2) guest vq uses 1024
>>
>> Where a lot of more SVQ logic is needed.
>>
> If we agree that a guest descriptor can expand in multiple SVQ
> descriptors, this should be already handled by the previous logic too.
>
> But this should only happen in case that qemu is launched with a "bad"
> cmdline, isn't it?


It seems this can happen when we use -device
virtio-net-pci,tx_queue_size=1024 with a 256-size vp_vdpa device, at least?


>
> If I run that example with vp_vdpa, L0 qemu will happily accept 1024
> as a queue size [2]. But if the vdpa device maximum queue size is
> effectively 256, this will result in an error: We're not exposing it
> to the guest at any moment but with qemu's cmdline.
>
>>> including 256.
>>> Is that what you mean?
>>
>> I mean, it looks to me the logic will be much more simplified if we just
>> allocate the shadow virtqueue with the size what guest can see (guest
>> vring).
>>
>> Then we don't need to think if the difference of the queue size can have
>> any side effects.
>>
> I think that we cannot avoid that extra logic unless we force GPA to
> be contiguous in IOVA. If we are sure the guest's buffers cannot be at
> more than one descriptor in SVQ, then yes, we can simplify things. If
> not, I think we are forced to carry all of it.


Yes, I agree, the code should be robust to handle any case.

Thanks


>
> But if we prove it I'm not opposed to simplifying things and making
> head at SVQ == head at guest.
>
> Thanks!
>
> [1] https://gitlab.com/qemu-project/qemu/-/blob/17e31340/hw/virtio/virtio.c#L1297
> [2] But that's not the whole story: I've been running limited in tx
> descriptors because of virtio_net_max_tx_queue_size, which predates
> vdpa. I'll send a patch to also un-limit it.
>
>>> If with hardware queues you mean guest's vring, not sure why it is
>>> "probably 256". I'd say that in that case with the virtio-net kernel
>>> driver the ring size will be the same as the device export, for
>>> example, isn't it?
>>>
>>> The implementation should support any combination of sizes, but the
>>> ring size exposed to the guest is never bigger than hardware one.
>>>
>>>> ? Or we SVQ can stick to 256 but this will this cause trouble if we want
>>>> to add event index support?
>>>>
>>> I think we should not have any problem with event idx. If you mean
>>> that the guest could mark more buffers available than SVQ vring's
>>> size, that should not happen because there must be less entries in the
>>> guest than SVQ.
>>>
>>> But if I understood you correctly, a similar situation could happen if
>>> a guest's contiguous buffer is scattered across many qemu's VA chunks.
>>> Even if that would happen, the situation should be ok too: SVQ knows
>>> the guest's avail idx and, if SVQ is full, it will continue forwarding
>>> avail buffers when the device uses more buffers.
>>>
>>> Does that make sense to you?
>>
>> Yes.
>>
>> Thanks
>>
Eugenio Perez Martin Feb. 21, 2022, 5:22 p.m. UTC | #6
On Mon, Feb 21, 2022 at 8:15 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2022/2/18 上午1:13, Eugenio Perez Martin 写道:
> > On Tue, Feb 8, 2022 at 4:58 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> 在 2022/2/1 上午2:58, Eugenio Perez Martin 写道:
> >>> On Sun, Jan 30, 2022 at 5:03 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> >>>>> First half of the buffers forwarding part, preparing vhost-vdpa
> >>>>> callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so
> >>>>> this is effectively dead code at the moment, but it helps to reduce
> >>>>> patch size.
> >>>>>
> >>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> >>>>> ---
> >>>>>     hw/virtio/vhost-shadow-virtqueue.h |   2 +-
> >>>>>     hw/virtio/vhost-shadow-virtqueue.c |  21 ++++-
> >>>>>     hw/virtio/vhost-vdpa.c             | 133 ++++++++++++++++++++++++++---
> >>>>>     3 files changed, 143 insertions(+), 13 deletions(-)
> >>>>>
> >>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> >>>>> index 035207a469..39aef5ffdf 100644
> >>>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
> >>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> >>>>> @@ -35,7 +35,7 @@ size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
> >>>>>
> >>>>>     void vhost_svq_stop(VhostShadowVirtqueue *svq);
> >>>>>
> >>>>> -VhostShadowVirtqueue *vhost_svq_new(void);
> >>>>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> >>>>>
> >>>>>     void vhost_svq_free(VhostShadowVirtqueue *vq);
> >>>>>
> >>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> >>>>> index f129ec8395..7c168075d7 100644
> >>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
> >>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> >>>>> @@ -277,9 +277,17 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> >>>>>     /**
> >>>>>      * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> >>>>>      * methods and file descriptors.
> >>>>> + *
> >>>>> + * @qsize Shadow VirtQueue size
> >>>>> + *
> >>>>> + * Returns the new virtqueue or NULL.
> >>>>> + *
> >>>>> + * In case of error, reason is reported through error_report.
> >>>>>      */
> >>>>> -VhostShadowVirtqueue *vhost_svq_new(void)
> >>>>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> >>>>>     {
> >>>>> +    size_t desc_size = sizeof(vring_desc_t) * qsize;
> >>>>> +    size_t device_size, driver_size;
> >>>>>         g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
> >>>>>         int r;
> >>>>>
> >>>>> @@ -300,6 +308,15 @@ VhostShadowVirtqueue *vhost_svq_new(void)
> >>>>>         /* Placeholder descriptor, it should be deleted at set_kick_fd */
> >>>>>         event_notifier_init_fd(&svq->svq_kick, INVALID_SVQ_KICK_FD);
> >>>>>
> >>>>> +    svq->vring.num = qsize;
> >>>> I wonder if this is the best. E.g some hardware can support up to 32K
> >>>> queue size. So this will probably end up with:
> >>>>
> >>>> 1) SVQ use 32K queue size
> >>>> 2) hardware queue uses 256
> >>>>
> >>> In that case SVQ vring queue size will be 32K and guest's vring can
> >>> negotiate any number with SVQ equal or less than 32K,
> >>
> >> Sorry for being unclear what I meant is actually
> >>
> >> 1) SVQ uses 32K queue size
> >>
> >> 2) guest vq uses 256
> >>
> >> This looks like a burden that needs extra logic and may damage the
> >> performance.
> >>
> > Still not getting this point.
> >
> > An available guest buffer, although contiguous in GPA/GVA, can expand
> > in multiple buffers if it's not contiguous in qemu's VA (by the while
> > loop in virtqueue_map_desc [1]). In that scenario it is better to have
> > "plenty" of SVQ buffers.
>
>
> Yes, but this case should be rare. So in this case we should deal with
> overrun on SVQ, that is
>
> 1) SVQ is full
> 2) guest VQ isn't
>
> We need to
>
> 1) check the available buffer slots
> 2) disable guest kick and wait for the used buffers
>
> But it looks to me the current code is not ready for dealing with this case?
>

Yes it deals, that's the meaning of svq->next_guest_avail_elem.

>
> >
> > I'm ok if we decide to put an upper limit though, or if we decide not
> > to handle this situation. But we would leave out valid virtio drivers.
> > Maybe to set a fixed upper limit (1024?)? To add another parameter
> > (x-svq-size-n=N)?
> >
> > If you mean we lose performance because memory gets more sparse I
> > think the only possibility is to limit that way.
>
>
> If guest is not using 32K, having a 32K for svq may gives extra stress
> on the cache since we will end up with a pretty large working set.
>

That might be true. My guess is that it should not matter, since SVQ
and the guest's vring will have the same numbers of scattered buffers
and the avail / used / packed ring will be consumed more or less
sequentially. But I haven't tested.

I think it's better to add an upper limit (either fixed or in the
qemu's backend's cmdline) later if we see that this is a problem.
Another solution now would be to get the number from the frontend
device cmdline instead of from the vdpa device. I'm ok with that, but
it doesn't delete the svq->next_guest_avail_elem processing, and it
comes with disadvantages in my opinion. More below.

>
> >
> >> And this can lead other interesting situation:
> >>
> >> 1) SVQ uses 256
> >>
> >> 2) guest vq uses 1024
> >>
> >> Where a lot of more SVQ logic is needed.
> >>
> > If we agree that a guest descriptor can expand in multiple SVQ
> > descriptors, this should be already handled by the previous logic too.
> >
> > But this should only happen in case that qemu is launched with a "bad"
> > cmdline, isn't it?
>
>
> This seems can happen when we use -device
> virtio-net-pci,tx_queue_size=1024 with a 256 size vp_vdpa device at least?
>

I'm going to use the rx queue here since it's more accurate, tx has
its own limit separately.

If we use rx_queue_size=256 in L0 and rx_queue_size=1024 in L1 with no
SVQ, L0 qemu will happily accept 1024 as size when L1 qemu writes that
value at vhost_virtqueue_start. I'm not sure what would happen with a
real device, my guess is that the device will fail somehow. That's
what I meant with a "bad cmdline", I should have been more specific.

If we add SVQ to the mix, the guest first negotiates the 1024 with the
qemu device model. After that, vhost.c will try to write 1024 too but
this is totally ignored by this patch's changes at
vhost_vdpa_set_vring_num. Finally, SVQ will set 256 as a ring size to
the device, since it's the read value from the device, leading to your
scenario. So SVQ effectively isolates both sides and makes possible
the communication, even with a device that does not support so many
descriptors.

But SVQ already handles this case: It's the same as if the buffers are
fragmented in HVA and queue size is equal at both sides. That's why I
think SVQ size should depend on the backend device's size, not
frontend cmdline.

Thanks!

>
> >
> > If I run that example with vp_vdpa, L0 qemu will happily accept 1024
> > as a queue size [2]. But if the vdpa device maximum queue size is
> > effectively 256, this will result in an error: We're not exposing it
> > to the guest at any moment but with qemu's cmdline.
> >
> >>> including 256.
> >>> Is that what you mean?
> >>
> >> I mean, it looks to me the logic will be much more simplified if we just
> >> allocate the shadow virtqueue with the size what guest can see (guest
> >> vring).
> >>
> >> Then we don't need to think if the difference of the queue size can have
> >> any side effects.
> >>
> > I think that we cannot avoid that extra logic unless we force GPA to
> > be contiguous in IOVA. If we are sure the guest's buffers cannot be at
> > more than one descriptor in SVQ, then yes, we can simplify things. If
> > not, I think we are forced to carry all of it.
>
>
> Yes, I agree, the code should be robust to handle any case.
>
> Thanks
>
>
> >
> > But if we prove it I'm not opposed to simplifying things and making
> > head at SVQ == head at guest.
> >
> > Thanks!
> >
> > [1] https://gitlab.com/qemu-project/qemu/-/blob/17e31340/hw/virtio/virtio.c#L1297
> > [2] But that's not the whole story: I've been running limited in tx
> > descriptors because of virtio_net_max_tx_queue_size, which predates
> > vdpa. I'll send a patch to also un-limit it.
> >
> >>> If with hardware queues you mean guest's vring, not sure why it is
> >>> "probably 256". I'd say that in that case with the virtio-net kernel
> >>> driver the ring size will be the same as the device export, for
> >>> example, isn't it?
> >>>
> >>> The implementation should support any combination of sizes, but the
> >>> ring size exposed to the guest is never bigger than hardware one.
> >>>
> >>>> ? Or we SVQ can stick to 256 but this will this cause trouble if we want
> >>>> to add event index support?
> >>>>
> >>> I think we should not have any problem with event idx. If you mean
> >>> that the guest could mark more buffers available than SVQ vring's
> >>> size, that should not happen because there must be less entries in the
> >>> guest than SVQ.
> >>>
> >>> But if I understood you correctly, a similar situation could happen if
> >>> a guest's contiguous buffer is scattered across many qemu's VA chunks.
> >>> Even if that would happen, the situation should be ok too: SVQ knows
> >>> the guest's avail idx and, if SVQ is full, it will continue forwarding
> >>> avail buffers when the device uses more buffers.
> >>>
> >>> Does that make sense to you?
> >>
> >> Yes.
> >>
> >> Thanks
> >>
>
Jason Wang Feb. 22, 2022, 3:16 a.m. UTC | #7
On Tue, Feb 22, 2022 at 1:23 AM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Mon, Feb 21, 2022 at 8:15 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> >
> > 在 2022/2/18 上午1:13, Eugenio Perez Martin 写道:
> > > On Tue, Feb 8, 2022 at 4:58 AM Jason Wang <jasowang@redhat.com> wrote:
> > >>
> > >> 在 2022/2/1 上午2:58, Eugenio Perez Martin 写道:
> > >>> On Sun, Jan 30, 2022 at 5:03 AM Jason Wang <jasowang@redhat.com> wrote:
> > >>>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> > >>>>> First half of the buffers forwarding part, preparing vhost-vdpa
> > >>>>> callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so
> > >>>>> this is effectively dead code at the moment, but it helps to reduce
> > >>>>> patch size.
> > >>>>>
> > >>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > >>>>> ---
> > >>>>>     hw/virtio/vhost-shadow-virtqueue.h |   2 +-
> > >>>>>     hw/virtio/vhost-shadow-virtqueue.c |  21 ++++-
> > >>>>>     hw/virtio/vhost-vdpa.c             | 133 ++++++++++++++++++++++++++---
> > >>>>>     3 files changed, 143 insertions(+), 13 deletions(-)
> > >>>>>
> > >>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > >>>>> index 035207a469..39aef5ffdf 100644
> > >>>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
> > >>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > >>>>> @@ -35,7 +35,7 @@ size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
> > >>>>>
> > >>>>>     void vhost_svq_stop(VhostShadowVirtqueue *svq);
> > >>>>>
> > >>>>> -VhostShadowVirtqueue *vhost_svq_new(void);
> > >>>>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> > >>>>>
> > >>>>>     void vhost_svq_free(VhostShadowVirtqueue *vq);
> > >>>>>
> > >>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > >>>>> index f129ec8395..7c168075d7 100644
> > >>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
> > >>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > >>>>> @@ -277,9 +277,17 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> > >>>>>     /**
> > >>>>>      * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> > >>>>>      * methods and file descriptors.
> > >>>>> + *
> > >>>>> + * @qsize Shadow VirtQueue size
> > >>>>> + *
> > >>>>> + * Returns the new virtqueue or NULL.
> > >>>>> + *
> > >>>>> + * In case of error, reason is reported through error_report.
> > >>>>>      */
> > >>>>> -VhostShadowVirtqueue *vhost_svq_new(void)
> > >>>>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> > >>>>>     {
> > >>>>> +    size_t desc_size = sizeof(vring_desc_t) * qsize;
> > >>>>> +    size_t device_size, driver_size;
> > >>>>>         g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
> > >>>>>         int r;
> > >>>>>
> > >>>>> @@ -300,6 +308,15 @@ VhostShadowVirtqueue *vhost_svq_new(void)
> > >>>>>         /* Placeholder descriptor, it should be deleted at set_kick_fd */
> > >>>>>         event_notifier_init_fd(&svq->svq_kick, INVALID_SVQ_KICK_FD);
> > >>>>>
> > >>>>> +    svq->vring.num = qsize;
> > >>>> I wonder if this is the best. E.g some hardware can support up to 32K
> > >>>> queue size. So this will probably end up with:
> > >>>>
> > >>>> 1) SVQ use 32K queue size
> > >>>> 2) hardware queue uses 256
> > >>>>
> > >>> In that case SVQ vring queue size will be 32K and guest's vring can
> > >>> negotiate any number with SVQ equal or less than 32K,
> > >>
> > >> Sorry for being unclear what I meant is actually
> > >>
> > >> 1) SVQ uses 32K queue size
> > >>
> > >> 2) guest vq uses 256
> > >>
> > >> This looks like a burden that needs extra logic and may damage the
> > >> performance.
> > >>
> > > Still not getting this point.
> > >
> > > An available guest buffer, although contiguous in GPA/GVA, can expand
> > > in multiple buffers if it's not contiguous in qemu's VA (by the while
> > > loop in virtqueue_map_desc [1]). In that scenario it is better to have
> > > "plenty" of SVQ buffers.
> >
> >
> > Yes, but this case should be rare. So in this case we should deal with
> > overrun on SVQ, that is
> >
> > 1) SVQ is full
> > 2) guest VQ isn't
> >
> > We need to
> >
> > 1) check the available buffer slots
> > 2) disable guest kick and wait for the used buffers
> >
> > But it looks to me the current code is not ready for dealing with this case?
> >
>
> Yes it deals, that's the meaning of svq->next_guest_avail_elem.

Oh right, I missed that.

>
> >
> > >
> > > I'm ok if we decide to put an upper limit though, or if we decide not
> > > to handle this situation. But we would leave out valid virtio drivers.
> > > Maybe to set a fixed upper limit (1024?)? To add another parameter
> > > (x-svq-size-n=N)?
> > >
> > > If you mean we lose performance because memory gets more sparse I
> > > think the only possibility is to limit that way.
> >
> >
> > If guest is not using 32K, having a 32K for svq may gives extra stress
> > on the cache since we will end up with a pretty large working set.
> >
>
> That might be true. My guess is that it should not matter, since SVQ
> and the guest's vring will have the same numbers of scattered buffers
> and the avail / used / packed ring will be consumed more or less
> sequentially. But I haven't tested.
>
> I think it's better to add an upper limit (either fixed or in the
> qemu's backend's cmdline) later if we see that this is a problem.

I'd suggest using the same size as what the guest saw.

> Another solution now would be to get the number from the frontend
> device cmdline instead of from the vdpa device. I'm ok with that, but
> it doesn't delete the svq->next_guest_avail_elem processing, and it
> comes with disadvantages in my opinion. More below.

Right, we should keep next_guest_avail_elem. Using the same queue size
is a balance between:

1) using next_guest_avail_elem (rare)
2) not giving too much stress on the cache

>
> >
> > >
> > >> And this can lead other interesting situation:
> > >>
> > >> 1) SVQ uses 256
> > >>
> > >> 2) guest vq uses 1024
> > >>
> > >> Where a lot of more SVQ logic is needed.
> > >>
> > > If we agree that a guest descriptor can expand in multiple SVQ
> > > descriptors, this should be already handled by the previous logic too.
> > >
> > > But this should only happen in case that qemu is launched with a "bad"
> > > cmdline, isn't it?
> >
> >
> > This seems can happen when we use -device
> > virtio-net-pci,tx_queue_size=1024 with a 256 size vp_vdpa device at least?
> >
>
> I'm going to use the rx queue here since it's more accurate, tx has
> its own limit separately.
>
> If we use rx_queue_size=256 in L0 and rx_queue_size=1024 in L1 with no
> SVQ, L0 qemu will happily accept 1024 as size

Interesting, looks like a bug (I guess it works since you enable vhost?):

Per virtio-spec:

"""
Queue Size. On reset, specifies the maximum queue size supported by
the device. This can be modified by the driver to reduce memory
requirements. A 0 means the queue is unavailable.
"""

We can't increase the queue_size from 256 to 1024 actually. (Only
decrease is allowed).

> when L1 qemu writes that
> value at vhost_virtqueue_start. I'm not sure what would happen with a
> real device, my guess is that the device will fail somehow. That's
> what I meant with a "bad cmdline", I should have been more specific.

I should say that it's something that is probably unrelated to this
series but needs to be addressed.

>
> If we add SVQ to the mix, the guest first negotiates the 1024 with the
> qemu device model. After that, vhost.c will try to write 1024 too but
> this is totally ignored by this patch's changes at
> vhost_vdpa_set_vring_num. Finally, SVQ will set 256 as a ring size to
> the device, since it's the read value from the device, leading to your
> scenario. So SVQ effectively isolates both sides and makes possible
> the communication, even with a device that does not support so many
> descriptors.
>
> But SVQ already handles this case: It's the same as if the buffers are
> fragmented in HVA and queue size is equal at both sides. That's why I
> think SVQ size should depend on the backend device's size, not
> frontend cmdline.

Right.

Thanks

>
> Thanks!
>
> >
> > >
> > > If I run that example with vp_vdpa, L0 qemu will happily accept 1024
> > > as a queue size [2]. But if the vdpa device maximum queue size is
> > > effectively 256, this will result in an error: We're not exposing it
> > > to the guest at any moment but with qemu's cmdline.
> > >
> > >>> including 256.
> > >>> Is that what you mean?
> > >>
> > >> I mean, it looks to me the logic will be much more simplified if we just
> > >> allocate the shadow virtqueue with the size what guest can see (guest
> > >> vring).
> > >>
> > >> Then we don't need to think if the difference of the queue size can have
> > >> any side effects.
> > >>
> > > I think that we cannot avoid that extra logic unless we force GPA to
> > > be contiguous in IOVA. If we are sure the guest's buffers cannot be at
> > > more than one descriptor in SVQ, then yes, we can simplify things. If
> > > not, I think we are forced to carry all of it.
> >
> >
> > Yes, I agree, the code should be robust to handle any case.
> >
> > Thanks
> >
> >
> > >
> > > But if we prove it I'm not opposed to simplifying things and making
> > > head at SVQ == head at guest.
> > >
> > > Thanks!
> > >
> > > [1] https://gitlab.com/qemu-project/qemu/-/blob/17e31340/hw/virtio/virtio.c#L1297
> > > [2] But that's not the whole story: I've been running limited in tx
> > > descriptors because of virtio_net_max_tx_queue_size, which predates
> > > vdpa. I'll send a patch to also un-limit it.
> > >
> > >>> If with hardware queues you mean guest's vring, not sure why it is
> > >>> "probably 256". I'd say that in that case with the virtio-net kernel
> > >>> driver the ring size will be the same as the device export, for
> > >>> example, isn't it?
> > >>>
> > >>> The implementation should support any combination of sizes, but the
> > >>> ring size exposed to the guest is never bigger than hardware one.
> > >>>
> > >>>> Or SVQ can stick to 256, but will this cause trouble if we want
> > >>>> to add event index support?
> > >>>>
> > >>> I think we should not have any problem with event idx. If you mean
> > >>> that the guest could mark more buffers available than SVQ vring's
> > >>> size, that should not happen because there must be less entries in the
> > >>> guest than SVQ.
> > >>>
> > >>> But if I understood you correctly, a similar situation could happen if
> > >>> a guest's contiguous buffer is scattered across many qemu's VA chunks.
> > >>> Even if that would happen, the situation should be ok too: SVQ knows
> > >>> the guest's avail idx and, if SVQ is full, it will continue forwarding
> > >>> avail buffers when the device uses more buffers.
> > >>>
> > >>> Does that make sense to you?
> > >>
> > >> Yes.
> > >>
> > >> Thanks
> > >>
> >
>
Eugenio Perez Martin Feb. 22, 2022, 7:42 a.m. UTC | #8
On Tue, Feb 22, 2022 at 4:16 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Tue, Feb 22, 2022 at 1:23 AM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
> >
> > On Mon, Feb 21, 2022 at 8:15 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > >
> > > 在 2022/2/18 上午1:13, Eugenio Perez Martin 写道:
> > > > On Tue, Feb 8, 2022 at 4:58 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >>
> > > >> 在 2022/2/1 上午2:58, Eugenio Perez Martin 写道:
> > > >>> On Sun, Jan 30, 2022 at 5:03 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >>>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> > > >>>>> First half of the buffers forwarding part, preparing vhost-vdpa
> > > >>>>> callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so
> > > >>>>> this is effectively dead code at the moment, but it helps to reduce
> > > >>>>> patch size.
> > > >>>>>
> > > >>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > >>>>> ---
> > > >>>>>     hw/virtio/vhost-shadow-virtqueue.h |   2 +-
> > > >>>>>     hw/virtio/vhost-shadow-virtqueue.c |  21 ++++-
> > > >>>>>     hw/virtio/vhost-vdpa.c             | 133 ++++++++++++++++++++++++++---
> > > >>>>>     3 files changed, 143 insertions(+), 13 deletions(-)
> > > >>>>>
> > > >>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > > >>>>> index 035207a469..39aef5ffdf 100644
> > > >>>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
> > > >>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > > >>>>> @@ -35,7 +35,7 @@ size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
> > > >>>>>
> > > >>>>>     void vhost_svq_stop(VhostShadowVirtqueue *svq);
> > > >>>>>
> > > >>>>> -VhostShadowVirtqueue *vhost_svq_new(void);
> > > >>>>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> > > >>>>>
> > > >>>>>     void vhost_svq_free(VhostShadowVirtqueue *vq);
> > > >>>>>
> > > >>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > > >>>>> index f129ec8395..7c168075d7 100644
> > > >>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > >>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > >>>>> @@ -277,9 +277,17 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> > > >>>>>     /**
> > > >>>>>      * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> > > >>>>>      * methods and file descriptors.
> > > >>>>> + *
> > > >>>>> + * @qsize Shadow VirtQueue size
> > > >>>>> + *
> > > >>>>> + * Returns the new virtqueue or NULL.
> > > >>>>> + *
> > > >>>>> + * In case of error, reason is reported through error_report.
> > > >>>>>      */
> > > >>>>> -VhostShadowVirtqueue *vhost_svq_new(void)
> > > >>>>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> > > >>>>>     {
> > > >>>>> +    size_t desc_size = sizeof(vring_desc_t) * qsize;
> > > >>>>> +    size_t device_size, driver_size;
> > > >>>>>         g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
> > > >>>>>         int r;
> > > >>>>>
> > > >>>>> @@ -300,6 +308,15 @@ VhostShadowVirtqueue *vhost_svq_new(void)
> > > >>>>>         /* Placeholder descriptor, it should be deleted at set_kick_fd */
> > > >>>>>         event_notifier_init_fd(&svq->svq_kick, INVALID_SVQ_KICK_FD);
> > > >>>>>
> > > >>>>> +    svq->vring.num = qsize;
> > > >>>> I wonder if this is the best. E.g some hardware can support up to 32K
> > > >>>> queue size. So this will probably end up with:
> > > >>>>
> > > >>>> 1) SVQ use 32K queue size
> > > >>>> 2) hardware queue uses 256
> > > >>>>
> > > >>> In that case SVQ vring queue size will be 32K and guest's vring can
> > > >>> negotiate any number with SVQ equal or less than 32K,
> > > >>
> > > >> Sorry for being unclear what I meant is actually
> > > >>
> > > >> 1) SVQ uses 32K queue size
> > > >>
> > > >> 2) guest vq uses 256
> > > >>
> > > >> This looks like a burden that needs extra logic and may damage the
> > > >> performance.
> > > >>
> > > > Still not getting this point.
> > > >
> > > > An available guest buffer, although contiguous in GPA/GVA, can expand
> > > > in multiple buffers if it's not contiguous in qemu's VA (by the while
> > > > loop in virtqueue_map_desc [1]). In that scenario it is better to have
> > > > "plenty" of SVQ buffers.
> > >
> > >
> > > Yes, but this case should be rare. So in this case we should deal with
> > > overrun on SVQ, that is
> > >
> > > 1) SVQ is full
> > > 2) guest VQ isn't
> > >
> > > We need to
> > >
> > > 1) check the available buffer slots
> > > 2) disable guest kick and wait for the used buffers
> > >
> > > But it looks to me the current code is not ready for dealing with this case?
> > >
> >
> > Yes it deals, that's the meaning of svq->next_guest_avail_elem.
>
> Oh right, I missed that.
>
> >
> > >
> > > >
> > > > I'm ok if we decide to put an upper limit though, or if we decide not
> > > > to handle this situation. But we would leave out valid virtio drivers.
> > > > Maybe to set a fixed upper limit (1024?)? To add another parameter
> > > > (x-svq-size-n=N)?
> > > >
> > > > If you mean we lose performance because memory gets more sparse I
> > > > think the only possibility is to limit that way.
> > >
> > >
> > > If guest is not using 32K, having a 32K for svq may give extra stress
> > > on the cache since we will end up with a pretty large working set.
> > >
> >
> > That might be true. My guess is that it should not matter, since SVQ
> > and the guest's vring will have the same numbers of scattered buffers
> > and the avail / used / packed ring will be consumed more or less
> > sequentially. But I haven't tested.
> >
> > I think it's better to add an upper limit (either fixed or in the
> > qemu's backend's cmdline) later if we see that this is a problem.
>
> I'd suggest using the same size as what the guest saw.
>
> > Another solution now would be to get the number from the frontend
> > device cmdline instead of from the vdpa device. I'm ok with that, but
> > it doesn't delete the svq->next_guest_avail_elem processing, and it
> > comes with disadvantages in my opinion. More below.
>
> Right, we should keep next_guest_avail_elem. Using the same queue size
> is a balance between:
>
> 1) using next_guest_avail_elem (rare)
> 2) not give too much stress on the cache
>

Ok I'll change the SVQ size for the frontend size then.

> >
> > >
> > > >
> > > >> And this can lead other interesting situation:
> > > >>
> > > >> 1) SVQ uses 256
> > > >>
> > > >> 2) guest vq uses 1024
> > > >>
> > > >> Where a lot more SVQ logic is needed.
> > > >>
> > > > If we agree that a guest descriptor can expand in multiple SVQ
> > > > descriptors, this should be already handled by the previous logic too.
> > > >
> > > > But this should only happen in case that qemu is launched with a "bad"
> > > > cmdline, isn't it?
> > >
> > >
> > > It seems this can happen when we use -device
> > > virtio-net-pci,tx_queue_size=1024 with a 256 size vp_vdpa device at least?
> > >
> >
> > I'm going to use the rx queue here since it's more accurate, tx has
> > its own limit separately.
> >
> > If we use rx_queue_size=256 in L0 and rx_queue_size=1024 in L1 with no
> > SVQ, L0 qemu will happily accept 1024 as size
>
> Interesting, looks like a bug (I guess it works since you enable vhost?):
>

No, emulated interfaces. More below.

> Per virtio-spec:
>
> """
> Queue Size. On reset, specifies the maximum queue size supported by
> the device. This can be modified by the driver to reduce memory
> requirements. A 0 means the queue is unavailable.
> """
>

Yes, but how should it fail? Drivers do not know how to check if the
value was invalid. DEVICE_NEEDS_RESET?

The L0 emulated device simply receives the write to pci and calls
virtio_queue_set_num. I can try to add to the check "num <
vdev->vq[n].vring.num_default", but there is no way to notify the
guest that setting the value failed.

> We can't increase the queue_size from 256 to 1024 actually. (Only
> decrease is allowed).
>
> > when L1 qemu writes that
> > value at vhost_virtqueue_start. I'm not sure what would happen with a
> > real device, my guess is that the device will fail somehow. That's
> > what I meant with a "bad cmdline", I should have been more specific.
>
> I should say that it's something that is probably unrelated to this
> series but needs to be addressed.
>

I agree, I can start developing the patches for sure.

> >
> > If we add SVQ to the mix, the guest first negotiates the 1024 with the
> > qemu device model. After that, vhost.c will try to write 1024 too but
> > this is totally ignored by this patch's changes at
> > vhost_vdpa_set_vring_num. Finally, SVQ will set 256 as a ring size to
> > the device, since it's the read value from the device, leading to your
> > scenario. So SVQ effectively isolates both sides and makes possible
> > the communication, even with a device that does not support so many
> > descriptors.
> >
> > But SVQ already handles this case: It's the same as if the buffers are
> > fragmented in HVA and queue size is equal at both sides. That's why I
> > think SVQ size should depend on the backend device's size, not
> > frontend cmdline.
>
> Right.
>
> Thanks
>
> >
> > Thanks!
> >
> > >
> > > >
> > > > If I run that example with vp_vdpa, L0 qemu will happily accept 1024
> > > > as a queue size [2]. But if the vdpa device maximum queue size is
> > > > effectively 256, this will result in an error: We're not exposing it
> > > > to the guest at any moment but with qemu's cmdline.
> > > >
> > > >>> including 256.
> > > >>> Is that what you mean?
> > > >>
> > > >> I mean, it looks to me the logic will be much more simplified if we just
> > > >> allocate the shadow virtqueue with the size what guest can see (guest
> > > >> vring).
> > > >>
> > > >> Then we don't need to think if the difference of the queue size can have
> > > >> any side effects.
> > > >>
> > > > I think that we cannot avoid that extra logic unless we force GPA to
> > > > be contiguous in IOVA. If we are sure the guest's buffers cannot be at
> > > > more than one descriptor in SVQ, then yes, we can simplify things. If
> > > > not, I think we are forced to carry all of it.
> > >
> > >
> > > Yes, I agree, the code should be robust to handle any case.
> > >
> > > Thanks
> > >
> > >
> > > >
> > > > But if we prove it I'm not opposed to simplifying things and making
> > > > head at SVQ == head at guest.
> > > >
> > > > Thanks!
> > > >
> > > > [1] https://gitlab.com/qemu-project/qemu/-/blob/17e31340/hw/virtio/virtio.c#L1297
> > > > [2] But that's not the whole story: I've been running limited in tx
> > > > descriptors because of virtio_net_max_tx_queue_size, which predates
> > > > vdpa. I'll send a patch to also un-limit it.
> > > >
> > > >>> If with hardware queues you mean guest's vring, not sure why it is
> > > >>> "probably 256". I'd say that in that case with the virtio-net kernel
> > > >>> driver the ring size will be the same as the device export, for
> > > >>> example, isn't it?
> > > >>>
> > > >>> The implementation should support any combination of sizes, but the
> > > >>> ring size exposed to the guest is never bigger than hardware one.
> > > >>>
> > > >>>> Or SVQ can stick to 256, but will this cause trouble if we want
> > > >>>> to add event index support?
> > > >>>>
> > > >>> I think we should not have any problem with event idx. If you mean
> > > >>> that the guest could mark more buffers available than SVQ vring's
> > > >>> size, that should not happen because there must be less entries in the
> > > >>> guest than SVQ.
> > > >>>
> > > >>> But if I understood you correctly, a similar situation could happen if
> > > >>> a guest's contiguous buffer is scattered across many qemu's VA chunks.
> > > >>> Even if that would happen, the situation should be ok too: SVQ knows
> > > >>> the guest's avail idx and, if SVQ is full, it will continue forwarding
> > > >>> avail buffers when the device uses more buffers.
> > > >>>
> > > >>> Does that make sense to you?
> > > >>
> > > >> Yes.
> > > >>
> > > >> Thanks
> > > >>
> > >
> >
>
Jason Wang Feb. 22, 2022, 7:59 a.m. UTC | #9
On Tue, Feb 22, 2022 at 3:43 PM Eugenio Perez Martin
<eperezma@redhat.com> wrote:
>
> On Tue, Feb 22, 2022 at 4:16 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Tue, Feb 22, 2022 at 1:23 AM Eugenio Perez Martin
> > <eperezma@redhat.com> wrote:
> > >
> > > On Mon, Feb 21, 2022 at 8:15 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > >
> > > > 在 2022/2/18 上午1:13, Eugenio Perez Martin 写道:
> > > > > On Tue, Feb 8, 2022 at 4:58 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >>
> > > > >> 在 2022/2/1 上午2:58, Eugenio Perez Martin 写道:
> > > > >>> On Sun, Jan 30, 2022 at 5:03 AM Jason Wang <jasowang@redhat.com> wrote:
> > > > >>>> 在 2022/1/22 上午4:27, Eugenio Pérez 写道:
> > > > >>>>> First half of the buffers forwarding part, preparing vhost-vdpa
> > > > >>>>> callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so
> > > > >>>>> this is effectively dead code at the moment, but it helps to reduce
> > > > >>>>> patch size.
> > > > >>>>>
> > > > >>>>> Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > > > >>>>> ---
> > > > >>>>>     hw/virtio/vhost-shadow-virtqueue.h |   2 +-
> > > > >>>>>     hw/virtio/vhost-shadow-virtqueue.c |  21 ++++-
> > > > >>>>>     hw/virtio/vhost-vdpa.c             | 133 ++++++++++++++++++++++++++---
> > > > >>>>>     3 files changed, 143 insertions(+), 13 deletions(-)
> > > > >>>>>
> > > > >>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > > > >>>>> index 035207a469..39aef5ffdf 100644
> > > > >>>>> --- a/hw/virtio/vhost-shadow-virtqueue.h
> > > > >>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > > > >>>>> @@ -35,7 +35,7 @@ size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
> > > > >>>>>
> > > > >>>>>     void vhost_svq_stop(VhostShadowVirtqueue *svq);
> > > > >>>>>
> > > > >>>>> -VhostShadowVirtqueue *vhost_svq_new(void);
> > > > >>>>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> > > > >>>>>
> > > > >>>>>     void vhost_svq_free(VhostShadowVirtqueue *vq);
> > > > >>>>>
> > > > >>>>> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > > > >>>>> index f129ec8395..7c168075d7 100644
> > > > >>>>> --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > > >>>>> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > > >>>>> @@ -277,9 +277,17 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> > > > >>>>>     /**
> > > > >>>>>      * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
> > > > >>>>>      * methods and file descriptors.
> > > > >>>>> + *
> > > > >>>>> + * @qsize Shadow VirtQueue size
> > > > >>>>> + *
> > > > >>>>> + * Returns the new virtqueue or NULL.
> > > > >>>>> + *
> > > > >>>>> + * In case of error, reason is reported through error_report.
> > > > >>>>>      */
> > > > >>>>> -VhostShadowVirtqueue *vhost_svq_new(void)
> > > > >>>>> +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> > > > >>>>>     {
> > > > >>>>> +    size_t desc_size = sizeof(vring_desc_t) * qsize;
> > > > >>>>> +    size_t device_size, driver_size;
> > > > >>>>>         g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
> > > > >>>>>         int r;
> > > > >>>>>
> > > > >>>>> @@ -300,6 +308,15 @@ VhostShadowVirtqueue *vhost_svq_new(void)
> > > > >>>>>         /* Placeholder descriptor, it should be deleted at set_kick_fd */
> > > > >>>>>         event_notifier_init_fd(&svq->svq_kick, INVALID_SVQ_KICK_FD);
> > > > >>>>>
> > > > >>>>> +    svq->vring.num = qsize;
> > > > >>>> I wonder if this is the best. E.g some hardware can support up to 32K
> > > > >>>> queue size. So this will probably end up with:
> > > > >>>>
> > > > >>>> 1) SVQ use 32K queue size
> > > > >>>> 2) hardware queue uses 256
> > > > >>>>
> > > > >>> In that case SVQ vring queue size will be 32K and guest's vring can
> > > > >>> negotiate any number with SVQ equal or less than 32K,
> > > > >>
> > > > >> Sorry for being unclear what I meant is actually
> > > > >>
> > > > >> 1) SVQ uses 32K queue size
> > > > >>
> > > > >> 2) guest vq uses 256
> > > > >>
> > > > >> This looks like a burden that needs extra logic and may damage the
> > > > >> performance.
> > > > >>
> > > > > Still not getting this point.
> > > > >
> > > > > An available guest buffer, although contiguous in GPA/GVA, can expand
> > > > > in multiple buffers if it's not contiguous in qemu's VA (by the while
> > > > > loop in virtqueue_map_desc [1]). In that scenario it is better to have
> > > > > "plenty" of SVQ buffers.
> > > >
> > > >
> > > > Yes, but this case should be rare. So in this case we should deal with
> > > > overrun on SVQ, that is
> > > >
> > > > 1) SVQ is full
> > > > 2) guest VQ isn't
> > > >
> > > > We need to
> > > >
> > > > 1) check the available buffer slots
> > > > 2) disable guest kick and wait for the used buffers
> > > >
> > > > But it looks to me the current code is not ready for dealing with this case?
> > > >
> > >
> > > Yes it deals, that's the meaning of svq->next_guest_avail_elem.
> >
> > Oh right, I missed that.
> >
> > >
> > > >
> > > > >
> > > > > I'm ok if we decide to put an upper limit though, or if we decide not
> > > > > to handle this situation. But we would leave out valid virtio drivers.
> > > > > Maybe to set a fixed upper limit (1024?)? To add another parameter
> > > > > (x-svq-size-n=N)?
> > > > >
> > > > > If you mean we lose performance because memory gets more sparse I
> > > > > think the only possibility is to limit that way.
> > > >
> > > >
> > > > If guest is not using 32K, having a 32K for svq may give extra stress
> > > > on the cache since we will end up with a pretty large working set.
> > > >
> > >
> > > That might be true. My guess is that it should not matter, since SVQ
> > > and the guest's vring will have the same numbers of scattered buffers
> > > and the avail / used / packed ring will be consumed more or less
> > > sequentially. But I haven't tested.
> > >
> > > I think it's better to add an upper limit (either fixed or in the
> > > qemu's backend's cmdline) later if we see that this is a problem.
> >
> > I'd suggest using the same size as what the guest saw.
> >
> > > Another solution now would be to get the number from the frontend
> > > device cmdline instead of from the vdpa device. I'm ok with that, but
> > > it doesn't delete the svq->next_guest_avail_elem processing, and it
> > > comes with disadvantages in my opinion. More below.
> >
> > Right, we should keep next_guest_avail_elem. Using the same queue size
> > is a balance between:
> >
> > 1) using next_guest_avail_elem (rare)
> > 2) not give too much stress on the cache
> >
>
> Ok I'll change the SVQ size for the frontend size then.
>
> > >
> > > >
> > > > >
> > > > >> And this can lead other interesting situation:
> > > > >>
> > > > >> 1) SVQ uses 256
> > > > >>
> > > > >> 2) guest vq uses 1024
> > > > >>
> > > > >> Where a lot more SVQ logic is needed.
> > > > >>
> > > > > If we agree that a guest descriptor can expand in multiple SVQ
> > > > > descriptors, this should be already handled by the previous logic too.
> > > > >
> > > > > But this should only happen in case that qemu is launched with a "bad"
> > > > > cmdline, isn't it?
> > > >
> > > >
> > > > It seems this can happen when we use -device
> > > > virtio-net-pci,tx_queue_size=1024 with a 256 size vp_vdpa device at least?
> > > >
> > >
> > > I'm going to use the rx queue here since it's more accurate, tx has
> > > its own limit separately.
> > >
> > > If we use rx_queue_size=256 in L0 and rx_queue_size=1024 in L1 with no
> > > SVQ, L0 qemu will happily accept 1024 as size
> >
> > Interesting, looks like a bug (I guess it works since you enable vhost?):
> >
>
> No, emulated interfaces. More below.
>
> > Per virtio-spec:
> >
> > """
> > Queue Size. On reset, specifies the maximum queue size supported by
> > the device. This can be modified by the driver to reduce memory
> > requirements. A 0 means the queue is unavailable.
> > """
> >
>
> Yes but how should it fail? Drivers do not know how to check if the
> value was invalid. DEVICE_NEEDS_RESET?

I think it can be detected by reading the value back to see if it matches.

Thanks

>
> The L0 emulated device simply receives the write to pci and calls
> virtio_queue_set_num. I can try to add to the check "num <
> vdev->vq[n].vring.num_default", but there is no way to notify the
> guest that setting the value failed.
>
> > We can't increase the queue_size from 256 to 1024 actually. (Only
> > decrease is allowed).
> >
> > > when L1 qemu writes that
> > > value at vhost_virtqueue_start. I'm not sure what would happen with a
> > > real device, my guess is that the device will fail somehow. That's
> > > what I meant with a "bad cmdline", I should have been more specific.
> >
> > I should say that it's something that is probably unrelated to this
> > series but needs to be addressed.
> >
>
> I agree, I can start developing the patches for sure.
>
> > >
> > > If we add SVQ to the mix, the guest first negotiates the 1024 with the
> > > qemu device model. After that, vhost.c will try to write 1024 too but
> > > this is totally ignored by this patch's changes at
> > > vhost_vdpa_set_vring_num. Finally, SVQ will set 256 as a ring size to
> > > the device, since it's the read value from the device, leading to your
> > > scenario. So SVQ effectively isolates both sides and makes possible
> > > the communication, even with a device that does not support so many
> > > descriptors.
> > >
> > > But SVQ already handles this case: It's the same as if the buffers are
> > > fragmented in HVA and queue size is equal at both sides. That's why I
> > > think SVQ size should depend on the backend device's size, not
> > > frontend cmdline.
> >
> > Right.
> >
> > Thanks
> >
> > >
> > > Thanks!
> > >
> > > >
> > > > >
> > > > > If I run that example with vp_vdpa, L0 qemu will happily accept 1024
> > > > > as a queue size [2]. But if the vdpa device maximum queue size is
> > > > > effectively 256, this will result in an error: We're not exposing it
> > > > > to the guest at any moment but with qemu's cmdline.
> > > > >
> > > > >>> including 256.
> > > > >>> Is that what you mean?
> > > > >>
> > > > >> I mean, it looks to me the logic will be much more simplified if we just
> > > > >> allocate the shadow virtqueue with the size what guest can see (guest
> > > > >> vring).
> > > > >>
> > > > >> Then we don't need to think if the difference of the queue size can have
> > > > >> any side effects.
> > > > >>
> > > > > I think that we cannot avoid that extra logic unless we force GPA to
> > > > > be contiguous in IOVA. If we are sure the guest's buffers cannot be at
> > > > > more than one descriptor in SVQ, then yes, we can simplify things. If
> > > > > not, I think we are forced to carry all of it.
> > > >
> > > >
> > > > Yes, I agree, the code should be robust to handle any case.
> > > >
> > > > Thanks
> > > >
> > > >
> > > > >
> > > > > But if we prove it I'm not opposed to simplifying things and making
> > > > > head at SVQ == head at guest.
> > > > >
> > > > > Thanks!
> > > > >
> > > > > [1] https://gitlab.com/qemu-project/qemu/-/blob/17e31340/hw/virtio/virtio.c#L1297
> > > > > [2] But that's not the whole story: I've been running limited in tx
> > > > > descriptors because of virtio_net_max_tx_queue_size, which predates
> > > > > vdpa. I'll send a patch to also un-limit it.
> > > > >
> > > > >>> If with hardware queues you mean guest's vring, not sure why it is
> > > > >>> "probably 256". I'd say that in that case with the virtio-net kernel
> > > > >>> driver the ring size will be the same as the device export, for
> > > > >>> example, isn't it?
> > > > >>>
> > > > >>> The implementation should support any combination of sizes, but the
> > > > >>> ring size exposed to the guest is never bigger than hardware one.
> > > > >>>
> > > > > >>>> Or SVQ can stick to 256, but will this cause trouble if we want
> > > > > >>>> to add event index support?
> > > > >>>>
> > > > >>> I think we should not have any problem with event idx. If you mean
> > > > >>> that the guest could mark more buffers available than SVQ vring's
> > > > >>> size, that should not happen because there must be less entries in the
> > > > >>> guest than SVQ.
> > > > >>>
> > > > >>> But if I understood you correctly, a similar situation could happen if
> > > > >>> a guest's contiguous buffer is scattered across many qemu's VA chunks.
> > > > >>> Even if that would happen, the situation should be ok too: SVQ knows
> > > > >>> the guest's avail idx and, if SVQ is full, it will continue forwarding
> > > > >>> avail buffers when the device uses more buffers.
> > > > >>>
> > > > >>> Does that make sense to you?
> > > > >>
> > > > >> Yes.
> > > > >>
> > > > >> Thanks
> > > > >>
> > > >
> > >
> >
>
diff mbox series

Patch

diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
index 035207a469..39aef5ffdf 100644
--- a/hw/virtio/vhost-shadow-virtqueue.h
+++ b/hw/virtio/vhost-shadow-virtqueue.h
@@ -35,7 +35,7 @@  size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
 
 void vhost_svq_stop(VhostShadowVirtqueue *svq);
 
-VhostShadowVirtqueue *vhost_svq_new(void);
+VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
 
 void vhost_svq_free(VhostShadowVirtqueue *vq);
 
diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
index f129ec8395..7c168075d7 100644
--- a/hw/virtio/vhost-shadow-virtqueue.c
+++ b/hw/virtio/vhost-shadow-virtqueue.c
@@ -277,9 +277,17 @@  void vhost_svq_stop(VhostShadowVirtqueue *svq)
 /**
  * Creates vhost shadow virtqueue, and instruct vhost device to use the shadow
  * methods and file descriptors.
+ *
+ * @qsize Shadow VirtQueue size
+ *
+ * Returns the new virtqueue or NULL.
+ *
+ * In case of error, reason is reported through error_report.
  */
-VhostShadowVirtqueue *vhost_svq_new(void)
+VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
 {
+    size_t desc_size = sizeof(vring_desc_t) * qsize;
+    size_t device_size, driver_size;
     g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
     int r;
 
@@ -300,6 +308,15 @@  VhostShadowVirtqueue *vhost_svq_new(void)
     /* Placeholder descriptor, it should be deleted at set_kick_fd */
     event_notifier_init_fd(&svq->svq_kick, INVALID_SVQ_KICK_FD);
 
+    svq->vring.num = qsize;
+    driver_size = vhost_svq_driver_area_size(svq);
+    device_size = vhost_svq_device_area_size(svq);
+    svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
+    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
+    memset(svq->vring.desc, 0, driver_size);
+    svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
+    memset(svq->vring.used, 0, device_size);
+
     event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
     return g_steal_pointer(&svq);
 
@@ -318,5 +335,7 @@  void vhost_svq_free(VhostShadowVirtqueue *vq)
     event_notifier_cleanup(&vq->hdev_kick);
     event_notifier_set_handler(&vq->hdev_call, NULL);
     event_notifier_cleanup(&vq->hdev_call);
+    qemu_vfree(vq->vring.desc);
+    qemu_vfree(vq->vring.used);
     g_free(vq);
 }
diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 9d801cf907..53e14bafa0 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -641,20 +641,52 @@  static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
     return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
 }
 
-static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
-                                      struct vhost_vring_state *ring)
+static int vhost_vdpa_set_dev_vring_num(struct vhost_dev *dev,
+                                        struct vhost_vring_state *ring)
 {
     trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
     return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
 }
 
-static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
-                                       struct vhost_vring_state *ring)
+static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
+                                    struct vhost_vring_state *ring)
+{
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (v->shadow_vqs_enabled) {
+        /*
+         * Vring num was set at device start. SVQ num is handled by VirtQueue
+         * code
+         */
+        return 0;
+    }
+
+    return vhost_vdpa_set_dev_vring_num(dev, ring);
+}
+
+static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
+                                         struct vhost_vring_state *ring)
 {
     trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
 }
 
+static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
+                                     struct vhost_vring_state *ring)
+{
+    struct vhost_vdpa *v = dev->opaque;
+
+    if (v->shadow_vqs_enabled) {
+        /*
+         * Vring base was set at device start. SVQ base is handled by VirtQueue
+         * code
+         */
+        return 0;
+    }
+
+    return vhost_vdpa_set_dev_vring_base(dev, ring);
+}
+
 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                        struct vhost_vring_state *ring)
 {
@@ -784,8 +816,8 @@  static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
     }
 }
 
-static int vhost_vdpa_get_features(struct vhost_dev *dev,
-                                     uint64_t *features)
+static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
+                                       uint64_t *features)
 {
     int ret;
 
@@ -794,15 +826,64 @@  static int vhost_vdpa_get_features(struct vhost_dev *dev,
     return ret;
 }
 
+static int vhost_vdpa_get_features(struct vhost_dev *dev, uint64_t *features)
+{
+    struct vhost_vdpa *v = dev->opaque;
+    int ret = vhost_vdpa_get_dev_features(dev, features);
+
+    if (ret == 0 && v->shadow_vqs_enabled) {
+        /* Filter only features that SVQ can offer to guest */
+        vhost_svq_valid_guest_features(features);
+    }
+
+    return ret;
+}
+
 static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                    uint64_t features)
 {
+    struct vhost_vdpa *v = dev->opaque;
     int ret;
 
     if (vhost_vdpa_one_time_request(dev)) {
         return 0;
     }
 
+    if (v->shadow_vqs_enabled) {
+        uint64_t dev_features, svq_features, acked_features;
+        bool ok;
+
+        ret = vhost_vdpa_get_dev_features(dev, &dev_features);
+        if (ret != 0) {
+            error_report("Can't get vdpa device features, got (%d)", ret);
+            return ret;
+        }
+
+        svq_features = dev_features;
+        ok = vhost_svq_valid_device_features(&svq_features);
+        if (unlikely(!ok)) {
+            error_report("SVQ Invalid device feature flags, offer: 0x%"
+                         PRIx64", ok: 0x%"PRIx64, dev->features, svq_features);
+            return -1;
+        }
+
+        ok = vhost_svq_valid_guest_features(&features);
+        if (unlikely(!ok)) {
+            error_report(
+                "Invalid guest acked feature flag, acked: 0x%"
+                PRIx64", ok: 0x%"PRIx64, dev->acked_features, features);
+            return -1;
+        }
+
+        ok = vhost_svq_ack_guest_features(svq_features, features,
+                                          &acked_features);
+        if (unlikely(!ok)) {
+            return -1;
+        }
+
+        features = acked_features;
+    }
+
     trace_vhost_vdpa_set_features(dev, features);
     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
     if (ret) {
@@ -822,13 +903,31 @@  static int vhost_vdpa_set_owner(struct vhost_dev *dev)
     return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
 }
 
-static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
-                    struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
+static void vhost_vdpa_vq_get_guest_addr(struct vhost_vring_addr *addr,
+                                         struct vhost_virtqueue *vq)
 {
-    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
     addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
     addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
     addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
+}
+
+static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
+                                  struct vhost_vring_addr *addr,
+                                  struct vhost_virtqueue *vq)
+{
+    struct vhost_vdpa *v = dev->opaque;
+
+    assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
+
+    if (v->shadow_vqs_enabled) {
+        int idx = vhost_vdpa_get_vq_index(dev, addr->index);
+        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, idx);
+
+        vhost_svq_get_vring_addr(svq, addr);
+    } else {
+        vhost_vdpa_vq_get_guest_addr(addr, vq);
+    }
+
     trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
                                  addr->avail_user_addr, addr->used_user_addr);
     return 0;
@@ -849,6 +948,12 @@  static void vhost_psvq_free(gpointer svq)
     vhost_svq_free(svq);
 }
 
+static int vhost_vdpa_get_max_queue_size(struct vhost_dev *dev,
+                                         uint16_t *qsize)
+{
+    return vhost_vdpa_call(dev, VHOST_VDPA_GET_VRING_NUM, qsize);
+}
+
 static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                                Error **errp)
 {
@@ -857,6 +962,7 @@  static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
                                                            vhost_psvq_free);
     uint64_t dev_features;
     uint64_t svq_features;
+    uint16_t qsize;
     int r;
     bool ok;
 
@@ -864,7 +970,7 @@  static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
         goto out;
     }
 
-    r = vhost_vdpa_get_features(hdev, &dev_features);
+    r = vhost_vdpa_get_dev_features(hdev, &dev_features);
     if (r != 0) {
         error_setg(errp, "Can't get vdpa device features, got (%d)", r);
         return r;
@@ -879,9 +985,14 @@  static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
         return -1;
     }
 
+    r = vhost_vdpa_get_max_queue_size(hdev, &qsize);
+    if (unlikely(r)) {
+        qsize = 256;
+    }
+
     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_psvq_free);
     for (unsigned n = 0; n < hdev->nvqs; ++n) {
-        VhostShadowVirtqueue *svq = vhost_svq_new();
+        VhostShadowVirtqueue *svq = vhost_svq_new(qsize);
 
         if (unlikely(!svq)) {
             error_setg(errp, "Cannot create svq %u", n);