[3/3] vhost: iommu: cache static mapping if there is

Message ID 1496404254-17429-4-git-send-email-peterx@redhat.com
State New
Headers show

Commit Message

Peter Xu June 2, 2017, 11:50 a.m. UTC
This patch pre-heats the vhost IOTLB cache when passthrough mode is enabled.

Sometimes, even if the user specified iommu_platform for vhost devices,
the IOMMU might still be disabled. One such case is passthrough mode in
the VT-d implementation. We can detect this by observing iommu_list: if
it is empty, IOMMU translation is disabled, and we can pre-heat the
translation (it will be a static mapping then) by first invalidating
all IOTLB entries, then caching the existing memory ranges into the
vhost backend IOTLB using a 1:1 mapping.

Reviewed-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 hw/virtio/trace-events |  4 +++
 hw/virtio/vhost.c      | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

Comments

Michael S. Tsirkin June 2, 2017, 3:45 p.m. UTC | #1
On Fri, Jun 02, 2017 at 07:50:54PM +0800, Peter Xu wrote:
> This patch pre-heats the vhost IOTLB cache when passthrough mode is enabled.
> 
> Sometimes, even if the user specified iommu_platform for vhost devices,
> the IOMMU might still be disabled. One such case is passthrough mode in
> the VT-d implementation. We can detect this by observing iommu_list: if
> it is empty, IOMMU translation is disabled, and we can pre-heat the
> translation (it will be a static mapping then) by first invalidating
> all IOTLB entries, then caching the existing memory ranges into the
> vhost backend IOTLB using a 1:1 mapping.
> 
> Reviewed-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>

This is still a hack, I think. What if there's an invalidation?
I think the right thing is to send updates only when requested, but to
send the largest mapping that includes the IOVA, not just from the
IOVA to the end of the page. Thoughts?
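
For reference, a rough sketch of this suggestion might look like the
following (vhost_send_region_for_iova() is a hypothetical name, not
existing QEMU code; the actual on-demand path is
vhost_device_iotlb_miss() in hw/virtio/vhost.c):

/*
 * Hypothetical sketch: on an IOTLB miss for @iova, push the whole
 * vhost memory region containing it, instead of a single translated
 * page. Assumes no vIOMMU translation is active, so GPA == IOVA.
 */
static int vhost_send_region_for_iova(struct vhost_dev *dev, uint64_t iova)
{
    int i;

    for (i = 0; i < dev->mem->nregions; i++) {
        struct vhost_memory_region *r = &dev->mem->regions[i];

        if (iova >= r->guest_phys_addr &&
            iova < r->guest_phys_addr + r->memory_size) {
            /* One update covers the whole region, not just one page. */
            return dev->vhost_ops->vhost_update_device_iotlb(
                    dev, r->guest_phys_addr, r->userspace_addr,
                    r->memory_size, IOMMU_RW);
        }
    }

    return -ENOENT; /* @iova is not backed by any vhost region */
}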

Michael S. Tsirkin June 2, 2017, 4:51 p.m. UTC | #2
On Fri, Jun 02, 2017 at 07:50:54PM +0800, Peter Xu wrote:
> This patch pre-heats the vhost IOTLB cache when passthrough mode is enabled.
> 
> Sometimes, even if the user specified iommu_platform for vhost devices,
> the IOMMU might still be disabled. One such case is passthrough mode in
> the VT-d implementation. We can detect this by observing iommu_list: if
> it is empty, IOMMU translation is disabled, and we can pre-heat the
> translation (it will be a static mapping then) by first invalidating
> all IOTLB entries, then caching the existing memory ranges into the
> vhost backend IOTLB using a 1:1 mapping.
> 
> Reviewed-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>

Can't say I like this; it does not help the more important use case of
DPDK, which has a small static mapping that can't be detected by QEMU.
vhost should just heat up whatever's actually used. Why isn't that
enough?
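
(For context: "heat up whatever's actually used" is the existing
on-demand path, where the backend reports an IOTLB miss, QEMU resolves
just that address, and pushes a single entry back. A condensed,
simplified sketch of that flow, based on vhost_device_iotlb_miss() in
hw/virtio/vhost.c, with error handling and length clamping elided:)

static int iotlb_miss_sketch(struct vhost_dev *dev, uint64_t iova, int write)
{
    IOMMUTLBEntry iotlb;
    uint64_t uaddr, len;

    /* Translate the faulting IOVA through the vIOMMU... */
    iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as, iova, write);
    if (!iotlb.target_as) {
        return -EFAULT;
    }

    /* ...map the translated GPA to a userspace address... */
    if (vhost_memory_region_lookup(dev, iotlb.translated_addr,
                                   &uaddr, &len)) {
        return -EFAULT;
    }

    /* ...and push exactly one entry back to the backend. */
    return dev->vhost_ops->vhost_update_device_iotlb(dev, iova, uaddr,
                                                     len, iotlb.perm);
}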

Peter Xu June 5, 2017, 3:15 a.m. UTC | #3
On Fri, Jun 02, 2017 at 06:45:05PM +0300, Michael S. Tsirkin wrote:
> On Fri, Jun 02, 2017 at 07:50:54PM +0800, Peter Xu wrote:
> > This patch pre-heats the vhost IOTLB cache when passthrough mode is enabled.
> > 
> > Sometimes, even if the user specified iommu_platform for vhost devices,
> > the IOMMU might still be disabled. One such case is passthrough mode in
> > the VT-d implementation. We can detect this by observing iommu_list: if
> > it is empty, IOMMU translation is disabled, and we can pre-heat the
> > translation (it will be a static mapping then) by first invalidating
> > all IOTLB entries, then caching the existing memory ranges into the
> > vhost backend IOTLB using a 1:1 mapping.
> > 
> > Reviewed-by: Jason Wang <jasowang@redhat.com>
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> 
> This is still a hack, I think. What if there's an invalidation?
> I think the right thing is to send updates only when requested, but to
> send the largest mapping that includes the IOVA, not just from the
> IOVA to the end of the page. Thoughts?

Indeed it's kind of a hack, but it does not hurt anything and will
definitely boost performance in most cases...

Yes, sending the largest mapping that includes the IOVA would be okay,
but the first IO on each region would still be delayed, so IMHO that's
not the best solution either. I think the best solution would be for
vhost to know it's in passthrough (PT) mode, so it can skip the
translation completely. I'm just not sure whether there's a simple/good
way to do this.

Thanks,
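
(A rough backend-side sketch of the fast path Peter describes; the
passthrough flag and both helpers below are hypothetical, not existing
kernel vhost code:)

/*
 * Hypothetical: if userspace has told vhost that the device address
 * space is passthrough, translate through the static memory table as
 * in the non-IOMMU case, skipping the device IOTLB and its miss
 * round-trips entirely.
 */
static int translate_addr(struct vhost_virtqueue *vq, u64 addr, u32 len,
                          struct iovec *iov, int iov_count, int access)
{
    if (vq->dev->passthrough) {          /* hypothetical flag */
        return translate_via_memtable(vq, addr, len,
                                      iov, iov_count, access);
    }
    return translate_via_iotlb(vq, addr, len, iov, iov_count, access);
}
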
Jason Wang June 5, 2017, 4:07 a.m. UTC | #4
On June 5, 2017 at 11:15, Peter Xu wrote:
> On Fri, Jun 02, 2017 at 06:45:05PM +0300, Michael S. Tsirkin wrote:
>> On Fri, Jun 02, 2017 at 07:50:54PM +0800, Peter Xu wrote:
>>> This patch pre-heats the vhost IOTLB cache when passthrough mode is enabled.
>>>
>>> Sometimes, even if the user specified iommu_platform for vhost devices,
>>> the IOMMU might still be disabled. One such case is passthrough mode in
>>> the VT-d implementation. We can detect this by observing iommu_list: if
>>> it is empty, IOMMU translation is disabled, and we can pre-heat the
>>> translation (it will be a static mapping then) by first invalidating
>>> all IOTLB entries, then caching the existing memory ranges into the
>>> vhost backend IOTLB using a 1:1 mapping.
>>>
>>> Reviewed-by: Jason Wang <jasowang@redhat.com>
>>> Signed-off-by: Peter Xu <peterx@redhat.com>
>> This is still a hack, I think. What if there's an invalidation?
>> I think the right thing is to send updates only when requested, but
>> to send the largest mapping that includes the IOVA, not just from
>> the IOVA to the end of the page. Thoughts?
> Indeed it's kind of a hack, but it does not hurt anything and will
> definitely boost performance in most cases...
>
> Yes, sending the largest mapping that includes the IOVA would be okay,
> but the first IO on each region would still be delayed, so IMHO that's
> not the best solution either. I think the best solution would be for
> vhost to know it's in passthrough (PT) mode, so it can skip the
> translation completely. I'm just not sure whether there's a
> simple/good way to do this.
>
> Thanks,

We can disable the device IOTLB completely in this case. But it looks
like there's a minor kernel bug preventing us from doing this. Let me
post a fix and let's see then.

Thanks
Michael S. Tsirkin June 5, 2017, 3:05 p.m. UTC | #5
On Mon, Jun 05, 2017 at 11:15:11AM +0800, Peter Xu wrote:
> On Fri, Jun 02, 2017 at 06:45:05PM +0300, Michael S. Tsirkin wrote:
> > On Fri, Jun 02, 2017 at 07:50:54PM +0800, Peter Xu wrote:
> > > This patch pre-heats the vhost IOTLB cache when passthrough mode is enabled.
> > > 
> > > Sometimes, even if the user specified iommu_platform for vhost devices,
> > > the IOMMU might still be disabled. One such case is passthrough mode in
> > > the VT-d implementation. We can detect this by observing iommu_list: if
> > > it is empty, IOMMU translation is disabled, and we can pre-heat the
> > > translation (it will be a static mapping then) by first invalidating
> > > all IOTLB entries, then caching the existing memory ranges into the
> > > vhost backend IOTLB using a 1:1 mapping.
> > > 
> > > Reviewed-by: Jason Wang <jasowang@redhat.com>
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > 
> > This is still a hack, I think. What if there's an invalidation?
> > I think the right thing is to send updates only when requested, but
> > to send the largest mapping that includes the IOVA, not just from
> > the IOVA to the end of the page. Thoughts?
> 
> Indeed it's kind of a hack, but it does not hurt anything and will
> definitely boost performance in most cases...
> 
> Yes, sending the largest mapping that includes the IOVA would be okay,
> but the first IO on each region would still be delayed, so IMHO that's
> not the best solution either. I think the best solution would be for
> vhost to know it's in passthrough (PT) mode, so it can skip the
> translation completely. I'm just not sure whether there's a
> simple/good way to do this.
> 
> Thanks,

If you send the whole 64-bit area, then the backend can detect this
easily.
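
(One possible reading, as a hypothetical backend-side sketch - an
update spanning the entire 64-bit IOVA space is an unmistakable signal;
all names below are made up:)

/*
 * Hypothetical: if userspace pushes one IOTLB update covering the
 * whole 64-bit space, no per-address translation can be in effect,
 * so the backend can flag the device as passthrough and stop doing
 * per-access device-IOTLB lookups.
 */
static void iotlb_update_sketch(struct vhost_dev *d,
                                struct vhost_iotlb_msg *msg)
{
    if (msg->iova == 0 && msg->size == U64_MAX) {
        d->passthrough = true;           /* hypothetical flag */
        return;
    }
    /* ...otherwise insert the entry into the IOTLB cache as usual... */
}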


> -- 
> Peter Xu

Patch

diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 1f7a7c1..54dcbb3 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -24,3 +24,7 @@  virtio_balloon_handle_output(const char *name, uint64_t gpa) "section name: %s g
 virtio_balloon_get_config(uint32_t num_pages, uint32_t actual) "num_pages: %d actual: %d"
 virtio_balloon_set_config(uint32_t actual, uint32_t oldactual) "actual: %d oldactual: %d"
 virtio_balloon_to_target(uint64_t target, uint32_t num_pages) "balloon target: %"PRIx64" num_pages: %d"
+
+# hw/virtio/vhost.c
+vhost_iommu_commit(void) ""
+vhost_iommu_static_preheat(void) ""
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 03a46a7..d03d720 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -27,6 +27,7 @@ 
 #include "hw/virtio/virtio-access.h"
 #include "migration/blocker.h"
 #include "sysemu/dma.h"
+#include "trace.h"
 
 /* enabled until disconnected backend stabilizes */
 #define _VHOST_DEBUG 1
@@ -730,6 +731,11 @@  static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     }
 }
 
+static bool vhost_iommu_mr_enabled(struct vhost_dev *dev)
+{
+    return !QLIST_EMPTY(&dev->iommu_list);
+}
+
 static void vhost_iommu_region_add(MemoryListener *listener,
                                    MemoryRegionSection *section)
 {
@@ -782,6 +788,65 @@  static void vhost_iommu_region_del(MemoryListener *listener,
     }
 }
 
+static void vhost_iommu_commit(MemoryListener *listener)
+{
+    struct vhost_dev *dev = container_of(listener, struct vhost_dev,
+                                         iommu_listener);
+    struct vhost_memory_region *r;
+    int i;
+
+    trace_vhost_iommu_commit();
+
+    if (!vhost_iommu_mr_enabled(dev)) {
+        /*
+         * This means iommu_platform is enabled, but the IOMMU memory
+         * region is disabled, e.g., when device passthrough is set up.
+         * Then, no translation is needed anymore.
+         *
+         * Let's first invalidate the whole IOTLB, then pre-heat the
+         * static mapping by looping over the vhost memory ranges.
+         */
+
+        if (dev->vhost_ops->vhost_invalidate_device_iotlb(dev, 0,
+                                                          UINT64_MAX)) {
+            error_report("%s: flush existing IOTLB failed", __func__);
+            return;
+        }
+
+        /*
+         * The current VHOST_IOTLB_INVALIDATE API has a small defect:
+         * an invalidation for (start=0, size=UINT64_MAX) cannot
+         * really invalidate a cached range of (start=UINT64_MAX-1,
+         * size=1). We send this second invalidation to work around
+         * that. Frankly speaking, QEMU does not have a problem with
+         * this, since we will never have a vhost cache entry with
+         * range (start=UINT64_MAX-1, size=1) - see
+         * address_space_get_iotlb_entry(): all IOTLBs are page
+         * aligned.
+         */
+        if (dev->vhost_ops->vhost_invalidate_device_iotlb(dev, UINT64_MAX,
+                                                          1)) {
+            error_report("%s: flush existing IOTLB failed", __func__);
+            return;
+        }
+
+        for (i = 0; i < dev->mem->nregions; i++) {
+            r = &dev->mem->regions[i];
+            /* Vhost regions are writable RAM, so IOMMU_RW suits. */
+            if (dev->vhost_ops->vhost_update_device_iotlb(dev,
+                                                          r->guest_phys_addr,
+                                                          r->userspace_addr,
+                                                          r->memory_size,
+                                                          IOMMU_RW)) {
+                error_report("%s: pre-heat static mapping failed", __func__);
+                return;
+            }
+        }
+
+        trace_vhost_iommu_static_preheat();
+    }
+}
+
 static void vhost_region_nop(MemoryListener *listener,
                              MemoryRegionSection *section)
 {
@@ -1298,6 +1363,7 @@  int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
     hdev->iommu_listener = (MemoryListener) {
         .region_add = vhost_iommu_region_add,
         .region_del = vhost_iommu_region_del,
+        .commit = vhost_iommu_commit,
     };
 
     if (hdev->migration_blocker == NULL) {