
[for-2.8,11/11] vhost_net: device IOTLB support

Message ID: 1472526419-5900-12-git-send-email-jasowang@redhat.com
State: New

Commit Message

Jason Wang Aug. 30, 2016, 3:06 a.m. UTC
This patch implements Device IOTLB support for vhost kernel. This is
done through:

1) switching to the dma helpers when mapping/unmapping vrings from vhost code
2) kernel support for the Device IOTLB API:

- allow vhost-net to query IOMMU IOTLB entries through an eventfd
- enable qemu to update a specified mapping of vhost through an ioctl
- enable invalidating a specified range of iova in the device IOTLB of
  vhost through an ioctl. In the x86/intel_iommu case this is triggered
  through the iommu memory region notifier from the device IOTLB
  invalidation descriptor processing routine.

With all the above, kernel vhost_net can cooperate with the IOMMU.

Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/virtio/vhost-backend.c         | 104 ++++++++++++++++++++++++++
 hw/virtio/vhost.c                 | 149 ++++++++++++++++++++++++++++++++------
 include/hw/virtio/vhost-backend.h |  14 ++++
 include/hw/virtio/vhost.h         |   4 +
 include/hw/virtio/virtio-access.h |  44 ++++++++++-
 net/tap.c                         |   1 +
 6 files changed, 291 insertions(+), 25 deletions(-)
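
For reference, the eventfd/ioctl exchanges described above are carried as
struct vhost_msg over the vhost fd. A rough sketch of the layout this
series relies on (field names taken from the code below; the authoritative
definitions live in the kernel's <linux/vhost.h> and may differ in detail):

struct vhost_iotlb_msg {
        __u64 iova;     /* guest iova to translate or invalidate */
        __u64 size;     /* length of the mapping */
        __u64 uaddr;    /* qemu userspace address backing the iova */
        __u8  perm;     /* VHOST_ACCESS_RO / _WO / _RW */
        __u8  type;     /* VHOST_IOTLB_MISS / _UPDATE / _INVALIDATE /
                           _ACCESS_FAIL */
};

struct vhost_msg {
        int type;       /* VHOST_IOTLB_MSG */
        union {
                struct vhost_iotlb_msg iotlb;
                __u8 padding[64];
        };
};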

Comments

Peter Xu Sept. 1, 2016, 3:34 a.m. UTC | #1
On Tue, Aug 30, 2016 at 11:06:59AM +0800, Jason Wang wrote:
> This patch implements Device IOTLB support for vhost kernel. This is
> done through:
> 
> 1) switching to the dma helpers when mapping/unmapping vrings from vhost code
> 2) kernel support for the Device IOTLB API:
> 
> - allow vhost-net to query IOMMU IOTLB entries through an eventfd
> - enable qemu to update a specified mapping of vhost through an ioctl
> - enable invalidating a specified range of iova in the device IOTLB of
>   vhost through an ioctl. In the x86/intel_iommu case this is triggered
>   through the iommu memory region notifier from the device IOTLB
>   invalidation descriptor processing routine.
> 
> With all the above, kernel vhost_net can cooperate with the IOMMU.
> 
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  hw/virtio/vhost-backend.c         | 104 ++++++++++++++++++++++++++
>  hw/virtio/vhost.c                 | 149 ++++++++++++++++++++++++++++++++------
>  include/hw/virtio/vhost-backend.h |  14 ++++
>  include/hw/virtio/vhost.h         |   4 +
>  include/hw/virtio/virtio-access.h |  44 ++++++++++-
>  net/tap.c                         |   1 +
>  6 files changed, 291 insertions(+), 25 deletions(-)
> 
> diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
> index 7681f15..a5754f3 100644
> --- a/hw/virtio/vhost-backend.c
> +++ b/hw/virtio/vhost-backend.c
> @@ -172,6 +172,107 @@ static int vhost_kernel_get_vq_index(struct vhost_dev *dev, int idx)
>      return idx - dev->vq_index;
>  }
>  
> +static void vhost_kernel_iotlb_read(void *opaque)
> +{
> +    struct vhost_dev *dev = opaque;
> +    struct vhost_msg msg;
> +    ssize_t len;
> +
> +    while((len = read((uintptr_t)dev->opaque, &msg, sizeof msg)) > 0) {
> +        struct vhost_iotlb_msg *imsg = &msg.iotlb;
> +        if (len < sizeof msg) {
> +            error_report("Wrong vhost message len: %d", (int)len);
> +            break;
> +        }
> +        if (msg.type != VHOST_IOTLB_MSG) {
> +            error_report("Unknown vhost iotlb message type");
> +            break;
> +        }
> +        switch (imsg->type) {
> +        case VHOST_IOTLB_MISS:
> +            vhost_device_iotlb_miss(dev, imsg->iova,
> +                                    imsg->perm != VHOST_ACCESS_RO);
> +            break;
> +        case VHOST_IOTLB_UPDATE:
> +        case VHOST_IOTLB_INVALIDATE:
> +            error_report("Unexpected IOTLB message type");
> +            break;
> +        case VHOST_IOTLB_ACCESS_FAIL:
> +            /* FIXME: report device iotlb error */
> +            break;
> +        default:
> +            break;
> +        }
> +    }
> +}
> +
> +static int vhost_kernel_update_device_iotlb(struct vhost_dev *dev,
> +                                            uint64_t iova, uint64_t uaddr,
> +                                            uint64_t len,
> +                                            IOMMUAccessFlags perm)
> +{
> +    struct vhost_msg msg = {
> +        .type = VHOST_IOTLB_MSG,
> +        .iotlb = {
> +            .iova = iova,
> +            .uaddr = uaddr,
> +            .size = len,
> +            .type = VHOST_IOTLB_UPDATE,
> +        }
> +    };
> +
> +    switch (perm) {
> +    case IOMMU_RO:
> +        msg.iotlb.perm = VHOST_ACCESS_RO;
> +        break;
> +    case IOMMU_WO:
> +        msg.iotlb.perm = VHOST_ACCESS_WO;
> +        break;
> +    case IOMMU_RW:
> +        msg.iotlb.perm = VHOST_ACCESS_RW;
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +
> +    if (write((uintptr_t)dev->opaque, &msg, sizeof msg) != sizeof msg) {
> +        error_report("Fail to update device iotlb");
> +        return -EFAULT;
> +    }
> +
> +    return 0;
> +}
> +
> +static int vhost_kernel_invalidate_device_iotlb(struct vhost_dev *dev,
> +                                                uint64_t iova, uint64_t len)
> +{
> +    struct vhost_msg msg = {
> +        .type = VHOST_IOTLB_MSG,
> +        .iotlb = {
> +            .iova = iova,
> +            .size = len,
> +            .type = VHOST_IOTLB_INVALIDATE,
> +        }
> +    };
> +
> +    if (write((uintptr_t)dev->opaque, &msg, sizeof msg) != sizeof msg) {
> +        error_report("Fail to invalidate device iotlb");
> +        return -EFAULT;
> +    }
> +
> +    return 0;
> +}
> +
> +static void vhost_kernel_set_iotlb_callback(struct vhost_dev *dev,
> +                                           int enabled)
> +{
> +    if (enabled)
> +        qemu_set_fd_handler((uintptr_t)dev->opaque,
> +                            vhost_kernel_iotlb_read, NULL, dev);
> +    else
> +        qemu_set_fd_handler((uintptr_t)dev->opaque, NULL, NULL, NULL);
> +}
> +
>  static const VhostOps kernel_ops = {
>          .backend_type = VHOST_BACKEND_TYPE_KERNEL,
>          .vhost_backend_init = vhost_kernel_init,
> @@ -197,6 +298,9 @@ static const VhostOps kernel_ops = {
>          .vhost_set_owner = vhost_kernel_set_owner,
>          .vhost_reset_device = vhost_kernel_reset_device,
>          .vhost_get_vq_index = vhost_kernel_get_vq_index,
> +        .vhost_set_iotlb_callback = vhost_kernel_set_iotlb_callback,
> +        .vhost_update_device_iotlb = vhost_kernel_update_device_iotlb,
> +        .vhost_invalidate_device_iotlb = vhost_kernel_invalidate_device_iotlb,
>  };
>  
>  int vhost_set_backend_type(struct vhost_dev *dev, VhostBackendType backend_type)
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 3d0c807..94e577b 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -437,7 +437,7 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,
>              continue;
>          }
>          l = vq->ring_size;
> -        p = cpu_physical_memory_map(vq->ring_phys, &l, 1);
> +        p = virtio_memory_map(dev->vdev, vq->ring_phys, &l, 1);
>          if (!p || l != vq->ring_size) {
>              error_report("Unable to map ring buffer for ring %d", i);
>              r = -ENOMEM;
> @@ -446,7 +446,7 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,
>              error_report("Ring buffer relocated for ring %d", i);
>              r = -EBUSY;
>          }
> -        cpu_physical_memory_unmap(p, l, 0, 0);
> +        virtio_memory_unmap(dev->vdev, p, l, 0, 0);
>      }
>      return r;
>  }
> @@ -674,13 +674,18 @@ static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
>      return 0;
>  }
>  
> -static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
> +static int vhost_dev_set_features(struct vhost_dev *dev,
> +                                  bool enable_log)
>  {
>      uint64_t features = dev->acked_features;
> +    bool has_iommu = mr_has_iommu_ops(virtio_get_dma_as(dev->vdev)->root);
>      int r;
>      if (enable_log) {
>          features |= 0x1ULL << VHOST_F_LOG_ALL;
>      }
> +    if (has_iommu) {
> +        features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
> +    }
>      r = dev->vhost_ops->vhost_set_features(dev, features);
>      if (r < 0) {
>          VHOST_OPS_DEBUG("vhost_set_features failed");
> @@ -817,6 +822,56 @@ static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
>      return -errno;
>  }
>  
> +static int vhost_memory_region_lookup(struct vhost_dev *hdev,
> +                                      uint64_t gpa, uint64_t *uaddr,
> +                                      uint64_t *len)
> +{
> +    int i;
> +
> +    for (i = 0; i < hdev->mem->nregions; i++) {
> +        struct vhost_memory_region *reg = hdev->mem->regions + i;
> +
> +        if (gpa >= reg->guest_phys_addr &&
> +            reg->guest_phys_addr + reg->memory_size > gpa) {
> +            *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
> +            *len = reg->guest_phys_addr + reg->memory_size - gpa;
> +            return 0;
> +        }
> +    }
> +
> +    return -EFAULT;
> +}
> +
> +void vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
> +{
> +    IOMMUTLBEntry iotlb;
> +    uint64_t uaddr, len;
> +
> +    rcu_read_lock();
> +
> +    iotlb = address_space_get_iotlb_entry(virtio_get_dma_as(dev->vdev),
> +                                          iova, write);
> +    if (iotlb.target_as != NULL) {
> +        if (vhost_memory_region_lookup(dev, iotlb.translated_addr,
> +                                       &uaddr, &len)) {
> +            error_report("Fail to lookup the translated address "
> +                         "%"PRIx64, iotlb.translated_addr);
> +            goto out;
> +        }
> +
> +        len = MIN(iotlb.addr_mask + 1, len);
> +        iova = iova & ~iotlb.addr_mask;
> +
> +        if (dev->vhost_ops->vhost_update_device_iotlb(dev, iova, uaddr,
> +                                                      len, iotlb.perm)) {
> +            error_report("Fail to update device iotlb");
> +            goto out;
> +        }
> +    }

Question: when will target_as == NULL? Do we need an assertion here if
it should never happen?

> +out:
> +    rcu_read_unlock();
> +}
> +
>  static int vhost_virtqueue_start(struct vhost_dev *dev,
>                                  struct VirtIODevice *vdev,
>                                  struct vhost_virtqueue *vq,
> @@ -859,21 +914,21 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
>  
>      s = l = virtio_queue_get_desc_size(vdev, idx);
>      a = virtio_queue_get_desc_addr(vdev, idx);
> -    vq->desc = cpu_physical_memory_map(a, &l, 0);
> +    vq->desc = virtio_memory_map(vdev, a, &l, 0);
>      if (!vq->desc || l != s) {
>          r = -ENOMEM;
>          goto fail_alloc_desc;
>      }
>      s = l = virtio_queue_get_avail_size(vdev, idx);
>      a = virtio_queue_get_avail_addr(vdev, idx);
> -    vq->avail = cpu_physical_memory_map(a, &l, 0);
> +    vq->avail = virtio_memory_map(vdev, a, &l, 0);
>      if (!vq->avail || l != s) {
>          r = -ENOMEM;
>          goto fail_alloc_avail;
>      }
>      vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
>      vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
> -    vq->used = cpu_physical_memory_map(a, &l, 1);
> +    vq->used = virtio_memory_map(vdev, a, &l, 1);
>      if (!vq->used || l != s) {
>          r = -ENOMEM;
>          goto fail_alloc_used;
> @@ -881,7 +936,7 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
>  
>      vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx);
>      vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx);
> -    vq->ring = cpu_physical_memory_map(a, &l, 1);
> +    vq->ring = virtio_memory_map(vdev, a, &l, 1);
>      if (!vq->ring || l != s) {
>          r = -ENOMEM;
>          goto fail_alloc_ring;
> @@ -913,20 +968,19 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
>      }
>  
>      return 0;
> -
>  fail_kick:
>  fail_alloc:
> -    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
> -                              0, 0);
> +    virtio_memory_unmap(vdev, vq->ring, virtio_queue_get_ring_size(vdev, idx),
> +                        0, 0);
>  fail_alloc_ring:
> -    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
> -                              0, 0);
> +    virtio_memory_unmap(vdev, vq->used, virtio_queue_get_used_size(vdev, idx),
> +                        0, 0);
>  fail_alloc_used:
> -    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
> -                              0, 0);
> +    virtio_memory_unmap(vdev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
> +                        0, 0);
>  fail_alloc_avail:
> -    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
> -                              0, 0);
> +    virtio_memory_unmap(vdev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
> +                        0, 0);
>  fail_alloc_desc:
>      return r;
>  }
> @@ -959,14 +1013,14 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev,
>                                                  vhost_vq_index);
>      }
>  
> -    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
> -                              0, virtio_queue_get_ring_size(vdev, idx));
> -    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
> -                              1, virtio_queue_get_used_size(vdev, idx));
> -    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
> -                              0, virtio_queue_get_avail_size(vdev, idx));
> -    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
> -                              0, virtio_queue_get_desc_size(vdev, idx));
> +    virtio_memory_unmap(vdev, vq->ring, virtio_queue_get_ring_size(vdev, idx),
> +                        0, virtio_queue_get_ring_size(vdev, idx));
> +    virtio_memory_unmap(vdev, vq->used, virtio_queue_get_used_size(vdev, idx),
> +                        1, virtio_queue_get_used_size(vdev, idx));
> +    virtio_memory_unmap(vdev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
> +                        0, virtio_queue_get_avail_size(vdev, idx));
> +    virtio_memory_unmap(vdev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
> +                         0, virtio_queue_get_desc_size(vdev, idx));
>  }
>  
>  static void vhost_eventfd_add(MemoryListener *listener,
> @@ -1023,6 +1077,9 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
>          r = -errno;
>          goto fail_call;
>      }
> +
> +    vq->dev = dev;
> +
>      return 0;
>  fail_call:
>      event_notifier_cleanup(&vq->masked_notifier);
> @@ -1034,12 +1091,25 @@ static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
>      event_notifier_cleanup(&vq->masked_notifier);
>  }
>  
> +static void vhost_iommu_unmap_notify(Notifier *n, void *data)
> +{
> +    struct vhost_dev *hdev = container_of(n, struct vhost_dev, n);
> +    IOMMUTLBEntry *iotlb = data;
> +
> +    if (hdev->vhost_ops->vhost_invalidate_device_iotlb(hdev,
> +                                                       iotlb->iova,
> +                                                       iotlb->addr_mask +1)) {
> +        error_report("Fail to invalidate device iotlb");
> +    }
> +}
> +
>  int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
>                     VhostBackendType backend_type, uint32_t busyloop_timeout)
>  {
>      uint64_t features;
>      int i, r, n_initialized_vqs = 0;
>  
> +    hdev->vdev = NULL;
>      hdev->migration_blocker = NULL;
>  
>      r = vhost_set_backend_type(hdev, backend_type);
> @@ -1104,6 +1174,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
>          .priority = 10
>      };
>  
> +    hdev->n.notify = vhost_iommu_unmap_notify;
> +
>      if (hdev->migration_blocker == NULL) {
>          if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
>              error_setg(&hdev->migration_blocker,
> @@ -1296,11 +1368,18 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
>      assert(hdev->vhost_ops);
>  
>      hdev->started = true;
> +    hdev->vdev = vdev;
>  
>      r = vhost_dev_set_features(hdev, hdev->log_enabled);
>      if (r < 0) {
>          goto fail_features;
>      }
> +
> +    if (mr_has_iommu_ops(virtio_get_dma_as(vdev)->root)) {
> +        memory_region_register_iommu_notifier(virtio_get_dma_as(vdev)->root,
> +                                              &hdev->n);
> +    }
> +
>      r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
>      if (r < 0) {
>          VHOST_OPS_DEBUG("vhost_set_mem_table failed");
> @@ -1334,7 +1413,22 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
>          }
>      }
>  
> +    hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
> +
> +    if (mr_has_iommu_ops(virtio_get_dma_as(vdev)->root)) {
> +        /* Update used ring information for IOTLB to work correctly */
> +        for (i = 0; i < hdev->nvqs; ++i) {
> +            struct vhost_virtqueue *vq = hdev->vqs + i;
> +            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
> +        }
> +    }
>      return 0;
> +#if 0
> +fail_iotlb:
> +    if (hdev->vhost_ops->vhost_set_vring_enable) {
> +        hdev->vhost_ops->vhost_set_vring_enable(hdev, 0);
> +    }
> +#endif

Maybe we can remove these lines if they are not going to be used.

>  fail_log:
>      vhost_log_put(hdev, false);
>  fail_vq:
> @@ -1345,6 +1439,7 @@ fail_vq:
>                               hdev->vq_index + i);
>      }
>      i = hdev->nvqs;
> +

Nit: A newline without context change.

>  fail_mem:
>  fail_features:
>  
> @@ -1359,6 +1454,7 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
>  
>      /* should only be called after backend is connected */
>      assert(hdev->vhost_ops);
> +    hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
>  
>      for (i = 0; i < hdev->nvqs; ++i) {
>          vhost_virtqueue_stop(hdev,
> @@ -1367,8 +1463,13 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
>                               hdev->vq_index + i);
>      }
>  
> +    if (mr_has_iommu_ops(virtio_get_dma_as(vdev)->root)) {
> +        memory_region_unregister_iommu_notifier(virtio_get_dma_as(vdev)->root,
> +                                                &hdev->n);
> +    }
>      vhost_log_put(hdev, true);
>      hdev->started = false;
> +    hdev->vdev = NULL;
>  }
>  
>  int vhost_net_set_backend(struct vhost_dev *hdev,
> diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
> index cf7f0b5..5cf8c70 100644
> --- a/include/hw/virtio/vhost-backend.h
> +++ b/include/hw/virtio/vhost-backend.h
> @@ -8,9 +8,12 @@
>   *
>   */
>  
> +

Another nit for newline.

>  #ifndef VHOST_BACKEND_H
>  #define VHOST_BACKEND_H
>  
> +#include "exec/memory.h"
> +
>  typedef enum VhostBackendType {
>      VHOST_BACKEND_TYPE_NONE = 0,
>      VHOST_BACKEND_TYPE_KERNEL = 1,
> @@ -73,6 +76,14 @@ typedef int (*vhost_migration_done_op)(struct vhost_dev *dev,
>  typedef bool (*vhost_backend_can_merge_op)(struct vhost_dev *dev,
>                                             uint64_t start1, uint64_t size1,
>                                             uint64_t start2, uint64_t size2);
> +typedef void (*vhost_set_iotlb_callback_op)(struct vhost_dev *dev,
> +                                           int enabled);
> +typedef int (*vhost_update_device_iotlb_op)(struct vhost_dev *dev,
> +                                            uint64_t iova, uint64_t uaddr,
> +                                            uint64_t len,
> +                                            IOMMUAccessFlags perm);
> +typedef int (*vhost_invalidate_device_iotlb_op)(struct vhost_dev *dev,
> +                                                uint64_t iova, uint64_t len);
>  
>  typedef struct VhostOps {
>      VhostBackendType backend_type;
> @@ -102,6 +113,9 @@ typedef struct VhostOps {
>      vhost_requires_shm_log_op vhost_requires_shm_log;
>      vhost_migration_done_op vhost_migration_done;
>      vhost_backend_can_merge_op vhost_backend_can_merge;
> +    vhost_set_iotlb_callback_op vhost_set_iotlb_callback;
> +    vhost_update_device_iotlb_op vhost_update_device_iotlb;
> +    vhost_invalidate_device_iotlb_op vhost_invalidate_device_iotlb;
>  } VhostOps;
>  
>  extern const VhostOps user_ops;
> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
> index e433089..b971fa1 100644
> --- a/include/hw/virtio/vhost.h
> +++ b/include/hw/virtio/vhost.h
> @@ -20,6 +20,7 @@ struct vhost_virtqueue {
>      unsigned long long ring_phys;
>      unsigned ring_size;
>      EventNotifier masked_notifier;
> +    struct vhost_dev *dev;
>  };
>  
>  typedef unsigned long vhost_log_chunk_t;
> @@ -37,6 +38,7 @@ struct vhost_log {
>  
>  struct vhost_memory;
>  struct vhost_dev {
> +    VirtIODevice *vdev;
>      MemoryListener memory_listener;
>      struct vhost_memory *mem;
>      int n_mem_sections;
> @@ -61,6 +63,7 @@ struct vhost_dev {
>      void *opaque;
>      struct vhost_log *log;
>      QLIST_ENTRY(vhost_dev) entry;
> +    Notifier n;
>  };
>  
>  int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
> @@ -90,4 +93,5 @@ bool vhost_has_free_slot(void);
>  int vhost_net_set_backend(struct vhost_dev *hdev,
>                            struct vhost_vring_file *file);
>  
> +void vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write);
>  #endif
> diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
> index 4071dad..3560bba 100644
> --- a/include/hw/virtio/virtio-access.h
> +++ b/include/hw/virtio/virtio-access.h
> @@ -18,6 +18,7 @@
>  
>  #include "hw/virtio/virtio.h"
>  #include "hw/virtio/virtio-bus.h"
> +#include "sysemu/dma.h"
>  #include "exec/address-spaces.h"
>  
>  #if defined(TARGET_PPC64) || defined(TARGET_ARM)
> @@ -200,4 +201,45 @@ static inline void virtio_tswap64s(VirtIODevice *vdev, uint64_t *s)
>  {
>      *s = virtio_tswap64(vdev, *s);
>  }
> -#endif /* QEMU_VIRTIO_ACCESS_H */
> +
> +static inline bool mr_has_iommu_ops(MemoryRegion *mr)
> +{
> +    if (mr->alias) {
> +        return mr_has_iommu_ops(mr->alias);
> +    }
> +
> +    if (mr->iommu_ops)
> +        return true;
> +    else
> +        return false;
> +}

Shall we just enhance memory_region_is_iommu() with alias handling,
then call memory_region_is_iommu() directly?

> +
> +static inline void *virtio_memory_map(VirtIODevice *vdev, hwaddr addr,
> +                                      hwaddr *plen, int is_write)
> +{
> +    AddressSpace *dma_as = virtio_get_dma_as(vdev);
> +
> +    if (!mr_has_iommu_ops(dma_as->root)) {
> +      return dma_memory_map(dma_as, addr, plen, is_write ?
         ^^ indents :)

> +                            DMA_DIRECTION_FROM_DEVICE :
> +                            DMA_DIRECTION_TO_DEVICE);
> +    } else {
> +      return (void *)addr;
         ^^ and here

> +    }
> +}
> +
> +
> +static inline void virtio_memory_unmap(VirtIODevice *vdev, void *buffer,
> +                                       hwaddr len, int is_write,
> +                                       hwaddr access_len)
> +{
> +    AddressSpace *dma_as = virtio_get_dma_as(vdev);
> +
> +    if (!mr_has_iommu_ops(dma_as->root)) {
> +      dma_memory_unmap(dma_as, buffer, len, is_write ?
         ^^ and here

And... one general question on the vhost fd used to communicate
between QEMU and vhost: I see that data is coming from both directions
on this fd; would this be a problem?

For example, invalidations are triggered by the guest kernel writing to
the IOMMU IQ registers (in a vcpu thread), which may trigger a write()
to the vhost fd while the QEMU event loop is reading it. How are we
handling possible concurrent operations on this same fd? Please just
point out if I missed anything. :)

Thanks,

-- peterx
Jason Wang Sept. 1, 2016, 7:36 a.m. UTC | #2
On 09/01/2016 11:34 AM, Peter Xu wrote:
> On Tue, Aug 30, 2016 at 11:06:59AM +0800, Jason Wang wrote:
>> This patch implements Device IOTLB support for vhost kernel. This is
>> done through:
>>
>> 1) switching to the dma helpers when mapping/unmapping vrings from vhost code
>> 2) kernel support for the Device IOTLB API:
>>
>> - allow vhost-net to query IOMMU IOTLB entries through an eventfd
>> - enable qemu to update a specified mapping of vhost through an ioctl
>> - enable invalidating a specified range of iova in the device IOTLB of
>>    vhost through an ioctl. In the x86/intel_iommu case this is triggered
>>    through the iommu memory region notifier from the device IOTLB
>>    invalidation descriptor processing routine.
>>
>> With all the above, kernel vhost_net can cooperate with the IOMMU.
>>
>> Cc: Michael S. Tsirkin <mst@redhat.com>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>>   hw/virtio/vhost-backend.c         | 104 ++++++++++++++++++++++++++
>>   hw/virtio/vhost.c                 | 149 ++++++++++++++++++++++++++++++++------
>>   include/hw/virtio/vhost-backend.h |  14 ++++
>>   include/hw/virtio/vhost.h         |   4 +
>>   include/hw/virtio/virtio-access.h |  44 ++++++++++-
>>   net/tap.c                         |   1 +
>>   6 files changed, 291 insertions(+), 25 deletions(-)

[...]

>> +
>> +void vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
>> +{
>> +    IOMMUTLBEntry iotlb;
>> +    uint64_t uaddr, len;
>> +
>> +    rcu_read_lock();
>> +
>> +    iotlb = address_space_get_iotlb_entry(virtio_get_dma_as(dev->vdev),
>> +                                          iova, write);
>> +    if (iotlb.target_as != NULL) {
>> +        if (vhost_memory_region_lookup(dev, iotlb.translated_addr,
>> +                                       &uaddr, &len)) {
>> +            error_report("Fail to lookup the translated address "
>> +                         "%"PRIx64, iotlb.translated_addr);
>> +            goto out;
>> +        }
>> +
>> +        len = MIN(iotlb.addr_mask + 1, len);
>> +        iova = iova & ~iotlb.addr_mask;
>> +
>> +        if (dev->vhost_ops->vhost_update_device_iotlb(dev, iova, uaddr,
>> +                                                      len, iotlb.perm)) {
>> +            error_report("Fail to update device iotlb");
>> +            goto out;
>> +        }
>> +    }
> Question: when will target_as == NULL? Do we need an assertion here if
> it should never happen?

Good catch, looks like we need to check perm against IOMMU_NONE here for
an invalid translation instead of checking target_as.
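
Something along these lines, perhaps (a sketch only, against the hunk
quoted above):

-    if (iotlb.target_as != NULL) {
+    if (iotlb.perm != IOMMU_NONE) {
         if (vhost_memory_region_lookup(dev, iotlb.translated_addr,
                                        &uaddr, &len)) {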

>> +out:
>> +    rcu_read_unlock();
>> +}
>> +


>> +    hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
>> +
>> +    if (mr_has_iommu_ops(virtio_get_dma_as(vdev)->root)) {
>> +        /* Update used ring information for IOTLB to work correctly */
>> +        for (i = 0; i < hdev->nvqs; ++i) {
>> +            struct vhost_virtqueue *vq = hdev->vqs + i;
>> +            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
>> +        }
>> +    }
>>       return 0;
>> +#if 0
>> +fail_iotlb:
>> +    if (hdev->vhost_ops->vhost_set_vring_enable) {
>> +        hdev->vhost_ops->vhost_set_vring_enable(hdev, 0);
>> +    }
>> +#endif
> Maybe we can remove these lines if they are not going to be used.

Yes.

>
>>   fail_log:
>>       vhost_log_put(hdev, false);
>>   fail_vq:
>> @@ -1345,6 +1439,7 @@ fail_vq:
>>                                hdev->vq_index + i);
>>       }
>>       i = hdev->nvqs;
>> +
> Nit: A newline without context change.

Will remove this in next version.

>
>>   fail_mem:
>>   fail_features:
>>   
>> @@ -1359,6 +1454,7 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
>>   
>>       /* should only be called after backend is connected */
>>       assert(hdev->vhost_ops);
>> +    hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
>>   
>>       for (i = 0; i < hdev->nvqs; ++i) {
>>           vhost_virtqueue_stop(hdev,
>> @@ -1367,8 +1463,13 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
>>                                hdev->vq_index + i);
>>       }
>>   
>> +    if (mr_has_iommu_ops(virtio_get_dma_as(vdev)->root)) {
>> +        memory_region_unregister_iommu_notifier(virtio_get_dma_as(vdev)->root,
>> +                                                &hdev->n);
>> +    }
>>       vhost_log_put(hdev, true);
>>       hdev->started = false;
>> +    hdev->vdev = NULL;
>>   }
>>   
>>   int vhost_net_set_backend(struct vhost_dev *hdev,
>> diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
>> index cf7f0b5..5cf8c70 100644
>> --- a/include/hw/virtio/vhost-backend.h
>> +++ b/include/hw/virtio/vhost-backend.h
>> @@ -8,9 +8,12 @@
>>    *
>>    */
>>   
>> +
> Another nit for newline.

Will remove this in next version.

>
>>   #ifndef VHOST_BACKEND_H
>>   #define VHOST_BACKEND_H
>>   
>> +#include "exec/memory.h"
>> +
>>   typedef enum VhostBackendType {
>>       VHOST_BACKEND_TYPE_NONE = 0,
>>       VHOST_BACKEND_TYPE_KERNEL = 1,
>> @@ -73,6 +76,14 @@ typedef int (*vhost_migration_done_op)(struct vhost_dev *dev,
>>   typedef bool (*vhost_backend_can_merge_op)(struct vhost_dev *dev,
>>                                              uint64_t start1, uint64_t size1,
>>                                              uint64_t start2, uint64_t size2);
>> +typedef void (*vhost_set_iotlb_callback_op)(struct vhost_dev *dev,
>> +                                           int enabled);
>> +typedef int (*vhost_update_device_iotlb_op)(struct vhost_dev *dev,
>> +                                            uint64_t iova, uint64_t uaddr,
>> +                                            uint64_t len,
>> +                                            IOMMUAccessFlags perm);
>> +typedef int (*vhost_invalidate_device_iotlb_op)(struct vhost_dev *dev,
>> +                                                uint64_t iova, uint64_t len);
>>   
>>   typedef struct VhostOps {
>>       VhostBackendType backend_type;
>> @@ -102,6 +113,9 @@ typedef struct VhostOps {
>>       vhost_requires_shm_log_op vhost_requires_shm_log;
>>       vhost_migration_done_op vhost_migration_done;
>>       vhost_backend_can_merge_op vhost_backend_can_merge;
>> +    vhost_set_iotlb_callback_op vhost_set_iotlb_callback;
>> +    vhost_update_device_iotlb_op vhost_update_device_iotlb;
>> +    vhost_invalidate_device_iotlb_op vhost_invalidate_device_iotlb;
>>   } VhostOps;
>>   
>>   extern const VhostOps user_ops;
>> diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
>> index e433089..b971fa1 100644
>> --- a/include/hw/virtio/vhost.h
>> +++ b/include/hw/virtio/vhost.h
>> @@ -20,6 +20,7 @@ struct vhost_virtqueue {
>>       unsigned long long ring_phys;
>>       unsigned ring_size;
>>       EventNotifier masked_notifier;
>> +    struct vhost_dev *dev;
>>   };
>>   
>>   typedef unsigned long vhost_log_chunk_t;
>> @@ -37,6 +38,7 @@ struct vhost_log {
>>   
>>   struct vhost_memory;
>>   struct vhost_dev {
>> +    VirtIODevice *vdev;
>>       MemoryListener memory_listener;
>>       struct vhost_memory *mem;
>>       int n_mem_sections;
>> @@ -61,6 +63,7 @@ struct vhost_dev {
>>       void *opaque;
>>       struct vhost_log *log;
>>       QLIST_ENTRY(vhost_dev) entry;
>> +    Notifier n;
>>   };
>>   
>>   int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
>> @@ -90,4 +93,5 @@ bool vhost_has_free_slot(void);
>>   int vhost_net_set_backend(struct vhost_dev *hdev,
>>                             struct vhost_vring_file *file);
>>   
>> +void vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write);
>>   #endif
>> diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
>> index 4071dad..3560bba 100644
>> --- a/include/hw/virtio/virtio-access.h
>> +++ b/include/hw/virtio/virtio-access.h
>> @@ -18,6 +18,7 @@
>>   
>>   #include "hw/virtio/virtio.h"
>>   #include "hw/virtio/virtio-bus.h"
>> +#include "sysemu/dma.h"
>>   #include "exec/address-spaces.h"
>>   
>>   #if defined(TARGET_PPC64) || defined(TARGET_ARM)
>> @@ -200,4 +201,45 @@ static inline void virtio_tswap64s(VirtIODevice *vdev, uint64_t *s)
>>   {
>>       *s = virtio_tswap64(vdev, *s);
>>   }
>> -#endif /* QEMU_VIRTIO_ACCESS_H */
>> +
>> +static inline bool mr_has_iommu_ops(MemoryRegion *mr)
>> +{
>> +    if (mr->alias) {
>> +        return mr_has_iommu_ops(mr->alias);
>> +    }
>> +
>> +    if (mr->iommu_ops)
>> +        return true;
>> +    else
>> +        return false;
>> +}
> Shall we just enhance memory_region_is_iommu() with alias handling,
> then call memory_region_is_iommu() directly?

Good idea, will do this in next version.
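
For example (a sketch only, mirroring the alias walk that
mr_has_iommu_ops() does above, applied to the existing helper in
include/exec/memory.h):

static inline bool memory_region_is_iommu(MemoryRegion *mr)
{
    if (mr->alias) {
        /* follow aliases so callers can pass an address space root */
        return memory_region_is_iommu(mr->alias);
    }
    return mr->iommu_ops != NULL;
}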

>
>> +
>> +static inline void *virtio_memory_map(VirtIODevice *vdev, hwaddr addr,
>> +                                      hwaddr *plen, int is_write)
>> +{
>> +    AddressSpace *dma_as = virtio_get_dma_as(vdev);
>> +
>> +    if (!mr_has_iommu_ops(dma_as->root)) {
>> +      return dma_memory_map(dma_as, addr, plen, is_write ?
>           ^^ indents :)
>

Will fix this :)

>> +                            DMA_DIRECTION_FROM_DEVICE :
>> +                            DMA_DIRECTION_TO_DEVICE);
>> +    } else {
>> +      return (void *)addr;
>           ^^ and here

And this.

>> +    }
>> +}
>> +
>> +
>> +static inline void virtio_memory_unmap(VirtIODevice *vdev, void *buffer,
>> +                                       hwaddr len, int is_write,
>> +                                       hwaddr access_len)
>> +{
>> +    AddressSpace *dma_as = virtio_get_dma_as(vdev);
>> +
>> +    if (!mr_has_iommu_ops(dma_as->root)) {
>> +      dma_memory_unmap(dma_as, buffer, len, is_write ?
>           ^^ and here
>
> And... one general question on the vhost fd used to communicate
> between QEMU and vhost: I see that data is coming from both directions
> on this fd; would this be a problem?

Looks like it's not :)

>
> For example, invalidations are triggered by the guest kernel writing to
> the IOMMU IQ registers (in a vcpu thread), which may trigger a write()
> to the vhost fd while the QEMU event loop is reading it. How are we
> handling possible concurrent operations on this same fd? Please just
> point out if I missed anything. :)

The synchronization is done in the kernel:

- message dequeue and enqueue are protected by a spinlock
- the data path and control path (e.g. IOTLB updating) are synchronized
through a mutex
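
In rough kernel-style C, the pattern is something like this (an
illustrative sketch; the names are hypothetical, not the actual vhost
implementation):

/* Both directions funnel through one message list: a vcpu thread doing
 * write() on the vhost fd and the device enqueue under the same
 * spinlock that the read() side takes to dequeue, so concurrent
 * operations on the fd stay ordered. */
static void vhost_enqueue_msg(struct vhost_dev *dev,
                              struct vhost_msg_node *node)
{
        spin_lock(&dev->iotlb_lock);
        list_add_tail(&node->node, &dev->read_list);
        spin_unlock(&dev->iotlb_lock);
        wake_up_interruptible(&dev->wait);      /* kick pending readers */
}

static struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev)
{
        struct vhost_msg_node *node = NULL;

        spin_lock(&dev->iotlb_lock);
        if (!list_empty(&dev->read_list)) {
                node = list_first_entry(&dev->read_list,
                                        struct vhost_msg_node, node);
                list_del(&node->node);
        }
        spin_unlock(&dev->iotlb_lock);
        return node;
}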

Thanks

>
> Thanks,
>
> -- peterx
Peter Xu Sept. 2, 2016, 5:47 a.m. UTC | #3
On Thu, Sep 01, 2016 at 03:36:40PM +0800, Jason Wang wrote:
> The synchronization is done in the kernel:
> 
> - message dequeue and enqueue are protected by a spinlock
> - the data path and control path (e.g. IOTLB updating) are synchronized
> through a mutex

Yes, I didn't notice it's the vhost chr device... Thanks!

-- peterx

Patch

diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 7681f15..a5754f3 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -172,6 +172,107 @@  static int vhost_kernel_get_vq_index(struct vhost_dev *dev, int idx)
     return idx - dev->vq_index;
 }
 
+static void vhost_kernel_iotlb_read(void *opaque)
+{
+    struct vhost_dev *dev = opaque;
+    struct vhost_msg msg;
+    ssize_t len;
+
+    while((len = read((uintptr_t)dev->opaque, &msg, sizeof msg)) > 0) {
+        struct vhost_iotlb_msg *imsg = &msg.iotlb;
+        if (len < sizeof msg) {
+            error_report("Wrong vhost message len: %d", (int)len);
+            break;
+        }
+        if (msg.type != VHOST_IOTLB_MSG) {
+            error_report("Unknown vhost iotlb message type");
+            break;
+        }
+        switch (imsg->type) {
+        case VHOST_IOTLB_MISS:
+            vhost_device_iotlb_miss(dev, imsg->iova,
+                                    imsg->perm != VHOST_ACCESS_RO);
+            break;
+        case VHOST_IOTLB_UPDATE:
+        case VHOST_IOTLB_INVALIDATE:
+            error_report("Unexpected IOTLB message type");
+            break;
+        case VHOST_IOTLB_ACCESS_FAIL:
+            /* FIXME: report device iotlb error */
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+static int vhost_kernel_update_device_iotlb(struct vhost_dev *dev,
+                                            uint64_t iova, uint64_t uaddr,
+                                            uint64_t len,
+                                            IOMMUAccessFlags perm)
+{
+    struct vhost_msg msg = {
+        .type = VHOST_IOTLB_MSG,
+        .iotlb = {
+            .iova = iova,
+            .uaddr = uaddr,
+            .size = len,
+            .type = VHOST_IOTLB_UPDATE,
+        }
+    };
+
+    switch (perm) {
+    case IOMMU_RO:
+        msg.iotlb.perm = VHOST_ACCESS_RO;
+        break;
+    case IOMMU_WO:
+        msg.iotlb.perm = VHOST_ACCESS_WO;
+        break;
+    case IOMMU_RW:
+        msg.iotlb.perm = VHOST_ACCESS_RW;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    if (write((uintptr_t)dev->opaque, &msg, sizeof msg) != sizeof msg) {
+        error_report("Fail to update device iotlb");
+        return -EFAULT;
+    }
+
+    return 0;
+}
+
+static int vhost_kernel_invalidate_device_iotlb(struct vhost_dev *dev,
+                                                uint64_t iova, uint64_t len)
+{
+    struct vhost_msg msg = {
+        .type = VHOST_IOTLB_MSG,
+        .iotlb = {
+            .iova = iova,
+            .size = len,
+            .type = VHOST_IOTLB_INVALIDATE,
+        }
+    };
+
+    if (write((uintptr_t)dev->opaque, &msg, sizeof msg) != sizeof msg) {
+        error_report("Fail to invalidate device iotlb");
+        return -EFAULT;
+    }
+
+    return 0;
+}
+
+static void vhost_kernel_set_iotlb_callback(struct vhost_dev *dev,
+                                           int enabled)
+{
+    if (enabled)
+        qemu_set_fd_handler((uintptr_t)dev->opaque,
+                            vhost_kernel_iotlb_read, NULL, dev);
+    else
+        qemu_set_fd_handler((uintptr_t)dev->opaque, NULL, NULL, NULL);
+}
+
 static const VhostOps kernel_ops = {
         .backend_type = VHOST_BACKEND_TYPE_KERNEL,
         .vhost_backend_init = vhost_kernel_init,
@@ -197,6 +298,9 @@  static const VhostOps kernel_ops = {
         .vhost_set_owner = vhost_kernel_set_owner,
         .vhost_reset_device = vhost_kernel_reset_device,
         .vhost_get_vq_index = vhost_kernel_get_vq_index,
+        .vhost_set_iotlb_callback = vhost_kernel_set_iotlb_callback,
+        .vhost_update_device_iotlb = vhost_kernel_update_device_iotlb,
+        .vhost_invalidate_device_iotlb = vhost_kernel_invalidate_device_iotlb,
 };
 
 int vhost_set_backend_type(struct vhost_dev *dev, VhostBackendType backend_type)
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 3d0c807..94e577b 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -437,7 +437,7 @@  static int vhost_verify_ring_mappings(struct vhost_dev *dev,
             continue;
         }
         l = vq->ring_size;
-        p = cpu_physical_memory_map(vq->ring_phys, &l, 1);
+        p = virtio_memory_map(dev->vdev, vq->ring_phys, &l, 1);
         if (!p || l != vq->ring_size) {
             error_report("Unable to map ring buffer for ring %d", i);
             r = -ENOMEM;
@@ -446,7 +446,7 @@  static int vhost_verify_ring_mappings(struct vhost_dev *dev,
             error_report("Ring buffer relocated for ring %d", i);
             r = -EBUSY;
         }
-        cpu_physical_memory_unmap(p, l, 0, 0);
+        virtio_memory_unmap(dev->vdev, p, l, 0, 0);
     }
     return r;
 }
@@ -674,13 +674,18 @@  static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
     return 0;
 }
 
-static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
+static int vhost_dev_set_features(struct vhost_dev *dev,
+                                  bool enable_log)
 {
     uint64_t features = dev->acked_features;
+    bool has_iommu = mr_has_iommu_ops(virtio_get_dma_as(dev->vdev)->root);
     int r;
     if (enable_log) {
         features |= 0x1ULL << VHOST_F_LOG_ALL;
     }
+    if (has_iommu) {
+        features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
+    }
     r = dev->vhost_ops->vhost_set_features(dev, features);
     if (r < 0) {
         VHOST_OPS_DEBUG("vhost_set_features failed");
@@ -817,6 +822,56 @@  static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
     return -errno;
 }
 
+static int vhost_memory_region_lookup(struct vhost_dev *hdev,
+                                      uint64_t gpa, uint64_t *uaddr,
+                                      uint64_t *len)
+{
+    int i;
+
+    for (i = 0; i < hdev->mem->nregions; i++) {
+        struct vhost_memory_region *reg = hdev->mem->regions + i;
+
+        if (gpa >= reg->guest_phys_addr &&
+            reg->guest_phys_addr + reg->memory_size > gpa) {
+            *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
+            *len = reg->guest_phys_addr + reg->memory_size - gpa;
+            return 0;
+        }
+    }
+
+    return -EFAULT;
+}
+
+void vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
+{
+    IOMMUTLBEntry iotlb;
+    uint64_t uaddr, len;
+
+    rcu_read_lock();
+
+    iotlb = address_space_get_iotlb_entry(virtio_get_dma_as(dev->vdev),
+                                          iova, write);
+    if (iotlb.target_as != NULL) {
+        if (vhost_memory_region_lookup(dev, iotlb.translated_addr,
+                                       &uaddr, &len)) {
+            error_report("Fail to lookup the translated address "
+                         "%"PRIx64, iotlb.translated_addr);
+            goto out;
+        }
+
+        len = MIN(iotlb.addr_mask + 1, len);
+        iova = iova & ~iotlb.addr_mask;
+
+        if (dev->vhost_ops->vhost_update_device_iotlb(dev, iova, uaddr,
+                                                      len, iotlb.perm)) {
+            error_report("Fail to update device iotlb");
+            goto out;
+        }
+    }
+out:
+    rcu_read_unlock();
+}
+
 static int vhost_virtqueue_start(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
@@ -859,21 +914,21 @@  static int vhost_virtqueue_start(struct vhost_dev *dev,
 
     s = l = virtio_queue_get_desc_size(vdev, idx);
     a = virtio_queue_get_desc_addr(vdev, idx);
-    vq->desc = cpu_physical_memory_map(a, &l, 0);
+    vq->desc = virtio_memory_map(vdev, a, &l, 0);
     if (!vq->desc || l != s) {
         r = -ENOMEM;
         goto fail_alloc_desc;
     }
     s = l = virtio_queue_get_avail_size(vdev, idx);
     a = virtio_queue_get_avail_addr(vdev, idx);
-    vq->avail = cpu_physical_memory_map(a, &l, 0);
+    vq->avail = virtio_memory_map(vdev, a, &l, 0);
     if (!vq->avail || l != s) {
         r = -ENOMEM;
         goto fail_alloc_avail;
     }
     vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
     vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
-    vq->used = cpu_physical_memory_map(a, &l, 1);
+    vq->used = virtio_memory_map(vdev, a, &l, 1);
     if (!vq->used || l != s) {
         r = -ENOMEM;
         goto fail_alloc_used;
@@ -881,7 +936,7 @@  static int vhost_virtqueue_start(struct vhost_dev *dev,
 
     vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx);
     vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx);
-    vq->ring = cpu_physical_memory_map(a, &l, 1);
+    vq->ring = virtio_memory_map(vdev, a, &l, 1);
     if (!vq->ring || l != s) {
         r = -ENOMEM;
         goto fail_alloc_ring;
@@ -913,20 +968,19 @@  static int vhost_virtqueue_start(struct vhost_dev *dev,
     }
 
     return 0;
-
 fail_kick:
 fail_alloc:
-    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
-                              0, 0);
+    virtio_memory_unmap(vdev, vq->ring, virtio_queue_get_ring_size(vdev, idx),
+                        0, 0);
 fail_alloc_ring:
-    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
-                              0, 0);
+    virtio_memory_unmap(vdev, vq->used, virtio_queue_get_used_size(vdev, idx),
+                        0, 0);
 fail_alloc_used:
-    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
-                              0, 0);
+    virtio_memory_unmap(vdev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
+                        0, 0);
 fail_alloc_avail:
-    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
-                              0, 0);
+    virtio_memory_unmap(vdev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
+                        0, 0);
 fail_alloc_desc:
     return r;
 }
@@ -959,14 +1013,14 @@  static void vhost_virtqueue_stop(struct vhost_dev *dev,
                                                 vhost_vq_index);
     }
 
-    cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx),
-                              0, virtio_queue_get_ring_size(vdev, idx));
-    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
-                              1, virtio_queue_get_used_size(vdev, idx));
-    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
-                              0, virtio_queue_get_avail_size(vdev, idx));
-    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
-                              0, virtio_queue_get_desc_size(vdev, idx));
+    virtio_memory_unmap(vdev, vq->ring, virtio_queue_get_ring_size(vdev, idx),
+                        0, virtio_queue_get_ring_size(vdev, idx));
+    virtio_memory_unmap(vdev, vq->used, virtio_queue_get_used_size(vdev, idx),
+                        1, virtio_queue_get_used_size(vdev, idx));
+    virtio_memory_unmap(vdev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
+                        0, virtio_queue_get_avail_size(vdev, idx));
+    virtio_memory_unmap(vdev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
+                         0, virtio_queue_get_desc_size(vdev, idx));
 }
 
 static void vhost_eventfd_add(MemoryListener *listener,
@@ -1023,6 +1077,9 @@  static int vhost_virtqueue_init(struct vhost_dev *dev,
         r = -errno;
         goto fail_call;
     }
+
+    vq->dev = dev;
+
     return 0;
 fail_call:
     event_notifier_cleanup(&vq->masked_notifier);
@@ -1034,12 +1091,25 @@  static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
     event_notifier_cleanup(&vq->masked_notifier);
 }
 
+static void vhost_iommu_unmap_notify(Notifier *n, void *data)
+{
+    struct vhost_dev *hdev = container_of(n, struct vhost_dev, n);
+    IOMMUTLBEntry *iotlb = data;
+
+    if (hdev->vhost_ops->vhost_invalidate_device_iotlb(hdev,
+                                                       iotlb->iova,
+                                                       iotlb->addr_mask +1)) {
+        error_report("Fail to invalidate device iotlb");
+    }
+}
+
 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                    VhostBackendType backend_type, uint32_t busyloop_timeout)
 {
     uint64_t features;
     int i, r, n_initialized_vqs = 0;
 
+    hdev->vdev = NULL;
     hdev->migration_blocker = NULL;
 
     r = vhost_set_backend_type(hdev, backend_type);
@@ -1104,6 +1174,8 @@  int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
         .priority = 10
     };
 
+    hdev->n.notify = vhost_iommu_unmap_notify;
+
     if (hdev->migration_blocker == NULL) {
         if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
             error_setg(&hdev->migration_blocker,
@@ -1296,11 +1368,18 @@  int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
     assert(hdev->vhost_ops);
 
     hdev->started = true;
+    hdev->vdev = vdev;
 
     r = vhost_dev_set_features(hdev, hdev->log_enabled);
     if (r < 0) {
         goto fail_features;
     }
+
+    if (mr_has_iommu_ops(virtio_get_dma_as(vdev)->root)) {
+        memory_region_register_iommu_notifier(virtio_get_dma_as(vdev)->root,
+                                              &hdev->n);
+    }
+
     r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
     if (r < 0) {
         VHOST_OPS_DEBUG("vhost_set_mem_table failed");
@@ -1334,7 +1413,22 @@  int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
         }
     }
 
+    hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
+
+    if (mr_has_iommu_ops(virtio_get_dma_as(vdev)->root)) {
+        /* Update used ring information for IOTLB to work correctly */
+        for (i = 0; i < hdev->nvqs; ++i) {
+            struct vhost_virtqueue *vq = hdev->vqs + i;
+            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
+        }
+    }
     return 0;
+#if 0
+fail_iotlb:
+    if (hdev->vhost_ops->vhost_set_vring_enable) {
+        hdev->vhost_ops->vhost_set_vring_enable(hdev, 0);
+    }
+#endif
 fail_log:
     vhost_log_put(hdev, false);
 fail_vq:
@@ -1345,6 +1439,7 @@  fail_vq:
                              hdev->vq_index + i);
     }
     i = hdev->nvqs;
+
 fail_mem:
 fail_features:
 
@@ -1359,6 +1454,7 @@  void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
 
     /* should only be called after backend is connected */
     assert(hdev->vhost_ops);
+    hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
 
     for (i = 0; i < hdev->nvqs; ++i) {
         vhost_virtqueue_stop(hdev,
@@ -1367,8 +1463,13 @@  void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
                              hdev->vq_index + i);
     }
 
+    if (mr_has_iommu_ops(virtio_get_dma_as(vdev)->root)) {
+        memory_region_unregister_iommu_notifier(virtio_get_dma_as(vdev)->root,
+                                                &hdev->n);
+    }
     vhost_log_put(hdev, true);
     hdev->started = false;
+    hdev->vdev = NULL;
 }
 
 int vhost_net_set_backend(struct vhost_dev *hdev,
diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
index cf7f0b5..5cf8c70 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -8,9 +8,12 @@ 
  *
  */
 
+
 #ifndef VHOST_BACKEND_H
 #define VHOST_BACKEND_H
 
+#include "exec/memory.h"
+
 typedef enum VhostBackendType {
     VHOST_BACKEND_TYPE_NONE = 0,
     VHOST_BACKEND_TYPE_KERNEL = 1,
@@ -73,6 +76,14 @@  typedef int (*vhost_migration_done_op)(struct vhost_dev *dev,
 typedef bool (*vhost_backend_can_merge_op)(struct vhost_dev *dev,
                                            uint64_t start1, uint64_t size1,
                                            uint64_t start2, uint64_t size2);
+typedef void (*vhost_set_iotlb_callback_op)(struct vhost_dev *dev,
+                                           int enabled);
+typedef int (*vhost_update_device_iotlb_op)(struct vhost_dev *dev,
+                                            uint64_t iova, uint64_t uaddr,
+                                            uint64_t len,
+                                            IOMMUAccessFlags perm);
+typedef int (*vhost_invalidate_device_iotlb_op)(struct vhost_dev *dev,
+                                                uint64_t iova, uint64_t len);
 
 typedef struct VhostOps {
     VhostBackendType backend_type;
@@ -102,6 +113,9 @@  typedef struct VhostOps {
     vhost_requires_shm_log_op vhost_requires_shm_log;
     vhost_migration_done_op vhost_migration_done;
     vhost_backend_can_merge_op vhost_backend_can_merge;
+    vhost_set_iotlb_callback_op vhost_set_iotlb_callback;
+    vhost_update_device_iotlb_op vhost_update_device_iotlb;
+    vhost_invalidate_device_iotlb_op vhost_invalidate_device_iotlb;
 } VhostOps;
 
 extern const VhostOps user_ops;
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index e433089..b971fa1 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -20,6 +20,7 @@  struct vhost_virtqueue {
     unsigned long long ring_phys;
     unsigned ring_size;
     EventNotifier masked_notifier;
+    struct vhost_dev *dev;
 };
 
 typedef unsigned long vhost_log_chunk_t;
@@ -37,6 +38,7 @@  struct vhost_log {
 
 struct vhost_memory;
 struct vhost_dev {
+    VirtIODevice *vdev;
     MemoryListener memory_listener;
     struct vhost_memory *mem;
     int n_mem_sections;
@@ -61,6 +63,7 @@  struct vhost_dev {
     void *opaque;
     struct vhost_log *log;
     QLIST_ENTRY(vhost_dev) entry;
+    Notifier n;
 };
 
 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
@@ -90,4 +93,5 @@  bool vhost_has_free_slot(void);
 int vhost_net_set_backend(struct vhost_dev *hdev,
                           struct vhost_vring_file *file);
 
+void vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write);
 #endif
diff --git a/include/hw/virtio/virtio-access.h b/include/hw/virtio/virtio-access.h
index 4071dad..3560bba 100644
--- a/include/hw/virtio/virtio-access.h
+++ b/include/hw/virtio/virtio-access.h
@@ -18,6 +18,7 @@ 
 
 #include "hw/virtio/virtio.h"
 #include "hw/virtio/virtio-bus.h"
+#include "sysemu/dma.h"
 #include "exec/address-spaces.h"
 
 #if defined(TARGET_PPC64) || defined(TARGET_ARM)
@@ -200,4 +201,45 @@  static inline void virtio_tswap64s(VirtIODevice *vdev, uint64_t *s)
 {
     *s = virtio_tswap64(vdev, *s);
 }
-#endif /* QEMU_VIRTIO_ACCESS_H */
+
+static inline bool mr_has_iommu_ops(MemoryRegion *mr)
+{
+    if (mr->alias) {
+        return mr_has_iommu_ops(mr->alias);
+    }
+
+    if (mr->iommu_ops)
+        return true;
+    else
+        return false;
+}
+
+static inline void *virtio_memory_map(VirtIODevice *vdev, hwaddr addr,
+                                      hwaddr *plen, int is_write)
+{
+    AddressSpace *dma_as = virtio_get_dma_as(vdev);
+
+    if (!mr_has_iommu_ops(dma_as->root)) {
+      return dma_memory_map(dma_as, addr, plen, is_write ?
+                            DMA_DIRECTION_FROM_DEVICE :
+                            DMA_DIRECTION_TO_DEVICE);
+    } else {
+      return (void *)addr;
+    }
+}
+
+
+static inline void virtio_memory_unmap(VirtIODevice *vdev, void *buffer,
+                                       hwaddr len, int is_write,
+                                       hwaddr access_len)
+{
+    AddressSpace *dma_as = virtio_get_dma_as(vdev);
+
+    if (!mr_has_iommu_ops(dma_as->root)) {
+      dma_memory_unmap(dma_as, buffer, len, is_write ?
+                       DMA_DIRECTION_FROM_DEVICE : DMA_DIRECTION_TO_DEVICE,
+                       access_len);
+    }
+}
+
+#endif /* _QEMU_VIRTIO_ACCESS_H */
diff --git a/net/tap.c b/net/tap.c
index 6abb962..363805e 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -696,6 +696,7 @@  static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer,
                                  "tap: open vhost char device failed");
                 return;
             }
+            fcntl(vhostfd, F_SETFL, O_NONBLOCK);
         }
         options.opaque = (void *)(uintptr_t)vhostfd;