[V4,net-next] vhost_net: device IOTLB support

Message ID 1484109132-3331-1-git-send-email-jasowang@redhat.com
State New

Commit Message

Jason Wang Jan. 11, 2017, 4:32 a.m. UTC
This patch implements Device IOTLB support for vhost kernel. This is
done through:

1) switching to the dma helpers when mapping/unmapping vrings from the
   vhost code
2) introducing a set of VhostOps to:
   - set up the device IOTLB request callback
   - process device IOTLB requests
   - process device IOTLB invalidations
3) kernel support for the Device IOTLB API:
   - allow vhost-net to query IOMMU IOTLB entries through eventfd
   - allow qemu to update a specified mapping of vhost by writing a
     vhost_msg to the vhost fd
   - allow qemu to invalidate a specified iova range in the device
     IOTLB of vhost in the same way. In the x86/intel_iommu case this
     is triggered through the iommu memory region notifier from the
     device IOTLB invalidation descriptor processing routine.

With all the above, kernel vhost_net can co-operate with a userspace
IOMMU. For vhost-user, the support could easily be done on top by
implementing the VhostOps.
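
For reference, a minimal userspace sketch (not part of this patch) of the
kernel protocol described above: a device IOTLB miss arrives via read() on
the vhost char device fd and is answered by writing a VHOST_IOTLB_UPDATE
back on the same fd. translate() is a hypothetical stand-in for the IOMMU
lookup (qemu uses address_space_get_iotlb_entry()); error handling is
minimal:

    #include <linux/vhost.h> /* struct vhost_msg, VHOST_IOTLB_*, VHOST_ACCESS_* */
    #include <stdint.h>
    #include <unistd.h>

    static int service_one_iotlb_miss(int vhost_fd,
                                      uint64_t (*translate)(uint64_t iova,
                                                            uint64_t *len))
    {
        struct vhost_msg msg;
        uint64_t len;

        if (read(vhost_fd, &msg, sizeof msg) != sizeof msg ||
            msg.type != VHOST_IOTLB_MSG ||
            msg.iotlb.type != VHOST_IOTLB_MISS) {
            return -1;
        }

        /* Answer the miss: fill in the translation and send it back. */
        msg.iotlb.uaddr = translate(msg.iotlb.iova, &len); /* iova -> our va */
        msg.iotlb.size = len;
        msg.iotlb.perm = VHOST_ACCESS_RW; /* real code echoes the IOMMU perms */
        msg.iotlb.type = VHOST_IOTLB_UPDATE;

        return write(vhost_fd, &msg, sizeof msg) == sizeof msg ? 0 : -1;
    }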

Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
Changes from V4:
- set iotlb callback only when IOMMU_PLATFORM is negotiated (fix
  vhost-user qtest failure)
- whitelist VIRTIO_F_IOMMU_PLATFORM instead of manually add it
- keep cpu_physical_memory_map() in vhost_memory_map()
---
 hw/net/vhost_net.c                |   1 +
 hw/virtio/vhost-backend.c         |  99 +++++++++++++++++++++++
 hw/virtio/vhost.c                 | 166 +++++++++++++++++++++++++++++++++-----
 include/hw/virtio/vhost-backend.h |  13 +++
 include/hw/virtio/vhost.h         |   4 +
 net/tap.c                         |   1 +
 6 files changed, 262 insertions(+), 22 deletions(-)

Comments

Jason Wang Jan. 11, 2017, 4:36 a.m. UTC | #1
On 2017-01-11 12:32, Jason Wang wrote:
> This patch implements Device IOTLB support for vhost kernel. [...]

Note: the patch is for qemu not net-next :)
Michael S. Tsirkin Jan. 12, 2017, 2:17 p.m. UTC | #2
On Wed, Jan 11, 2017 at 12:32:12PM +0800, Jason Wang wrote:
> This patch implements Device IOTLB support for vhost kernel. [...]
> 
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Applied, thanks!

> ---
> Changes from V4:
> - set iotlb callback only when IOMMU_PLATFORM is negotiated (fix
>   vhost-user qtest failure)

In fact this only checks virtio_host_has_feature - which is
the right thing to do: we can't trust the guest.
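
For context, a rough sketch of the two helpers in qemu's
include/hw/virtio/virtio.h: virtio_host_has_feature() checks the features
qemu itself offers, while virtio_vdev_has_feature() checks what the guest
acked, which is under guest control:

    static inline bool virtio_host_has_feature(VirtIODevice *vdev,
                                               unsigned int fbit)
    {
        /* bit offered by qemu itself: not guest-controlled */
        return virtio_has_feature(vdev->host_features, fbit);
    }

    static inline bool virtio_vdev_has_feature(VirtIODevice *vdev,
                                               unsigned int fbit)
    {
        /* bit acked by the guest: no good for security decisions */
        return virtio_has_feature(vdev->guest_features, fbit);
    }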

> - whitelist VIRTIO_F_IOMMU_PLATFORM instead of manually add it
> - keep cpu_physical_memory_map() in vhost_memory_map()

One further enhancement might be to detect that the guest disabled
the IOMMU (e.g. globally, or using iommu=pt) and disable the IOTLB
to avoid overhead for guests which use DPDK for assigned devices
but not for vhost.


> ---
> [patch body snipped; the full patch is reproduced in the Patch section below]
Jason Wang Jan. 13, 2017, 2:45 a.m. UTC | #3
On 2017-01-12 22:17, Michael S. Tsirkin wrote:
> On Wed, Jan 11, 2017 at 12:32:12PM +0800, Jason Wang wrote:
>> [...]
> One further enhancement might be to detect that the guest disabled
> the IOMMU (e.g. globally, or using iommu=pt) and disable the IOTLB
> to avoid overhead for guests which use DPDK for assigned devices
> but not for vhost.
>
>

Yes, it's in my todo list.

Thanks
Michael S. Tsirkin Jan. 13, 2017, 4:30 p.m. UTC | #4
On Fri, Jan 13, 2017 at 10:45:09AM +0800, Jason Wang wrote:
> 
> On 2017-01-12 22:17, Michael S. Tsirkin wrote:
> > [...]
> > One further enhancement might be to detect that the guest disabled
> > the IOMMU (e.g. globally, or using iommu=pt) and disable the IOTLB
> > to avoid overhead for guests which use DPDK for assigned devices
> > but not for vhost.
> > 
> > 
> 
> Yes, it's in my todo list.
> 
> Thanks

Something that I just noticed is that when the user requests iommu_platform
but vhost cannot provide it, this patch will just let vhost continue
without it. I think that's wrong: since iommu_platform is a security
feature, we should fail init when it's not supported.
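
A minimal sketch of such a check (hypothetical; vhost_check_iommu_platform
is not a real function, and the actual fix was posted as a separate patch):
after VHOST_GET_FEATURES, verify the backend can provide
VIRTIO_F_IOMMU_PLATFORM whenever qemu offers it:

    /* Hypothetical sketch: fail vhost init when iommu_platform was
     * requested but the backend cannot ack VIRTIO_F_IOMMU_PLATFORM.
     * hdev->features holds the backend-supported feature bits. */
    static int vhost_check_iommu_platform(struct vhost_dev *hdev,
                                          VirtIODevice *vdev)
    {
        if (virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM) &&
            !(hdev->features & (0x1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
            error_report("vhost backend lacks VIRTIO_F_IOMMU_PLATFORM");
            return -ENOTSUP; /* fail init instead of silently continuing */
        }
        return 0;
    }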
Jason Wang Jan. 16, 2017, 3:33 a.m. UTC | #5
On 2017-01-14 00:30, Michael S. Tsirkin wrote:
> On Fri, Jan 13, 2017 at 10:45:09AM +0800, Jason Wang wrote:
>> [...]
> Something that I just noticed is that when the user requests iommu_platform
> but vhost cannot provide it, this patch will just let vhost continue
> without it. I think that's wrong: since iommu_platform is a security
> feature, we should fail init when it's not supported.
>

Let me post a fix for this.

Thanks

Patch

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 6280422..22874a9 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -52,6 +52,7 @@  static const int kernel_feature_bits[] = {
     VIRTIO_NET_F_MRG_RXBUF,
     VIRTIO_F_VERSION_1,
     VIRTIO_NET_F_MTU,
+    VIRTIO_F_IOMMU_PLATFORM,
     VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 272a5ec..be927b8 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -185,6 +185,102 @@  static int vhost_kernel_vsock_set_running(struct vhost_dev *dev, int start)
 }
 #endif /* CONFIG_VHOST_VSOCK */
 
+static void vhost_kernel_iotlb_read(void *opaque)
+{
+    struct vhost_dev *dev = opaque;
+    struct vhost_msg msg;
+    ssize_t len;
+
+    while ((len = read((uintptr_t)dev->opaque, &msg, sizeof msg)) > 0) {
+        struct vhost_iotlb_msg *imsg = &msg.iotlb;
+        if (len < sizeof msg) {
+            error_report("Wrong vhost message len: %d", (int)len);
+            break;
+        }
+        if (msg.type != VHOST_IOTLB_MSG) {
+            error_report("Unknown vhost iotlb message type");
+            break;
+        }
+        switch (imsg->type) {
+        case VHOST_IOTLB_MISS:
+            vhost_device_iotlb_miss(dev, imsg->iova,
+                                    imsg->perm != VHOST_ACCESS_RO);
+            break;
+        case VHOST_IOTLB_UPDATE:
+        case VHOST_IOTLB_INVALIDATE:
+            error_report("Unexpected IOTLB message type");
+            break;
+        case VHOST_IOTLB_ACCESS_FAIL:
+            /* FIXME: report device iotlb error */
+            break;
+        default:
+            break;
+        }
+    }
+}
+
+static int vhost_kernel_update_device_iotlb(struct vhost_dev *dev,
+                                            uint64_t iova, uint64_t uaddr,
+                                            uint64_t len,
+                                            IOMMUAccessFlags perm)
+{
+    struct vhost_msg msg;
+    msg.type = VHOST_IOTLB_MSG;
+    msg.iotlb.iova =  iova;
+    msg.iotlb.uaddr = uaddr;
+    msg.iotlb.size = len;
+    msg.iotlb.type = VHOST_IOTLB_UPDATE;
+
+    switch (perm) {
+    case IOMMU_RO:
+        msg.iotlb.perm = VHOST_ACCESS_RO;
+        break;
+    case IOMMU_WO:
+        msg.iotlb.perm = VHOST_ACCESS_WO;
+        break;
+    case IOMMU_RW:
+        msg.iotlb.perm = VHOST_ACCESS_RW;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    if (write((uintptr_t)dev->opaque, &msg, sizeof msg) != sizeof msg) {
+        error_report("Fail to update device iotlb");
+        return -EFAULT;
+    }
+
+    return 0;
+}
+
+static int vhost_kernel_invalidate_device_iotlb(struct vhost_dev *dev,
+                                                uint64_t iova, uint64_t len)
+{
+    struct vhost_msg msg;
+
+    msg.type = VHOST_IOTLB_MSG;
+    msg.iotlb.iova = iova;
+    msg.iotlb.size = len;
+    msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
+
+    if (write((uintptr_t)dev->opaque, &msg, sizeof msg) != sizeof msg) {
+        error_report("Fail to invalidate device iotlb");
+        return -EFAULT;
+    }
+
+    return 0;
+}
+
+static void vhost_kernel_set_iotlb_callback(struct vhost_dev *dev,
+                                           int enabled)
+{
+    if (enabled)
+        qemu_set_fd_handler((uintptr_t)dev->opaque,
+                            vhost_kernel_iotlb_read, NULL, dev);
+    else
+        qemu_set_fd_handler((uintptr_t)dev->opaque, NULL, NULL, NULL);
+}
+
 static const VhostOps kernel_ops = {
         .backend_type = VHOST_BACKEND_TYPE_KERNEL,
         .vhost_backend_init = vhost_kernel_init,
@@ -214,6 +310,9 @@  static const VhostOps kernel_ops = {
         .vhost_vsock_set_guest_cid = vhost_kernel_vsock_set_guest_cid,
         .vhost_vsock_set_running = vhost_kernel_vsock_set_running,
 #endif /* CONFIG_VHOST_VSOCK */
+        .vhost_set_iotlb_callback = vhost_kernel_set_iotlb_callback,
+        .vhost_update_device_iotlb = vhost_kernel_update_device_iotlb,
+        .vhost_invalidate_device_iotlb = vhost_kernel_invalidate_device_iotlb,
 };
 
 int vhost_set_backend_type(struct vhost_dev *dev, VhostBackendType backend_type)
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index d396b22..9cacf55 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -26,6 +26,7 @@ 
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"
 #include "migration/migration.h"
+#include "sysemu/dma.h"
 
 /* enabled until disconnected backend stabilizes */
 #define _VHOST_DEBUG 1
@@ -421,8 +422,36 @@  static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
     dev->log_size = size;
 }
 
+static int vhost_dev_has_iommu(struct vhost_dev *dev)
+{
+    VirtIODevice *vdev = dev->vdev;
+    AddressSpace *dma_as = vdev->dma_as;
+
+    return memory_region_is_iommu(dma_as->root) &&
+           virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
+}
+
+static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
+                              hwaddr *plen, int is_write)
+{
+    if (!vhost_dev_has_iommu(dev)) {
+        return cpu_physical_memory_map(addr, plen, is_write);
+    } else {
+        return (void *)(uintptr_t)addr;
+    }
+}
+
+static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
+                               hwaddr len, int is_write,
+                               hwaddr access_len)
+{
+    if (!vhost_dev_has_iommu(dev)) {
+        cpu_physical_memory_unmap(buffer, len, is_write, access_len);
+    }
+}
 
-static int vhost_verify_ring_part_mapping(void *part,
+static int vhost_verify_ring_part_mapping(struct vhost_dev *dev,
+                                          void *part,
                                           uint64_t part_addr,
                                           uint64_t part_size,
                                           uint64_t start_addr,
@@ -436,14 +465,14 @@  static int vhost_verify_ring_part_mapping(void *part,
         return 0;
     }
     l = part_size;
-    p = cpu_physical_memory_map(part_addr, &l, 1);
+    p = vhost_memory_map(dev, part_addr, &l, 1);
     if (!p || l != part_size) {
         r = -ENOMEM;
     }
     if (p != part) {
         r = -EBUSY;
     }
-    cpu_physical_memory_unmap(p, l, 0, 0);
+    vhost_memory_unmap(dev, p, l, 0, 0);
     return r;
 }
 
@@ -463,21 +492,21 @@  static int vhost_verify_ring_mappings(struct vhost_dev *dev,
         struct vhost_virtqueue *vq = dev->vqs + i;
 
         j = 0;
-        r = vhost_verify_ring_part_mapping(vq->desc, vq->desc_phys,
+        r = vhost_verify_ring_part_mapping(dev, vq->desc, vq->desc_phys,
                                            vq->desc_size, start_addr, size);
         if (!r) {
             break;
         }
 
         j++;
-        r = vhost_verify_ring_part_mapping(vq->avail, vq->avail_phys,
+        r = vhost_verify_ring_part_mapping(dev, vq->avail, vq->avail_phys,
                                            vq->avail_size, start_addr, size);
         if (!r) {
             break;
         }
 
         j++;
-        r = vhost_verify_ring_part_mapping(vq->used, vq->used_phys,
+        r = vhost_verify_ring_part_mapping(dev, vq->used, vq->used_phys,
                                            vq->used_size, start_addr, size);
         if (!r) {
             break;
@@ -715,7 +744,8 @@  static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
     return 0;
 }
 
-static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
+static int vhost_dev_set_features(struct vhost_dev *dev,
+                                  bool enable_log)
 {
     uint64_t features = dev->acked_features;
     int r;
@@ -858,6 +888,56 @@  static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
     return -errno;
 }
 
+static int vhost_memory_region_lookup(struct vhost_dev *hdev,
+                                      uint64_t gpa, uint64_t *uaddr,
+                                      uint64_t *len)
+{
+    int i;
+
+    for (i = 0; i < hdev->mem->nregions; i++) {
+        struct vhost_memory_region *reg = hdev->mem->regions + i;
+
+        if (gpa >= reg->guest_phys_addr &&
+            reg->guest_phys_addr + reg->memory_size > gpa) {
+            *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
+            *len = reg->guest_phys_addr + reg->memory_size - gpa;
+            return 0;
+        }
+    }
+
+    return -EFAULT;
+}
+
+void vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
+{
+    IOMMUTLBEntry iotlb;
+    uint64_t uaddr, len;
+
+    rcu_read_lock();
+
+    iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
+                                          iova, write);
+    if (iotlb.target_as != NULL) {
+        if (vhost_memory_region_lookup(dev, iotlb.translated_addr,
+                                       &uaddr, &len)) {
+            error_report("Fail to lookup the translated address "
+                         "%"PRIx64, iotlb.translated_addr);
+            goto out;
+        }
+
+        len = MIN(iotlb.addr_mask + 1, len);
+        iova = iova & ~iotlb.addr_mask;
+
+        if (dev->vhost_ops->vhost_update_device_iotlb(dev, iova, uaddr,
+                                                      len, iotlb.perm)) {
+            error_report("Fail to update device iotlb");
+            goto out;
+        }
+    }
+out:
+    rcu_read_unlock();
+}
+
 static int vhost_virtqueue_start(struct vhost_dev *dev,
                                 struct VirtIODevice *vdev,
                                 struct vhost_virtqueue *vq,
@@ -903,21 +983,21 @@  static int vhost_virtqueue_start(struct vhost_dev *dev,
 
     vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
     vq->desc_phys = a = virtio_queue_get_desc_addr(vdev, idx);
-    vq->desc = cpu_physical_memory_map(a, &l, 0);
+    vq->desc = vhost_memory_map(dev, a, &l, 0);
     if (!vq->desc || l != s) {
         r = -ENOMEM;
         goto fail_alloc_desc;
     }
     vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
     vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
-    vq->avail = cpu_physical_memory_map(a, &l, 0);
+    vq->avail = vhost_memory_map(dev, a, &l, 0);
     if (!vq->avail || l != s) {
         r = -ENOMEM;
         goto fail_alloc_avail;
     }
     vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
     vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
-    vq->used = cpu_physical_memory_map(a, &l, 1);
+    vq->used = vhost_memory_map(dev, a, &l, 1);
     if (!vq->used || l != s) {
         r = -ENOMEM;
         goto fail_alloc_used;
@@ -963,14 +1043,14 @@  static int vhost_virtqueue_start(struct vhost_dev *dev,
 fail_vector:
 fail_kick:
 fail_alloc:
-    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
-                              0, 0);
+    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
+                       0, 0);
 fail_alloc_used:
-    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
-                              0, 0);
+    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
+                       0, 0);
 fail_alloc_avail:
-    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
-                              0, 0);
+    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
+                       0, 0);
 fail_alloc_desc:
     return r;
 }
@@ -1004,12 +1084,12 @@  static void vhost_virtqueue_stop(struct vhost_dev *dev,
                                                 vhost_vq_index);
     }
 
-    cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx),
-                              1, virtio_queue_get_used_size(vdev, idx));
-    cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx),
-                              0, virtio_queue_get_avail_size(vdev, idx));
-    cpu_physical_memory_unmap(vq->desc, virtio_queue_get_desc_size(vdev, idx),
-                              0, virtio_queue_get_desc_size(vdev, idx));
+    vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
+                       1, virtio_queue_get_used_size(vdev, idx));
+    vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
+                       0, virtio_queue_get_avail_size(vdev, idx));
+    vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
+                       0, virtio_queue_get_desc_size(vdev, idx));
 }
 
 static void vhost_eventfd_add(MemoryListener *listener,
@@ -1066,6 +1146,9 @@  static int vhost_virtqueue_init(struct vhost_dev *dev,
         r = -errno;
         goto fail_call;
     }
+
+    vq->dev = dev;
+
     return 0;
 fail_call:
     event_notifier_cleanup(&vq->masked_notifier);
@@ -1077,12 +1160,24 @@  static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
     event_notifier_cleanup(&vq->masked_notifier);
 }
 
+static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+{
+    struct vhost_dev *hdev = container_of(n, struct vhost_dev, n);
+
+    if (hdev->vhost_ops->vhost_invalidate_device_iotlb(hdev,
+                                                       iotlb->iova,
+                                                       iotlb->addr_mask + 1)) {
+        error_report("Fail to invalidate device iotlb");
+    }
+}
+
 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
                    VhostBackendType backend_type, uint32_t busyloop_timeout)
 {
     uint64_t features;
     int i, r, n_initialized_vqs = 0;
 
+    hdev->vdev = NULL;
     hdev->migration_blocker = NULL;
 
     r = vhost_set_backend_type(hdev, backend_type);
@@ -1147,6 +1242,9 @@  int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
         .priority = 10
     };
 
+    hdev->n.notify = vhost_iommu_unmap_notify;
+    hdev->n.notifier_flags = IOMMU_NOTIFIER_UNMAP;
+
     if (hdev->migration_blocker == NULL) {
         if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
             error_setg(&hdev->migration_blocker,
@@ -1342,11 +1440,18 @@  int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
     assert(hdev->vhost_ops);
 
     hdev->started = true;
+    hdev->vdev = vdev;
 
     r = vhost_dev_set_features(hdev, hdev->log_enabled);
     if (r < 0) {
         goto fail_features;
     }
+
+    if (vhost_dev_has_iommu(hdev)) {
+        memory_region_register_iommu_notifier(vdev->dma_as->root,
+                                              &hdev->n);
+    }
+
     r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
     if (r < 0) {
         VHOST_OPS_DEBUG("vhost_set_mem_table failed");
@@ -1380,6 +1485,16 @@  int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
         }
     }
 
+    if (vhost_dev_has_iommu(hdev)) {
+        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
+
+        /* Update used ring information for IOTLB to work correctly,
+         * vhost-kernel code requires for this.*/
+        for (i = 0; i < hdev->nvqs; ++i) {
+            struct vhost_virtqueue *vq = hdev->vqs + i;
+            vhost_device_iotlb_miss(hdev, vq->used_phys, true);
+        }
+    }
     return 0;
 fail_log:
     vhost_log_put(hdev, false);
@@ -1391,6 +1506,7 @@  fail_vq:
                              hdev->vq_index + i);
     }
     i = hdev->nvqs;
+
 fail_mem:
 fail_features:
 
@@ -1413,8 +1529,14 @@  void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
                              hdev->vq_index + i);
     }
 
+    if (vhost_dev_has_iommu(hdev)) {
+        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
+        memory_region_unregister_iommu_notifier(vdev->dma_as->root,
+                                                &hdev->n);
+    }
     vhost_log_put(hdev, true);
     hdev->started = false;
+    hdev->vdev = NULL;
 }
 
 int vhost_net_set_backend(struct vhost_dev *hdev,
diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
index 30abc11..c3cf4a7 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -11,6 +11,8 @@ 
 #ifndef VHOST_BACKEND_H
 #define VHOST_BACKEND_H
 
+#include "exec/memory.h"
+
 typedef enum VhostBackendType {
     VHOST_BACKEND_TYPE_NONE = 0,
     VHOST_BACKEND_TYPE_KERNEL = 1,
@@ -77,6 +79,14 @@  typedef bool (*vhost_backend_can_merge_op)(struct vhost_dev *dev,
 typedef int (*vhost_vsock_set_guest_cid_op)(struct vhost_dev *dev,
                                             uint64_t guest_cid);
 typedef int (*vhost_vsock_set_running_op)(struct vhost_dev *dev, int start);
+typedef void (*vhost_set_iotlb_callback_op)(struct vhost_dev *dev,
+                                           int enabled);
+typedef int (*vhost_update_device_iotlb_op)(struct vhost_dev *dev,
+                                            uint64_t iova, uint64_t uaddr,
+                                            uint64_t len,
+                                            IOMMUAccessFlags perm);
+typedef int (*vhost_invalidate_device_iotlb_op)(struct vhost_dev *dev,
+                                                uint64_t iova, uint64_t len);
 
 typedef struct VhostOps {
     VhostBackendType backend_type;
@@ -109,6 +119,9 @@  typedef struct VhostOps {
     vhost_backend_can_merge_op vhost_backend_can_merge;
     vhost_vsock_set_guest_cid_op vhost_vsock_set_guest_cid;
     vhost_vsock_set_running_op vhost_vsock_set_running;
+    vhost_set_iotlb_callback_op vhost_set_iotlb_callback;
+    vhost_update_device_iotlb_op vhost_update_device_iotlb;
+    vhost_invalidate_device_iotlb_op vhost_invalidate_device_iotlb;
 } VhostOps;
 
 extern const VhostOps user_ops;
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 1fe5aad..52f633e 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -21,6 +21,7 @@  struct vhost_virtqueue {
     unsigned long long used_phys;
     unsigned used_size;
     EventNotifier masked_notifier;
+    struct vhost_dev *dev;
 };
 
 typedef unsigned long vhost_log_chunk_t;
@@ -38,6 +39,7 @@  struct vhost_log {
 
 struct vhost_memory;
 struct vhost_dev {
+    VirtIODevice *vdev;
     MemoryListener memory_listener;
     struct vhost_memory *mem;
     int n_mem_sections;
@@ -62,6 +64,7 @@  struct vhost_dev {
     void *opaque;
     struct vhost_log *log;
     QLIST_ENTRY(vhost_dev) entry;
+    IOMMUNotifier n;
 };
 
 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
@@ -91,4 +94,5 @@  bool vhost_has_free_slot(void);
 int vhost_net_set_backend(struct vhost_dev *hdev,
                           struct vhost_vring_file *file);
 
+void vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write);
 #endif
diff --git a/net/tap.c b/net/tap.c
index b6896a7..86071b2 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -696,6 +696,7 @@  static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer,
                                  "tap: open vhost char device failed");
                 return;
             }
+            fcntl(vhostfd, F_SETFL, O_NONBLOCK);
         }
         options.opaque = (void *)(uintptr_t)vhostfd;