Patchwork [RFC,v2] VFIO based device assignment

login
register
mail settings
Submitter Alex Williamson
Date Nov. 5, 2010, 8:16 p.m.
Message ID <20101105200558.26484.87430.stgit@s20.home>
Download mbox | patch
Permalink /patch/70291/
State New
Headers show

Comments

Alex Williamson - Nov. 5, 2010, 8:16 p.m.
This is an update to the original QEMU VFIO driver I sent several
months ago.  Since then, I've ported it to the qemu-kvm.git tree,
and back, incorporated common msix and msi code, added irqfd
support, created an irqfd-like mechanism for EOIs, and likely
numerous other things that I've forgotten.  I have quite a few
outstanding patches that enable things needed here, so I thought
it would be useful to see what the current code snapshot looks
like.  Please review and comment.  Thanks,

Alex

--

This patch adds qemu device assignment support using the proposed
VFIO/UIOMMU kernel interfaces.  The existing KVM-only device assignment
code makes use of various pci sysfs files for config space, MMIO BAR
mapping, and misc other config items.  It then jumps over to KVM-specific
ioctls for enabling interrupts and assigning devices to IOMMU domains.
Finally, IO-port support uses in/out directly.  This is a messy model
to support and causes numerous issues when we try to allow unprivileged
users to access PCI devices.

VFIO/UIOMMU reduces this to two interfaces, /dev/vfioX and /dev/uiommu.
The VFIO device file provides all the necessary support for accessing
PCI config space, read/write/mmap BARs (including IO-port space),
configuring INTx/MSI/MSI-X interrupts and setting up DMA mapping.  The
UIOMMU interface allows iommu domains to be created, and via vfio,
devices can be bound to a domain.  This provides an easier model to
support (IMHO) and removes the bindings that make current device
assignment only useable for KVM enabled guests.

Usage is similar to KVM device assignment.  Rather than binding the
device to the pci-stub driver, vfio devices need to be bound to the
vfio driver.  From there, it's a simple matter of specifying the
device as:

-device vfio,host=01:00.0

This example requires either root privileges or proper permissions on
/dev/uiommu and /dev/vfioX.  To support unprivileged operation, the
options vfiofd= and uiommufd= are available.  Depending on the usage
of uiommufd, each guest device can be assigned to the same iommu
domain, or to independent iommu domains.  In the example above, each
device is assigned to a separate iommu domain.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 Makefile.target |    1 
 hw/linux-vfio.h |  273 +++++++++++
 hw/vfio.c       | 1398 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/vfio.h       |   68 +++
 4 files changed, 1740 insertions(+), 0 deletions(-)
 create mode 100644 hw/linux-vfio.h
 create mode 100644 hw/vfio.c
 create mode 100644 hw/vfio.h

Patch

diff --git a/Makefile.target b/Makefile.target
index 91e6e74..f67490a 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -203,6 +203,7 @@  obj-i386-y += vmmouse.o vmport.o hpet.o applesmc.o
 obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
 obj-i386-y += debugcon.o multiboot.o
 obj-i386-y += pc_piix.o
+obj-i386-y += vfio.o
 
 # shared objects
 obj-ppc-y = ppc.o
diff --git a/hw/linux-vfio.h b/hw/linux-vfio.h
new file mode 100644
index 0000000..5f2e52e
--- /dev/null
+++ b/hw/linux-vfio.h
@@ -0,0 +1,273 @@ 
+/*
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Portions derived from drivers/uio/uio.c:
+ * Copyright(C) 2005, Benedikt Spranger <b.spranger@linutronix.de>
+ * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2006, Hans J. Koch <hjk@linutronix.de>
+ * Copyright(C) 2006, Greg Kroah-Hartman <greg@kroah.com>
+ *
+ * Portions derived from drivers/uio/uio_pci_generic.c:
+ * Copyright (C) 2009 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ */
+#include <linux/types.h>
+
+/*
+ * VFIO driver - allow mapping and use of certain PCI devices
+ * in unprivileged user processes. (If IOMMU is present)
+ * Especially useful for Virtual Function parts of SR-IOV devices
+ */
+
+#ifdef __KERNEL__
+
+struct vfio_nl_client {
+	struct list_head	list;
+	u64			msgcap;
+	struct net		*net;
+	u32			pid;
+};
+
+struct perm_bits;
+struct vfio_dev {
+	struct device	*dev;
+	struct pci_dev	*pdev;
+	char		name[8];
+	u8		*pci_config_map;
+	int		pci_config_size;
+	int		devnum;
+	void __iomem	*barmap[PCI_ROM_RESOURCE+1];
+	spinlock_t	irqlock;	/* guards command register accesses */
+	int		listeners;
+	u32		locked_pages;
+	struct mutex	lgate;		/* listener gate */
+	struct mutex	dgate;		/* dma op gate */
+	struct mutex	igate;		/* intr op gate */
+	struct mutex	ngate;		/* netlink op gate */
+	struct list_head nlc_list;	/* netlink clients */
+	wait_queue_head_t dev_idle_q;
+	wait_queue_head_t nl_wait_q;
+	u32		nl_reply_seq;
+	u32		nl_reply_value;
+	int		mapcount;
+	struct uiommu_domain	*udomain;
+	int			cachec;
+	struct msix_entry	*msix;
+	struct eventfd_ctx	*ev_irq;
+	struct eventfd_ctx	**ev_msi;
+	struct eventfd_ctx	**ev_msix;
+	int			msi_nvec;
+	int			msix_nvec;
+	u8		*vconfig;
+	u32		rbar[7];	/* copies of real bars */
+	u8		msi_qmax;
+	u8		bardirty;
+	struct perm_bits	*msi_perm;
+};
+
+struct vfio_listener {
+	struct vfio_dev	*vdev;
+	struct list_head	dm_list;
+	struct mm_struct	*mm;
+	struct mmu_notifier	mmu_notifier;
+};
+
+/*
+ * Structure for keeping track of memory nailed down by the
+ * user for DMA
+ */
+struct dma_map_page {
+	struct list_head list;
+	struct page     **pages;
+	dma_addr_t      daddr;
+	unsigned long	vaddr;
+	int		npage;
+	int		rdwr;
+};
+
+/* VFIO class infrastructure */
+struct vfio_class {
+	struct kref kref;
+	struct class *class;
+};
+extern struct vfio_class *vfio_class;
+
+ssize_t vfio_io_readwrite(int, struct vfio_dev *,
+			char __user *, size_t, loff_t *);
+ssize_t vfio_mem_readwrite(int, struct vfio_dev *,
+			char __user *, size_t, loff_t *);
+ssize_t vfio_config_readwrite(int, struct vfio_dev *,
+			char __user *, size_t, loff_t *);
+
+void vfio_drop_msi(struct vfio_dev *);
+void vfio_drop_msix(struct vfio_dev *);
+int vfio_setup_msi(struct vfio_dev *, int, void __user *);
+int vfio_setup_msix(struct vfio_dev *, int, void __user *);
+
+#ifndef PCI_MSIX_ENTRY_SIZE
+#define	PCI_MSIX_ENTRY_SIZE	16
+#endif
+#ifndef PCI_STATUS_INTERRUPT
+#define	PCI_STATUS_INTERRUPT	0x08
+#endif
+
+struct vfio_dma_map;
+void vfio_dma_unmapall(struct vfio_listener *);
+int vfio_dma_unmap_dm(struct vfio_listener *, struct vfio_dma_map *);
+int vfio_dma_map_common(struct vfio_listener *, unsigned int,
+			struct vfio_dma_map *);
+int vfio_domain_set(struct vfio_dev *, int, int);
+int vfio_domain_unset(struct vfio_dev *);
+
+int vfio_class_init(void);
+void vfio_class_destroy(void);
+int vfio_dev_add_attributes(struct vfio_dev *);
+int vfio_build_config_map(struct vfio_dev *);
+
+int vfio_nl_init(void);
+void vfio_nl_freeclients(struct vfio_dev *);
+void vfio_nl_exit(void);
+int vfio_nl_remove(struct vfio_dev *);
+int vfio_validate(struct vfio_dev *);
+int vfio_nl_upcall(struct vfio_dev *, u8, int, int);
+void vfio_pm_process_reply(int);
+pci_ers_result_t vfio_error_detected(struct pci_dev *, pci_channel_state_t);
+pci_ers_result_t vfio_mmio_enabled(struct pci_dev *);
+pci_ers_result_t vfio_link_reset(struct pci_dev *);
+pci_ers_result_t vfio_slot_reset(struct pci_dev *);
+void vfio_error_resume(struct pci_dev *);
+#define VFIO_ERROR_REPLY_TIMEOUT	(3*HZ)
+#define VFIO_SUSPEND_REPLY_TIMEOUT	(5*HZ)
+
+irqreturn_t vfio_interrupt(int, void *);
+
+#endif	/* __KERNEL__ */
+
+/* Kernel & User level defines for ioctls */
+
+/*
+ * Structure for DMA mapping of user buffers
+ * vaddr, dmaaddr, and size must all be page aligned
+ * buffer may only be larger than 1 page if (a) there is
+ * an iommu in the system, or (b) buffer is part of a huge page
+ */
+struct vfio_dma_map {
+	__u64	vaddr;		/* process virtual addr */
+	__u64	dmaaddr;	/* desired and/or returned dma address */
+	__u64	size;		/* size in bytes */
+	__u64	flags;		/* bool: 0 for r/o; 1 for r/w */
+#define	VFIO_FLAG_WRITE		0x1	/* req writeable DMA mem */
+};
+
+/* map user pages at specific dma address */
+/* requires previous VFIO_DOMAIN_SET */
+#define	VFIO_DMA_MAP_IOVA	_IOWR(';', 101, struct vfio_dma_map)
+
+/* unmap user pages */
+#define	VFIO_DMA_UNMAP		_IOW(';', 102, struct vfio_dma_map)
+
+/* request IRQ interrupts; use given eventfd */
+#define	VFIO_EVENTFD_IRQ	_IOW(';', 103, int)
+
+/* Request MSI interrupts: arg[0] is #, arg[1-n] are eventfds */
+#define	VFIO_EVENTFDS_MSI	_IOW(';', 104, int)
+
+/* Request MSI-X interrupts: arg[0] is #, arg[1-n] are eventfds */
+#define	VFIO_EVENTFDS_MSIX	_IOW(';', 105, int)
+
+/* Get length of a BAR */
+#define	VFIO_BAR_LEN		_IOWR(';', 167, __u32)
+
+/* Set the IOMMU domain - arg is fd from uiommu driver */
+#define	VFIO_DOMAIN_SET		_IOW(';', 107, int)
+
+/* Unset the IOMMU domain */
+#define	VFIO_DOMAIN_UNSET	_IO(';', 108)
+
+/* Re-enable INTx */
+#define VFIO_IRQ_EOI            _IO(';', 109)
+
+/* Re-enable INTx via eventfd */
+#define VFIO_IRQ_EOI_EVENTFD    _IOW(';', 110, int)
+
+/*
+ * Reads, writes, and mmaps determine which PCI BAR (or config space)
+ * from the high level bits of the file offset
+ */
+#define	VFIO_PCI_BAR0_RESOURCE		0x0
+#define	VFIO_PCI_BAR1_RESOURCE		0x1
+#define	VFIO_PCI_BAR2_RESOURCE		0x2
+#define	VFIO_PCI_BAR3_RESOURCE		0x3
+#define	VFIO_PCI_BAR4_RESOURCE		0x4
+#define	VFIO_PCI_BAR5_RESOURCE		0x5
+#define	VFIO_PCI_ROM_RESOURCE		0x6
+#define	VFIO_PCI_CONFIG_RESOURCE	0xF
+#define	VFIO_PCI_SPACE_SHIFT	32
+#define VFIO_PCI_CONFIG_OFF vfio_pci_space_to_offset(VFIO_PCI_CONFIG_RESOURCE)
+
+static inline int vfio_offset_to_pci_space(__u64 off)
+{
+	return (off >> VFIO_PCI_SPACE_SHIFT) & 0xF;
+}
+
+static inline __u32 vfio_offset_to_pci_offset(__u64 off)
+{
+	return off & (__u32)0xFFFFFFFF;
+}
+
+static inline __u64 vfio_pci_space_to_offset(int sp)
+{
+	return (__u64)(sp) << VFIO_PCI_SPACE_SHIFT;
+}
+
+/*
+ * Netlink defines:
+ */
+#define VFIO_GENL_NAME	"VFIO"
+
+/* message types */
+enum {
+	VFIO_MSG_INVAL = 0,
+	/* kernel to user */
+	VFIO_MSG_REMOVE,		/* unbind, module or hotplug remove */
+	VFIO_MSG_ERROR_DETECTED,	/* pci err handling - error detected */
+	VFIO_MSG_MMIO_ENABLED,		/* pci err handling - mmio enabled */
+	VFIO_MSG_LINK_RESET,		/* pci err handling - link reset */
+	VFIO_MSG_SLOT_RESET,		/* pci err handling - slot reset */
+	VFIO_MSG_ERROR_RESUME,		/* pci err handling - resume normal */
+	VFIO_MSG_PM_SUSPEND,		/* suspend or hibernate notification */
+	VFIO_MSG_PM_RESUME,		/* resume after suspend or hibernate */
+	/* user to kernel */
+	VFIO_MSG_REGISTER,
+	VFIO_MSG_ERROR_HANDLING_REPLY,	/* err handling reply */
+	VFIO_MSG_PM_SUSPEND_REPLY,	/* suspend notify reply */
+};
+
+/* attributes */
+enum {
+	VFIO_ATTR_UNSPEC,
+	VFIO_ATTR_MSGCAP,	/* bitmask of messages desired */
+	VFIO_ATTR_PCI_DOMAIN,
+	VFIO_ATTR_PCI_BUS,
+	VFIO_ATTR_PCI_SLOT,
+	VFIO_ATTR_PCI_FUNC,
+	VFIO_ATTR_CHANNEL_STATE,
+	VFIO_ATTR_ERROR_HANDLING_REPLY,
+	VFIO_ATTR_PM_SUSPEND_REPLY,
+	__VFIO_NL_ATTR_MAX
+};
+#define VFIO_NL_ATTR_MAX (__VFIO_NL_ATTR_MAX - 1)
diff --git a/hw/vfio.c b/hw/vfio.c
new file mode 100644
index 0000000..922a47a
--- /dev/null
+++ b/hw/vfio.c
@@ -0,0 +1,1398 @@ 
+/*
+ * vfio based device assignment support
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Authors:
+ *  Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ *  Adapted for KVM by Qumranet.
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+
+#include <dirent.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "config.h"
+#include "event_notifier.h"
+#include "hw.h"
+#include "kvm.h"
+#include "memory.h"
+#include "monitor.h"
+#include "msi.h"
+#include "msix.h"
+#include "pc.h"
+#include "qemu-error.h"
+#include "range.h"
+#include "vfio.h"
+#include <pci/header.h>
+#include <pci/types.h>
+#include <linux/types.h>
+#include "linux-vfio.h"
+
+//#define DEBUG_VFIO
+#ifdef DEBUG_VFIO
+#define DPRINTF(fmt, ...) \
+    do { printf("vfio: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+/* TODO: msix.h should define these */
+#define MSIX_CAP_LENGTH 12
+#define MSIX_PAGE_SIZE 0x1000
+
+/* XXX: on qemu-kvm.git we have msix/intx notifiers and irqfds.  With these
+ * we can allow interrupts to bypass userspace.  There's no good #define to
+ * figure out when these are present, so we toggle on the device assignment
+ * ifdef even though it has no relation to the bits we're looking for. */
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
+#define QEMU_KVM_BUILD
+#endif
+
+static void vfio_disable_interrupts(VFIODevice *vdev);
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
+static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
+                                  uint32_t val, int len);
+/*
+ * Generic
+ */
+static uint8_t pci_find_cap_offset(PCIDevice *pdev, uint8_t cap)
+{
+    int id;
+    int max_cap = 48;
+    int pos = PCI_CAPABILITY_LIST;
+    int status;
+
+    status = pdev->config[PCI_STATUS];
+    if ((status & PCI_STATUS_CAP_LIST) == 0) {
+        return 0;
+    }
+
+    while (max_cap--) {
+        pos = pdev->config[pos];
+        if (pos < 0x40) {
+            break;
+        }
+
+        pos &= ~3;
+        id = pdev->config[pos + PCI_CAP_LIST_ID];
+
+        if (id == 0xff) {
+            break;
+        }
+        if (id == cap) {
+            return pos;
+        }
+
+        pos += PCI_CAP_LIST_NEXT;
+    }
+    return 0;
+}
+
+static int parse_hostaddr(DeviceState *qdev, Property *prop, const char *str)
+{
+    PCIHostDevice *ptr = qdev_get_prop_ptr(qdev, prop);
+    const char *p = str;
+    int n, seg, bus, dev, func;
+    char field[5];
+
+    if (sscanf(p, "%4[^:]%n", field, &n) != 1 || p[n] != ':') {
+        return -EINVAL;
+    }
+
+    seg = strtol(field, NULL, 16);
+    p += n + 1;
+
+    if (sscanf(p, "%4[^:]%n", field, &n) != 1) {
+        return -EINVAL;
+    }
+
+    if (p[n] == ':') {
+        bus = strtol(field, NULL, 16);
+        p += n + 1;
+    } else {
+        bus = seg;
+        seg = 0;
+    }
+
+    if (sscanf(p, "%4[^.]%n", field, &n) != 1 || p[n] != '.') {
+        return -EINVAL;
+    }
+
+    dev = strtol(field, NULL, 16);
+    p += n + 1;
+
+    if (!qemu_isdigit(*p)) {
+        return -EINVAL;
+    }
+
+    func = *p - '0';
+
+    ptr->seg = seg;
+    ptr->bus = bus;
+    ptr->dev = dev;
+    ptr->func = func;
+    return 0;
+}
+
+static int print_hostaddr(DeviceState *qdev, Property *prop,
+                          char *dest, size_t len)
+{
+    PCIHostDevice *ptr = qdev_get_prop_ptr(qdev, prop);
+
+    return snprintf(dest, len, "%04x:%02x:%02x.%x",
+                    ptr->seg, ptr->bus, ptr->dev, ptr->func);
+}
+
+/*
+ * INTx
+ */
+static inline void vfio_unmask_intx(VFIODevice *vdev)
+{
+    ioctl(vdev->vfiofd, VFIO_IRQ_EOI);
+}
+
+static void vfio_intx_interrupt(void *opaque)
+{
+    VFIODevice *vdev = opaque;
+
+    if (!event_notifier_test_and_clear(&vdev->intx.notifier)) {
+        return;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __FUNCTION__, vdev->host.seg,
+            vdev->host.bus, vdev->host.dev, vdev->host.func,
+            'A' + vdev->intx.pin);
+
+    vdev->intx.pending = true;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
+}
+
+static void vfio_eoi(ioapic_eoi_client *client)
+{
+    VFIODevice *vdev = container_of(client, VFIODevice, intx.eoi_client);
+
+    if (!vdev->intx.irqfd_enabled) {
+        if (!vdev->intx.pending) {
+            return;
+        }
+
+        vdev->intx.pending = false;
+
+        /* If the interrupt is injected via qemu (not irqfd), we need to
+         * deassert the interrupt here so qemu knows about the level change.
+         * Otherwise the next interrupt won't make it out of qemu.  Interrupts
+         * via irqfd are completely outside of qemu, so we can skip it. */
+        qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __FUNCTION__, vdev->host.seg,
+            vdev->host.bus, vdev->host.dev, vdev->host.func);
+
+    vfio_unmask_intx(vdev);
+}
+
+/* Wrappers for EOI client setup that allow VFIO to directly consume the
+ * eventfd from KVM.  This serves the same purpose as irqfd for the EOI. */
+static int vfio_enable_eoi_client(VFIODevice *vdev)
+{
+    int fd, ret;
+
+    ret = ioapic_register_eoi_client(&vdev->intx.eoi_client);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* Exit here is ok, just means EOIs bounce through qemu */
+    fd = ioapic_eoi_client_get_fd(&vdev->intx.eoi_client);
+    if (fd < 0) {
+        return 0;
+    }
+
+    if (ioctl(vdev->vfiofd, VFIO_IRQ_EOI_EVENTFD, &fd) < 0) {
+        ret = -errno; /* ioctl() returns -1; the real cause is in errno */
+        fprintf(stderr, "vfio: VFIO_IRQ_EOI_EVENTFD setup - %s (%d)\n",
+                strerror(-ret), ret);
+        return ret;
+    }
+    qemu_set_fd_handler(fd, NULL, NULL, NULL);
+
+    return 0;
+}
+
+static void vfio_disable_eoi_client(VFIODevice *vdev)
+{
+    int fd = -1;
+
+    ioapic_unregister_eoi_client(&vdev->intx.eoi_client);
+    ioctl(vdev->vfiofd, VFIO_IRQ_EOI_EVENTFD, &fd);
+}
+
+/* Attempt to send the VFIO eventfd directly into the KVM irqchip */
+static void vfio_set_intx_handler(VFIODevice *vdev, IOHandler *fd_read,
+                                  bool irqfd_enable)
+{
+    int fd = event_notifier_get_fd(&vdev->intx.notifier);
+#ifdef QEMU_KVM_BUILD
+    int ret;
+
+    ret = kvm_set_irqfd(vdev->intx.eoi_client.irq, fd, irqfd_enable);
+    if (ret < 0) {
+        if (kvm_enabled() && kvm_irqchip_in_kernel()) {
+            fprintf(stderr, "vfio: Error: irqfd %s failed - %s (%d)\n",
+                    irqfd_enable ? "enable" : "disable", strerror(-ret), ret);
+            goto out;
+        }
+    }
+
+    vdev->intx.irqfd_enabled = irqfd_enable;
+out:
+#endif
+    if (vdev->intx.irqfd_enabled) {
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+    } else {
+        qemu_set_fd_handler(fd, fd_read, NULL, vdev);
+    }
+}
+
+static void vfio_update_irqs(PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    int irq = pci_get_irq(pdev, vdev->intx.pin);
+
+    if (irq == vdev->intx.eoi_client.irq) {
+        return;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __FUNCTION__,
+            vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, vdev->intx.eoi_client.irq, irq);
+
+    vfio_set_intx_handler(vdev, vfio_intx_interrupt, false);
+    vfio_disable_eoi_client(vdev);
+
+    vdev->intx.eoi_client.irq = irq;
+
+    if (irq < 0) {
+        fprintf(stderr, "vfio: Error - INTx moved to IRQ %d\n", irq);
+        return;
+    }
+
+    vfio_enable_eoi_client(vdev);
+    vfio_set_intx_handler(vdev, vfio_intx_interrupt, true);
+
+    /* Re-enable the interrupt in case we missed an EOI */
+    vfio_eoi(&vdev->intx.eoi_client);
+}
+
+static int vfio_enable_intx(VFIODevice *vdev)
+{
+    int fd;
+    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
+
+    if (!pin) {
+        return 0;
+    }
+
+    vfio_disable_interrupts(vdev);
+
+    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
+    vdev->intx.eoi_client.eoi = vfio_eoi;
+    vdev->intx.eoi_client.irq = pci_get_irq(&vdev->pdev, vdev->intx.pin);
+
+    vfio_enable_eoi_client(vdev);
+
+    pci_register_update_irqs(&vdev->pdev, vfio_update_irqs);
+
+    if (event_notifier_init(&vdev->intx.notifier, 0)) {
+        fprintf(stderr, "vfio: Error: event_notifier_init failed\n");
+        return -1;
+    }
+
+    vfio_set_intx_handler(vdev, vfio_intx_interrupt, true);
+
+    fd = event_notifier_get_fd(&vdev->intx.notifier);
+
+    if (ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd)) {
+        fprintf(stderr, "vfio: Error: Failed to setup INTx fd %s\n",
+                strerror(errno));
+        return -1;
+    }
+    vfio_unmask_intx(vdev);
+
+    vdev->interrupt = INT_INTx;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __FUNCTION__, vdev->host.seg,
+            vdev->host.bus, vdev->host.dev, vdev->host.func);
+
+    return 0;
+}
+
+static void vfio_disable_intx(VFIODevice *vdev)
+{
+    int fd = -1;
+
+    if (vdev->interrupt != INT_INTx) {
+        return;
+    }
+
+    pci_register_update_irqs(&vdev->pdev, NULL);
+    vfio_set_intx_handler(vdev, NULL, false);
+    vfio_disable_eoi_client(vdev);
+    ioctl(vdev->vfiofd, VFIO_EVENTFD_IRQ, &fd);
+    event_notifier_cleanup(&vdev->intx.notifier);
+    vdev->interrupt = INT_NONE;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __FUNCTION__, vdev->host.seg,
+            vdev->host.bus, vdev->host.dev, vdev->host.func);
+}
+
+/*
+ * MSI-X
+ */
+static void vfio_msix_interrupt(void *opaque)
+{
+    MSIVector *vec = opaque;
+    VFIODevice *vdev = vec->vdev;
+
+    if (!event_notifier_test_and_clear(&vec->notifier)) {
+        return;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __FUNCTION__, vdev->host.seg,
+            vdev->host.bus, vdev->host.dev, vdev->host.func, vec->vector);
+
+    msix_notify(&vdev->pdev, vec->vector);
+}
+
+#ifdef QEMU_KVM_BUILD
+/* When a vector is masked, we disable the irqfd, forcing the interrupt
+ * through qemu userspace.  We can then filter masked vectors in msix_notify. */
+static int vfio_msix_mask_notify(PCIDevice *pdev, unsigned vector, int masked)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    int fd, ret;
+
+    fd = event_notifier_get_fd(&vdev->msi_vectors[vector].notifier);
+    ret = kvm_set_irqfd(pdev->msix_irq_entries[vector].gsi, fd, !masked);
+    if (ret == -ENOSYS) {
+        return 0; /* w/o irqfd, interrupts pass through qemu anyway */
+    } else if (ret < 0) {
+        fprintf(stderr, "vfio: Error - irqfd setup failed\n");
+        return ret;
+    }
+
+    if (masked) {
+        qemu_set_fd_handler(fd, vfio_msix_interrupt, NULL,
+                            &vdev->msi_vectors[vector]);
+    } else {
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+    }
+
+    return ret;
+}
+#endif
+
+static void vfio_enable_msix(VFIODevice *vdev)
+{
+    int i, *fds;
+
+    vfio_disable_interrupts(vdev);
+
+    vdev->nr_vectors = vdev->pdev.msix_entries_nr;
+    vdev->msi_vectors = qemu_malloc(vdev->nr_vectors * sizeof(MSIVector));
+
+    fds = qemu_malloc((vdev->nr_vectors + 1) * sizeof(int));
+    fds[0] = vdev->nr_vectors;
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        vdev->msi_vectors[i].vdev = vdev;
+        vdev->msi_vectors[i].vector = i;
+
+        if (event_notifier_init(&vdev->msi_vectors[i].notifier, 0)) {
+            fprintf(stderr, "vfio: Error: event_notifier_init failed\n");
+        }
+
+        fds[i + 1] = event_notifier_get_fd(&vdev->msi_vectors[i].notifier);
+        qemu_set_fd_handler(fds[i + 1], vfio_msix_interrupt, NULL,
+                            &vdev->msi_vectors[i]);
+
+        if (msix_vector_use(&vdev->pdev, i) < 0) {
+            fprintf(stderr, "vfio: Error msix_vector_use\n");
+        }
+    }
+
+    if (ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, fds)) {
+        fprintf(stderr, "vfio: Error: Failed to setup MSIX fds %s\n",
+                strerror(errno));
+        qemu_free(fds);
+        return;
+    }
+
+    vdev->interrupt = INT_MSIX;
+
+    qemu_free(fds);
+
+#ifdef QEMU_KVM_BUILD
+    if (msix_set_mask_notifier(&vdev->pdev, vfio_msix_mask_notify)) {
+        fprintf(stderr, "vfio: Error msix_set_mask_notifier\n");
+    }
+#endif
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d vectors\n", __FUNCTION__,
+            vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, vdev->nr_vectors);
+}
+
+static void vfio_disable_msix(VFIODevice *vdev)
+{
+    int i, vectors = 0;
+
+    if (vdev->interrupt != INT_MSIX) {
+        return;
+    }
+
+    ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSIX, &vectors);
+
+#ifdef QEMU_KVM_BUILD
+    if (msix_unset_mask_notifier(&vdev->pdev)) {
+        fprintf(stderr, "vfio: Error msix_unset_mask_notifier\n");
+    }
+#endif
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        int fd = event_notifier_get_fd(&vdev->msi_vectors[i].notifier);
+
+        msix_vector_unuse(&vdev->pdev, i);
+
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+        event_notifier_cleanup(&vdev->msi_vectors[i].notifier);
+    }
+
+    qemu_free(vdev->msi_vectors);
+    vdev->nr_vectors = 0;
+    vdev->interrupt = INT_NONE;
+    vfio_enable_intx(vdev);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __FUNCTION__, vdev->host.seg,
+            vdev->host.bus, vdev->host.dev, vdev->host.func);
+}
+
+/*
+ * MSI
+ */
+static void vfio_msi_interrupt(void *opaque)
+{
+    MSIVector *vec = opaque;
+    VFIODevice *vdev = vec->vdev;
+
+    if (!event_notifier_test_and_clear(&vec->notifier)) {
+        return;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __FUNCTION__, vdev->host.seg,
+            vdev->host.bus, vdev->host.dev, vdev->host.func, vec->vector);
+
+    msi_notify(&vdev->pdev, vec->vector);
+}
+
+static void vfio_enable_msi(VFIODevice *vdev)
+{
+    int i, *fds;
+
+    vfio_disable_interrupts(vdev);
+
+    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
+    vdev->msi_vectors = qemu_malloc(vdev->nr_vectors * sizeof(MSIVector));
+
+    fds = qemu_malloc((vdev->nr_vectors + 1) * sizeof(int));
+    fds[0] = vdev->nr_vectors;
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        vdev->msi_vectors[i].vdev = vdev;
+        vdev->msi_vectors[i].vector = i;
+
+        if (event_notifier_init(&vdev->msi_vectors[i].notifier, 0)) {
+            fprintf(stderr, "vfio: Error: event_notifier_init failed\n");
+        }
+
+        fds[i + 1] = event_notifier_get_fd(&vdev->msi_vectors[i].notifier);
+        qemu_set_fd_handler(fds[i + 1], vfio_msi_interrupt, NULL,
+                            &vdev->msi_vectors[i]);
+    }
+    
+    if (ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSI, fds)) {
+        fprintf(stderr, "vfio: Error: Failed to setup MSI fds %s\n",
+                strerror(errno));
+        qemu_free(fds);
+        return;
+    }
+
+    vdev->interrupt = INT_MSI;
+
+    qemu_free(fds);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d vectors\n", __FUNCTION__,
+            vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, vdev->nr_vectors);
+}
+
+static void vfio_disable_msi(VFIODevice *vdev)
+{
+    int i, vectors = 0;
+
+    if (vdev->interrupt != INT_MSI) {
+        return;
+    }
+
+    ioctl(vdev->vfiofd, VFIO_EVENTFDS_MSI, &vectors);
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        int fd = event_notifier_get_fd(&vdev->msi_vectors[i].notifier);
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+        event_notifier_cleanup(&vdev->msi_vectors[i].notifier);
+    }
+
+    qemu_free(vdev->msi_vectors);
+    vdev->nr_vectors = 0;
+    vdev->interrupt = INT_NONE;
+    vfio_enable_intx(vdev);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __FUNCTION__, vdev->host.seg,
+            vdev->host.bus, vdev->host.dev, vdev->host.func);
+}
+
+/*
+ * IO Port/MMIO
+ */
+static void vfio_resource_write(PCIResource *res, uint32_t addr,
+                                uint32_t val, int len)
+{
+    off_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + res->bar);
+
+    if (pwrite(res->vfiofd, &val, len, offset + addr) != len) {
+        fprintf(stderr, "%s(,0x%x, 0x%x, %d) failed: %s\n",
+                __FUNCTION__, addr, val, len, strerror(errno));
+    }
+    DPRINTF("%s(BAR%d+0x%x, 0x%x, %d)\n", __FUNCTION__, res->bar,
+            addr, val, len);
+}
+
+static void vfio_resource_writeb(void *opaque, target_phys_addr_t addr,
+                                 uint32_t val)
+{
+    vfio_resource_write(opaque, addr, val, 1);
+}
+
+static void vfio_resource_writew(void *opaque, target_phys_addr_t addr,
+                                 uint32_t val)
+{
+    vfio_resource_write(opaque, addr, val, 2);
+}
+
+static void vfio_resource_writel(void *opaque, target_phys_addr_t addr,
+                                 uint32_t val)
+{
+    vfio_resource_write(opaque, addr, val, 4);
+}
+
+static CPUWriteMemoryFunc * const vfio_resource_writes[] = {
+    &vfio_resource_writeb,
+    &vfio_resource_writew,
+    &vfio_resource_writel
+};
+
+static void vfio_ioport_writeb(void *opaque, uint32_t addr, uint32_t val)
+{
+    PCIResource *res = opaque;
+    vfio_resource_write(res, addr - res->e_phys, val, 1);
+}
+
+static void vfio_ioport_writew(void *opaque, uint32_t addr, uint32_t val)
+{
+    PCIResource *res = opaque;
+    vfio_resource_write(res, addr - res->e_phys, val, 2);
+}
+
+static void vfio_ioport_writel(void *opaque, uint32_t addr, uint32_t val)
+{
+    PCIResource *res = opaque;
+    vfio_resource_write(res, addr - res->e_phys, val, 4);
+}
+
+static uint32_t vfio_resource_read(PCIResource *res, uint32_t addr, int len)
+{
+    off_t offset = vfio_pci_space_to_offset(VFIO_PCI_BAR0_RESOURCE + res->bar);
+    uint32_t val = 0; /* len may be < 4; keep the unread bytes defined */
+
+    if (pread(res->vfiofd, &val, len, offset + addr) != len) {
+        fprintf(stderr, "%s(,0x%x, %d) failed: %s\n",
+                __FUNCTION__, addr, len, strerror(errno));
+        return 0xffffffffU;
+    }
+    DPRINTF("%s(BAR%d+0x%x, %d) = 0x%x\n", __FUNCTION__, res->bar,
+            addr, len, val);
+    return val;
+}
+
+static uint32_t vfio_resource_readb(void *opaque, target_phys_addr_t addr)
+{
+    return vfio_resource_read(opaque, addr, 1) & 0xff;
+}
+
+static uint32_t vfio_resource_readw(void *opaque, target_phys_addr_t addr)
+{
+    return vfio_resource_read(opaque, addr, 2) & 0xffff;
+}
+
+static uint32_t vfio_resource_readl(void *opaque, target_phys_addr_t addr)
+{
+    return vfio_resource_read(opaque, addr, 4);
+}
+
+static CPUReadMemoryFunc * const vfio_resource_reads[] = {
+    &vfio_resource_readb,
+    &vfio_resource_readw,
+    &vfio_resource_readl
+};
+
+static uint32_t vfio_ioport_readb(void *opaque, uint32_t addr)
+{
+    PCIResource *res = opaque;
+    return vfio_resource_read(res, addr - res->e_phys, 1) & 0xff;
+}
+
+static uint32_t vfio_ioport_readw(void *opaque, uint32_t addr)
+{
+    PCIResource *res = opaque;
+    return vfio_resource_read(res, addr - res->e_phys, 2) & 0xffff;
+}
+
+static uint32_t vfio_ioport_readl(void *opaque, uint32_t addr)
+{
+    PCIResource *res = opaque;
+    return vfio_resource_read(res, addr - res->e_phys, 4);
+}
+
+/*
+ * BAR map callback for I/O port BARs (registered in vfio_setup_resources).
+ * Records where the guest placed the BAR and installs the 1/2/4 byte port
+ * read and write handlers over that range.
+ */
+static void vfio_ioport_map(PCIDevice *pdev, int bar,
+                           pcibus_t e_phys, pcibus_t e_size, int type)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    PCIResource *region = &vdev->resources[bar];
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n", __FUNCTION__,
+            vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, bar, e_phys, e_size, type);
+
+    /* The handlers subtract e_phys from the port number to get a BAR offset */
+    region->e_phys = e_phys;
+    region->e_size = e_size;
+
+    register_ioport_write(e_phys, e_size, 1, vfio_ioport_writeb, region);
+    register_ioport_write(e_phys, e_size, 2, vfio_ioport_writew, region);
+    register_ioport_write(e_phys, e_size, 4, vfio_ioport_writel, region);
+
+    register_ioport_read(e_phys, e_size, 1, vfio_ioport_readb, region);
+    register_ioport_read(e_phys, e_size, 2, vfio_ioport_readw, region);
+    register_ioport_read(e_phys, e_size, 4, vfio_ioport_readl, region);
+}
+
+/*
+ * BAR map callback for memory BARs (registered in vfio_setup_resources).
+ *
+ * Records the guest address and size of the BAR, then registers backing for
+ * it: the io_mem handlers when the BAR is flagged slow, otherwise the RAM
+ * block(s) created from the mmap of the device.  A BAR flagged msix is
+ * registered in up to two pieces, before and after the MSI-X page, and
+ * msix_mmio_map() is called for the BAR.
+ */
+static void vfio_iomem_map(PCIDevice *pdev, int bar,
+                           pcibus_t e_phys, pcibus_t e_size, int type)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    PCIResource *res = &vdev->resources[bar];
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, %d, 0x%lx, 0x%lx, %d)\n", __FUNCTION__,
+            vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, bar, e_phys, e_size, type);
+
+    res->e_phys = e_phys;
+    res->e_size = e_size;
+
+    if (res->msix) {
+        /* Part of the BAR below the MSI-X table, if any */
+        if (res->msix_offset > 0) {
+            cpu_register_physical_memory(e_phys, res->msix_offset, res->slow ?
+                                         res->io_mem : res->memory_index[0]);
+        }
+
+        DPRINTF("Overlaying MSI-X table page\n");
+        msix_mmio_map(pdev, bar, e_phys, e_size, type);
+
+        /* Part of the BAR above the MSI-X page, if any */
+        if (e_size > res->msix_offset + MSIX_PAGE_SIZE) {
+            uint32_t offset = res->msix_offset + MSIX_PAGE_SIZE;
+            e_phys += offset;
+            e_size -= offset;
+            /* Slow path passes 'offset' as the region offset; the second
+             * RAM block is registered with a region offset of 0. */
+            cpu_register_physical_memory_offset(e_phys, e_size,
+                            res->slow ? res->io_mem : res->memory_index[1],
+                            res->slow ? offset : 0);
+        }
+    } else {
+        cpu_register_physical_memory(e_phys, e_size, res->slow ?
+                                     res->io_mem : res->memory_index[0]);
+    }
+}
+
+/*
+ * PCI config space
+ */
+/*
+ * Config space read handler (vfio_info.config_read).
+ *
+ * Reads that touch the ROM BAR register, or the MSI / MSI-X capability when
+ * present, are answered from the emulated config space.  Everything else is
+ * read from the device through the vfio fd; a failed read returns -1.
+ */
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    uint32_t val = 0;
+    bool emulated;
+
+    emulated = ranges_overlap(addr, len, PCI_ROM_ADDRESS, 4) ||
+               ((pdev->cap_present & QEMU_PCI_CAP_MSIX) &&
+                ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) ||
+               ((pdev->cap_present & QEMU_PCI_CAP_MSI) &&
+                ranges_overlap(addr, len, pdev->msi_cap, pdev->msi_cap_size));
+
+    if (emulated) {
+        val = pci_default_read_config(pdev, addr, len);
+    } else if (pread(vdev->vfiofd, &val, len,
+                     VFIO_PCI_CONFIG_OFF + addr) != len) {
+        fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %s\n",
+                __FUNCTION__, vdev->host.seg, vdev->host.bus,
+                vdev->host.dev, vdev->host.func, addr, len,
+                strerror(errno));
+        return -1;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) %x\n", __FUNCTION__,
+            vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, addr, len, val);
+    return val;
+}
+
+/*
+ * Config space write handler (vfio_info.config_write).
+ *
+ * Every write is first forwarded to the device through the vfio fd; a failed
+ * write is reported but not otherwise acted on.  Writes below 0x40 are then
+ * also applied to the emulated config space.  Writes overlapping the MSI or
+ * MSI-X capability update the emulated capability and, when the enable state
+ * changes, enable or disable the matching vfio interrupt mode.
+ */
+static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
+                                  uint32_t val, int len)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x)\n", __FUNCTION__,
+            vdev->host.seg, vdev->host.bus, vdev->host.dev,
+            vdev->host.func, addr, val, len);
+
+    /* Write everything to VFIO, let it filter out what we can't write */
+    if (pwrite(vdev->vfiofd, &val, len, VFIO_PCI_CONFIG_OFF + addr) != len) {
+        fprintf(stderr, "%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %s\n",
+                __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev,
+                vdev->host.func, addr, val, len, strerror(errno));
+    }
+
+    /* Write standard header bits to emulation */
+    if (addr < 0x40) {
+        pci_default_write_config(pdev, addr, val, len);
+        return;
+    }
+
+    /* MSI/MSI-X Enabling/Disabling */
+    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
+        ranges_overlap(addr, len, pdev->msi_cap, pdev->msi_cap_size)) {
+        /* Sample the enable state before and after to detect a transition */
+        int is_enabled, was_enabled = msi_enabled(pdev);
+
+        pci_default_write_config(pdev, addr, val, len);
+        msi_write_config(pdev, addr, val, len);
+
+        is_enabled = msi_enabled(pdev);
+
+        if (!was_enabled && is_enabled) {
+            vfio_enable_msi(vdev);
+        } else if (was_enabled && !is_enabled) {
+            vfio_disable_msi(vdev);
+        }
+    }
+
+    if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
+        ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
+        /* Same transition detection as for MSI above */
+        int is_enabled, was_enabled = msix_enabled(pdev);
+
+        pci_default_write_config(pdev, addr, val, len);
+        msix_write_config(pdev, addr, val, len);
+
+        is_enabled = msix_enabled(pdev);
+
+        if (!was_enabled && is_enabled) {
+            vfio_enable_msix(vdev);
+        } else if (was_enabled && !is_enabled) {
+            vfio_disable_msix(vdev);
+        }
+    }
+}
+
+/*
+ * DMA
+ */
+/*
+ * Per-RAM-slot callback (see vfio_map_iommu): ask vfio to map the host
+ * memory backing 'phys_offset' at DMA address 'start_addr' for 'size' bytes.
+ *
+ * 'opaque' is the VFIODevice.  Returns the ioctl() result.
+ */
+static int vfio_dma_map(void *opaque, target_phys_addr_t start_addr,
+                        ram_addr_t size, ram_addr_t phys_offset)
+{
+    VFIODevice *vdev = opaque;
+    struct vfio_dma_map dma_map;
+
+    /* Don't hand uninitialized stack contents to the kernel in any field
+     * or padding that is not explicitly set below. */
+    memset(&dma_map, 0, sizeof(dma_map));
+
+    /* Go through uintptr_t so the cast is also correct on 32-bit hosts */
+    dma_map.vaddr = (uintptr_t)qemu_get_ram_ptr(phys_offset);
+    dma_map.dmaaddr = start_addr;
+    dma_map.size = size;
+    dma_map.flags = VFIO_FLAG_WRITE;
+
+    return ioctl(vdev->vfiofd, VFIO_DMA_MAP_IOVA, &dma_map);
+}
+
+/*
+ * Per-RAM-slot callback (see vfio_unmap_iommu): ask vfio to remove the DMA
+ * mapping described by the same parameters that vfio_dma_map() used.
+ *
+ * 'opaque' is the VFIODevice.  Returns the ioctl() result.
+ */
+static int vfio_dma_unmap(void *opaque, target_phys_addr_t start_addr,
+                          ram_addr_t size, ram_addr_t phys_offset)
+{
+    VFIODevice *vdev = opaque;
+    struct vfio_dma_map dma_map;
+
+    /* Don't hand uninitialized stack contents to the kernel in any field
+     * or padding that is not explicitly set below. */
+    memset(&dma_map, 0, sizeof(dma_map));
+
+    /* Go through uintptr_t so the cast is also correct on 32-bit hosts */
+    dma_map.vaddr = (uintptr_t)qemu_get_ram_ptr(phys_offset);
+    dma_map.dmaaddr = start_addr;
+    dma_map.size = size;
+    dma_map.flags = VFIO_FLAG_WRITE;
+
+    return ioctl(vdev->vfiofd, VFIO_DMA_UNMAP, &dma_map);
+}
+
+/* Run vfio_dma_map() over every RAM slot; returns the iterator's result. */
+static int vfio_map_iommu(VFIODevice *vdev)
+{
+    int rc = qemu_ram_for_each_slot(vdev, vfio_dma_map);
+
+    return rc;
+}
+
+/* Run vfio_dma_unmap() over every RAM slot; returns the iterator's result. */
+static int vfio_unmap_iommu(VFIODevice *vdev)
+{
+    int rc = qemu_ram_for_each_slot(vdev, vfio_dma_unmap);
+
+    return rc;
+}
+
+/*
+ * Interrupt setup
+ */
+/*
+ * Disable whichever interrupt mode vdev->interrupt says is active.
+ * Any other value (such as INT_NONE) is a no-op.
+ */
+static void vfio_disable_interrupts(VFIODevice *vdev)
+{
+    const int mode = vdev->interrupt;
+
+    if (mode == INT_INTx) {
+        vfio_disable_intx(vdev);
+    } else if (mode == INT_MSI) {
+        vfio_disable_msi(vdev);
+    } else if (mode == INT_MSIX) {
+        vfio_disable_msix(vdev);
+    }
+}
+
+/*
+ * Set up emulated MSI and MSI-X capabilities to match the host device.
+ *
+ * For each capability found in the (already copied) config space, the
+ * control word is read from the device through the vfio fd and used to
+ * initialize the emulation.  For MSI-X the BAR holding the vector table is
+ * also flagged in vdev->resources[] so that vfio_setup_resources() and
+ * vfio_iomem_map() can leave the MSI-X page to the emulation.
+ *
+ * Returns 0 on success (including when neither capability exists) and -1
+ * on failure.
+ */
+static int vfio_setup_msi(VFIODevice *vdev)
+{
+    int pos;
+
+    if ((pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSI))) {
+        uint16_t ctrl;
+        bool msi_64bit, msi_maskbit;
+        int entries;
+
+        if (pread(vdev->vfiofd, &ctrl, sizeof(ctrl),
+                  VFIO_PCI_CONFIG_OFF + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
+            return -1;
+        }
+
+        /* Config space is little-endian; convert as the MSI-X path does */
+        ctrl = le16_to_cpu(ctrl);
+
+        msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
+        msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
+        /* The capable-vector field is a log2 encoding */
+        entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
+
+        DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.seg,
+                vdev->host.bus, vdev->host.dev, vdev->host.func, pos);
+
+        if (msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit) < 0) {
+            fprintf(stderr, "vfio: msi_init failed\n");
+            return -1;
+        }
+    }
+
+    if ((pos = pci_find_cap_offset(&vdev->pdev, PCI_CAP_ID_MSIX))) {
+        uint16_t ctrl;
+        uint32_t table, len, offset;
+        int bar, entries;
+
+        if (pread(vdev->vfiofd, &ctrl, sizeof(ctrl),
+                  VFIO_PCI_CONFIG_OFF + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
+            return -1;
+        }
+
+        if (pread(vdev->vfiofd, &table, sizeof(table), VFIO_PCI_CONFIG_OFF +
+                  pos + PCI_MSIX_TABLE) != sizeof(table)) {
+            return -1;
+        }
+
+        ctrl = le16_to_cpu(ctrl);
+        table = le32_to_cpu(table);
+
+        /* The table register packs the BAR index with the table offset */
+        bar = table & PCI_MSIX_BIR;
+        offset = table & ~PCI_MSIX_BIR;
+        entries = (ctrl & PCI_MSIX_TABSIZE) + 1;
+
+        /* The BIR mask admits values beyond the BARs tracked in resources[];
+         * refuse them rather than index out of bounds. */
+        if (bar >= PCI_ROM_SLOT) {
+            fprintf(stderr, "vfio: invalid MSI-X table BAR %d\n", bar);
+            return -1;
+        }
+
+        vdev->resources[bar].msix = true;
+        vdev->resources[bar].msix_offset = offset;
+
+        DPRINTF("%04x:%02x:%02x.%x PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x\n",
+                vdev->host.seg, vdev->host.bus, vdev->host.dev,
+                vdev->host.func, pos, bar, offset);
+
+        /* VFIO_BAR_LEN takes the BAR index in 'len' and returns its length */
+        len = bar;
+        if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) {
+            fprintf(stderr, "vfio: VFIO_BAR_LEN failed for MSIX BAR\n");
+            return -1;
+        }
+
+        if (msix_init(&vdev->pdev, entries, bar, len) < 0) {
+            fprintf(stderr, "vfio: msix_init failed\n");
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/* Undo vfio_setup_msi(): release the MSI, then the MSI-X, emulation state. */
+static void vfio_teardown_msi(VFIODevice *vdev)
+{
+    PCIDevice *pci = &vdev->pdev;
+
+    msi_uninit(pci);
+    msix_uninit(pci);
+}
+
+/*
+ * Resource setup
+ */
+/*
+ * Probe each BAR of the host device and register a matching BAR on the
+ * emulated device.
+ *
+ *  - Memory BARs whose length is a multiple of 4K are mmap'd from the vfio
+ *    fd and exposed as RAM blocks.  If the BAR holds the MSI-X table (see
+ *    vfio_setup_msi), the parts before and after the MSI-X page are mapped
+ *    separately and the page itself is left unmapped.
+ *  - Other memory BARs use the "slow" path: read/write handlers that go
+ *    through vfio_resource_read/write.
+ *  - I/O port BARs are registered with vfio_ioport_map.
+ *
+ * Returns 0 on success, -1 on failure.  Mappings made before a failure are
+ * not undone here.
+ */
+static int vfio_setup_resources(VFIODevice *vdev)
+{
+    int i;
+
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        uint32_t len, bar;
+        PCIResource *res;
+        uint8_t offset;
+        int ret, space;
+
+        res = &vdev->resources[i];
+        res->vfiofd = vdev->vfiofd;
+        /* VFIO_BAR_LEN takes the BAR index in 'len' and returns its length */
+        res->bar = len = i;
+
+        if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &len)) {
+            fprintf(stderr, "vfio: VFIO_BAR_LEN failed for BAR %d\n", i);
+            return -1;
+        }
+        if (!len) {
+            continue;
+        }
+
+        /* Read the host BAR register to learn its type flags */
+        offset = PCI_BASE_ADDRESS_0 + (4 * i);
+        ret = pread(vdev->vfiofd, &bar, sizeof(bar),
+                    VFIO_PCI_CONFIG_OFF + offset);
+        if (ret != sizeof(bar)) {
+            fprintf(stderr, "vfio: Failed to read BAR %d\n", i);
+            return -1;
+        }
+        bar = le32_to_cpu(bar);
+        space = bar & PCI_BASE_ADDRESS_SPACE;
+
+        if (space == PCI_BASE_ADDRESS_SPACE_MEMORY && !(len & 0xfff)) {
+            int off = VFIO_PCI_BAR0_RESOURCE + i;
+            int flags = PROT_READ | PROT_WRITE;
+            char name[32];
+
+            res->mem = true;
+            res->size = len;
+
+            if (vdev->pdev.qdev.info->vmsd) {
+                snprintf(name, sizeof(name), "%s.bar%d",
+                         vdev->pdev.qdev.info->vmsd->name, i);
+            } else {
+                snprintf(name, sizeof(name), "%s.bar%d",
+                         vdev->pdev.qdev.info->name, i);
+            }
+
+            if (res->msix) {
+                /* Length of the base name; each suffix is appended with a
+                 * bounded snprintf() and then cut off again. */
+                size_t base = strlen(name);
+
+                if (res->msix_offset) {
+                    res->r_virtbase[0] = mmap(NULL, res->msix_offset, flags,
+                                              MAP_SHARED, vdev->vfiofd,
+                                              vfio_pci_space_to_offset(off));
+
+                    if (res->r_virtbase[0] == MAP_FAILED) {
+                        fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i);
+                        return -1;
+                    }
+                    snprintf(name + base, sizeof(name) - base, ".0");
+                    res->memory_index[0] =
+                        qemu_ram_alloc_from_ptr(&vdev->pdev.qdev,
+                                                name, res->msix_offset,
+                                                res->r_virtbase[0]);
+                    name[base] = '\0';
+                }
+                if (len > res->msix_offset + MSIX_PAGE_SIZE) {
+                    res->r_virtbase[1] = mmap(NULL,
+                                        len - res->msix_offset - MSIX_PAGE_SIZE,
+                                        flags, MAP_SHARED, vdev->vfiofd,
+                                        vfio_pci_space_to_offset(off) +
+                                        res->msix_offset + MSIX_PAGE_SIZE);
+
+                    if (res->r_virtbase[1] == MAP_FAILED) {
+                        fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i);
+                        return -1;
+                    }
+                    snprintf(name + base, sizeof(name) - base, ".1");
+                    res->memory_index[1] =
+                        qemu_ram_alloc_from_ptr(&vdev->pdev.qdev, name,
+                                        len - MSIX_PAGE_SIZE - res->msix_offset,
+                                        res->r_virtbase[1]);
+                    name[base] = '\0';
+                }
+            } else {
+                res->r_virtbase[0] = mmap(NULL, len, flags, MAP_SHARED,
+                                          vdev->vfiofd,
+                                          vfio_pci_space_to_offset(off));
+
+                if (res->r_virtbase[0] == MAP_FAILED) {
+                    fprintf(stderr, "vfio: Failed to mmap BAR %d\n", i);
+                    return -1;
+                }
+                res->memory_index[0] =
+                    qemu_ram_alloc_from_ptr(&vdev->pdev.qdev,
+                                            name, len, res->r_virtbase[0]);
+            }
+
+            pci_register_bar(&vdev->pdev, i, res->size,
+                             bar & PCI_BASE_ADDRESS_MEM_PREFETCH ?
+                             PCI_BASE_ADDRESS_MEM_PREFETCH :
+                             PCI_BASE_ADDRESS_SPACE_MEMORY,
+                             vfio_iomem_map);
+
+            /* A 64-bit BAR also occupies the next BAR slot; skip it */
+            if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+                i++;
+            }
+        } else if (space == PCI_BASE_ADDRESS_SPACE_MEMORY) {
+            res->mem = true;
+            res->size = len;
+            res->slow = true;
+
+            DPRINTF("%s(%04x:%02x:%02x.%x) Using slow mapping for BAR %d\n",
+                    __FUNCTION__, vdev->host.seg, vdev->host.bus,
+                    vdev->host.dev, vdev->host.func, i);
+
+            res->io_mem = cpu_register_io_memory(vfio_resource_reads,
+                                                 vfio_resource_writes, res);
+
+            pci_register_bar(&vdev->pdev, i, res->size,
+                             bar & PCI_BASE_ADDRESS_MEM_PREFETCH ?
+                             PCI_BASE_ADDRESS_MEM_PREFETCH :
+                             PCI_BASE_ADDRESS_SPACE_MEMORY,
+                             vfio_iomem_map);
+
+            /* A 64-bit BAR also occupies the next BAR slot; skip it */
+            if (bar & PCI_BASE_ADDRESS_MEM_TYPE_64) {
+                i++;
+            }
+        } else if (space == PCI_BASE_ADDRESS_SPACE_IO) {
+            res->size = len;
+            pci_register_bar(&vdev->pdev, i, res->size,
+                             PCI_BASE_ADDRESS_SPACE_IO, vfio_ioport_map);
+        }
+        /* 'res' still points at the BAR probed above even if 'i' advanced */
+        res->valid = true;
+    }
+    return 0;
+}
+
+/*
+ * Undo vfio_setup_resources() for every valid memory BAR: unassign the
+ * guest physical ranges, free the RAM blocks and munmap the device
+ * mappings, or unregister the io_mem handlers for slow-path BARs.
+ *
+ * A slow-path BAR never had RAM blocks or mmaps created for it, including
+ * when it holds the MSI-X table, so those are only released for BARs that
+ * took the mmap path.
+ */
+static void vfio_unmap_resources(VFIODevice *vdev)
+{
+    int i;
+    PCIResource *res = vdev->resources;
+
+    for (i = 0; i < PCI_ROM_SLOT; i++, res++) {
+        if (res->valid && res->mem) {
+            if (res->msix) {
+                /* Part of the BAR below the MSI-X table */
+                if (res->msix_offset) {
+                    cpu_register_physical_memory(res->e_phys, res->msix_offset,
+                                                 IO_MEM_UNASSIGNED);
+                    if (!res->slow) {
+                        qemu_ram_free_from_ptr(res->memory_index[0]);
+                        munmap(res->r_virtbase[0], res->msix_offset);
+                    }
+                }
+                /* Part of the BAR above the MSI-X page */
+                if (res->size > res->msix_offset + MSIX_PAGE_SIZE) {
+                    cpu_register_physical_memory(res->e_phys + MSIX_PAGE_SIZE +
+                                                 res->msix_offset,
+                                                 res->e_size - MSIX_PAGE_SIZE -
+                                                 res->msix_offset,
+                                                 IO_MEM_UNASSIGNED);
+                    if (!res->slow) {
+                        qemu_ram_free_from_ptr(res->memory_index[1]);
+                        munmap(res->r_virtbase[1],
+                               res->size - MSIX_PAGE_SIZE - res->msix_offset);
+                    }
+                }
+                if (res->slow) {
+                    cpu_unregister_io_memory(res->io_mem);
+                }
+            } else {
+                if (!res->slow) {
+                    cpu_register_physical_memory(res->e_phys, res->e_size,
+                                                 IO_MEM_UNASSIGNED);
+                    qemu_ram_free_from_ptr(res->memory_index[0]);
+                    munmap(res->r_virtbase[0], res->size);
+                } else {
+                    cpu_unregister_io_memory(res->io_mem);
+                }
+            }
+        }
+    }
+}
+
+/*
+ * General setup
+ */
+/*
+ * Obtain the vfio device file descriptor and store it in vdev->vfiofd.
+ *
+ * If the "vfiofd" property is set, it is either a numeric fd (first
+ * character is a digit) or the name of an fd held by the monitor.
+ * Otherwise the vfio device node is found through the host device's sysfs
+ * "vfio" directory and opened from /dev.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int get_vfio_fd(VFIODevice *vdev)
+{
+    if (vdev->vfiofd_name && strlen(vdev->vfiofd_name) > 0) {
+        if (qemu_isdigit(vdev->vfiofd_name[0])) {
+            vdev->vfiofd = strtol(vdev->vfiofd_name, NULL, 0);
+            return 0;
+        } else {
+            vdev->vfiofd = monitor_get_fd(cur_mon, vdev->vfiofd_name);
+            if (vdev->vfiofd < 0) {
+                fprintf(stderr, "%s: (%s) unkown\n", __func__,
+                        vdev->vfiofd_name);
+                return -1;
+            }
+            return 0;
+        }
+    } else {
+        char vfio_dir[64], vfio_dev[16];
+        DIR *dir;
+        struct dirent *de;
+        int n;
+
+        snprintf(vfio_dir, sizeof(vfio_dir),
+                 "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/vfio/",
+                 vdev->host.seg, vdev->host.bus,
+                 vdev->host.dev, vdev->host.func);
+        dir = opendir(vfio_dir);
+        if (!dir) {
+            error_report("vfio: error: Driver not attached\n");
+            return -1;
+        }
+
+        /* Look for the first non-hidden entry named vfio* */
+        while ((de = readdir(dir))) {
+            if (de->d_name[0] == '.') {
+                continue;
+            }
+            if (!strncmp(de->d_name, "vfio", 4)) {
+                break;
+            }
+        }
+
+        if (!de) {
+            closedir(dir);
+            error_report("vfio: error: Cannot find vfio* in %s\n", vfio_dir);
+            return -1;
+        }
+
+        /* Copy the name out before closedir() invalidates 'de' */
+        n = snprintf(vfio_dev, sizeof(vfio_dev), "/dev/%s", de->d_name);
+        closedir(dir);
+        if (n < 0 || n >= (int)sizeof(vfio_dev)) {
+            error_report("vfio: error: vfio device name too long in %s\n",
+                         vfio_dir);
+            return -1;
+        }
+
+        vdev->vfiofd = open(vfio_dev, O_RDWR);
+        if (vdev->vfiofd < 0) {
+            error_report("pci-assign: vfio: Failed to open %s: %s\n",
+                         vfio_dev, strerror(errno));
+            return -1;
+        }
+        return 0;
+    }
+}
+
+/*
+ * Obtain the uiommu file descriptor and store it in vdev->uiommufd.
+ *
+ * With no "uiommufd" property, /dev/uiommu is opened and uiommufd_name is
+ * set to NULL, which later marks the fd as ours to close.  Otherwise the
+ * property is a numeric fd (first character is a digit) or the name of an
+ * fd held by the monitor.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int get_uiommu_fd(VFIODevice *vdev)
+{
+    char *fdname = vdev->uiommufd_name;
+
+    if (!fdname || strlen(fdname) == 0) {
+        vdev->uiommufd = open("/dev/uiommu", O_RDONLY);
+        if (vdev->uiommufd < 0) {
+            return -1;
+        }
+        vdev->uiommufd_name = NULL; /* easier test later */
+        return 0;
+    }
+
+    if (qemu_isdigit(fdname[0])) {
+        vdev->uiommufd = strtol(fdname, NULL, 0);
+        return 0;
+    }
+
+    vdev->uiommufd = monitor_get_fd(cur_mon, fdname);
+    if (vdev->uiommufd < 0) {
+        fprintf(stderr, "%s: (%s) unkown\n", __func__, fdname);
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Copy the device's option ROM into a RAM block and register it as the ROM
+ * BAR of the emulated device.
+ *
+ * Nothing is done when a romfile is configured, when rom_bar is off, or
+ * when the device reports a ROM length of 0.  The RAM block is pre-filled
+ * with 0xff, so a ROM shorter than the reported length leaves the rest as
+ * 0xff.
+ *
+ * Returns 0 on success or when there is nothing to load, -1 on failure.
+ */
+static int vfio_load_rom(VFIODevice *vdev)
+{
+    /* VFIO_BAR_LEN takes the BAR index in 'size' and returns its length */
+    uint32_t len, size = PCI_ROM_SLOT;
+    char name[32];
+    off_t off = 0, voff = vfio_pci_space_to_offset(VFIO_PCI_ROM_RESOURCE);
+    ssize_t bytes;
+    uint8_t *ptr;
+
+    /* If loading ROM from file, pci handles it */
+    if (vdev->pdev.romfile || !vdev->pdev.rom_bar) {
+        return 0;
+    }
+
+    if (ioctl(vdev->vfiofd, VFIO_BAR_LEN, &size)) {
+        fprintf(stderr, "vfio: VFIO_BAR_LEN failed for OPTION ROM\n");
+        return -1;
+    }
+
+    if (!size) {
+        return 0;
+    }
+
+    len = size;
+    snprintf(name, sizeof(name), "%s.rom", vdev->pdev.qdev.info->name);
+    vdev->pdev.rom_offset = qemu_ram_alloc(&vdev->pdev.qdev, name, size);
+    ptr = qemu_get_ram_ptr(vdev->pdev.rom_offset);
+    memset(ptr, 0xff, size);
+
+    while (size) {
+        bytes = pread(vdev->vfiofd, ptr + off, size, voff + off);
+        if (bytes == 0) {
+            break; /* expect that we could get back less than the ROM BAR */
+        } else if (bytes > 0) {
+            off += bytes;
+            size -= bytes;
+        } else {
+            if (errno == EINTR || errno == EAGAIN) {
+                continue;
+            }
+            fprintf(stderr, "vfio: Error reading device ROM: %s\n",
+                    strerror(errno));
+            qemu_ram_free(vdev->pdev.rom_offset);
+            vdev->pdev.rom_offset = 0;
+            return -1;
+        }
+    }
+
+    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, len, 0, pci_map_option_rom);
+    return 0;
+}
+
+/*
+ * Device init (vfio_info.init).
+ *
+ * Checks that the host device exists, obtains the uiommu and vfio fds,
+ * binds the device to the iommu domain, snapshots config space, then sets
+ * up the option ROM, MSI/MSI-X emulation, BARs, DMA mappings and INTx.
+ * On failure, the steps completed so far are unwound in reverse order via
+ * the labels at the end.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int vfio_initfn(struct PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    char sys[64];
+    struct stat st;
+    int ret;
+
+    /* Check that the host device exists */
+    snprintf(sys, sizeof(sys), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
+             vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
+    if (stat(sys, &st) < 0) {
+        error_report("vfio: error: no such host device "
+                     "%04x:%02x:%02x.%01x", vdev->host.seg, vdev->host.bus,
+                     vdev->host.dev, vdev->host.func);
+        return -1;
+    }
+
+    if (get_uiommu_fd(vdev)) {
+        return -1;
+    }
+
+    if (get_vfio_fd(vdev)) {
+        goto out_close_uiommu;
+    }
+
+    if (ioctl(vdev->vfiofd, VFIO_DOMAIN_SET, &vdev->uiommufd)) {
+        goto out_close_vfiofd;
+    }
+
+    /* Get a copy of config space */
+    ret = pread(vdev->vfiofd, vdev->pdev.config,
+                pci_config_size(&vdev->pdev), VFIO_PCI_CONFIG_OFF);
+    /* Test for error explicitly: if pci_config_size() yields an unsigned
+     * type, a bare "ret < size" would convert -1 to a huge value and let a
+     * failed read through. */
+    if (ret < 0 || (uint32_t)ret < pci_config_size(&vdev->pdev)) {
+        fprintf(stderr, "vfio: Failed to read device config space\n");
+        goto out_unset_domain;
+    }
+
+    /* Clear host resource mapping info.  If we choose not to register a
+     * BAR, such as might be the case with the option ROM, we can get
+     * confusing, unwritable, residual addresses from the host here. */
+    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
+    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
+
+    /* NOTE(review): the result is ignored, so a ROM load failure does not
+     * fail init; presumably intentional - confirm. */
+    vfio_load_rom(vdev);
+
+    if (vfio_setup_msi(vdev)) {
+        goto out_unset_domain;
+    }
+
+    if (vfio_setup_resources(vdev)) {
+        goto out_disable_msix;
+    }
+
+    if (vfio_map_iommu(vdev)) {
+        goto out_unmap_resources;
+    }
+
+    if (vfio_enable_intx(vdev)) {
+        goto out_unmap_iommu;
+    }
+
+    return 0;
+
+out_unmap_iommu:
+    vfio_unmap_iommu(vdev);
+out_unmap_resources:
+    vfio_unmap_resources(vdev);
+out_disable_msix:
+    vfio_teardown_msi(vdev);
+out_unset_domain:
+    ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET);
+out_close_vfiofd:
+    close(vdev->vfiofd);
+out_close_uiommu:
+    /* Only close the uiommu fd if get_uiommu_fd() opened it itself */
+    if (!vdev->uiommufd_name) {
+        close(vdev->uiommufd);
+    }
+    return -1;
+}
+
+/*
+ * Device teardown (vfio_info.exit): disable interrupts, release MSI/MSI-X
+ * emulation, remove DMA mappings, unmap BARs, detach from the iommu domain
+ * and close the file descriptors.  Always returns 0.
+ */
+static int vfio_exitfn(struct PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    
+    vfio_disable_interrupts(vdev);
+    vfio_teardown_msi(vdev);
+    vfio_unmap_iommu(vdev);
+    vfio_unmap_resources(vdev);
+    ioctl(vdev->vfiofd, VFIO_DOMAIN_UNSET);
+    close(vdev->vfiofd);
+    /* uiommufd_name is NULL only when get_uiommu_fd() opened /dev/uiommu */
+    if (!vdev->uiommufd_name)
+        close(vdev->uiommufd);
+    return 0;
+}
+
+/* Property type for the "host" property: a PCIHostDevice parsed and printed
+ * by parse_hostaddr / print_hostaddr. */
+static PropertyInfo qdev_prop_hostaddr = {
+    .name  = "pci-hostaddr",
+    .type  = -1,
+    .size  = sizeof(PCIHostDevice),
+    .parse = parse_hostaddr,
+    .print = print_hostaddr,
+};
+
+/* qdev description of the "vfio" device: init/exit and config space hooks,
+ * plus the user-settable properties. */
+static PCIDeviceInfo vfio_info = {
+    .qdev.name    = "vfio",
+    .qdev.desc    = "pass through host pci devices to the guest via vfio",
+    .qdev.size    = sizeof(VFIODevice),
+    .init         = vfio_initfn,
+    .exit         = vfio_exitfn,
+    .config_read  = vfio_pci_read_config,
+    .config_write = vfio_pci_write_config,
+    .qdev.props   = (Property[]) {
+        /* host: address of the host PCI device to assign */
+        DEFINE_PROP("host", VFIODevice, host,
+                    qdev_prop_hostaddr, PCIHostDevice),
+        /* vfiofd / uiommufd: optional fd number or monitor fd name */
+        DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
+        DEFINE_PROP_STRING("uiommufd", VFIODevice, uiommufd_name),
+        DEFINE_PROP_END_OF_LIST(),
+    },
+};
+
+/* Register the "vfio" PCI device type described by vfio_info. */
+static void vfio_register_devices(void)
+{
+    pci_qdev_register(&vfio_info);
+}
+
+device_init(vfio_register_devices)
diff --git a/hw/vfio.h b/hw/vfio.h
new file mode 100644
index 0000000..20ae5db
--- /dev/null
+++ b/hw/vfio.h
@@ -0,0 +1,68 @@ 
+#ifndef __VFIO_H__
+#define __VFIO_H__
+
+#include "qemu-common.h"
+#include "qemu-queue.h"
+#include "pci.h"
+
+typedef struct PCIHostDevice {   /* host PCI address, seg:bus:dev.func */
+    uint16_t seg;                /* segment */
+    uint8_t bus;                 /* bus number */
+    uint8_t dev:5;               /* device (slot) number */
+    uint8_t func:3;              /* function number */
+} PCIHostDevice;
+
+typedef struct PCIResource {     /* per-BAR state, see vfio_setup_resources */
+    bool valid;                  /* BAR has non-zero length and was set up */
+    bool mem;                    /* memory BAR (as opposed to I/O port) */
+    bool msix;                   /* BAR holds the MSI-X table */
+    bool slow;                   /* uses read/write handlers, not mmap */
+    uint8_t bar;                 /* BAR index */
+    uint64_t size;               /* BAR length reported by VFIO_BAR_LEN */
+    ram_addr_t memory_index[2];  /* cpu_register_physical_memory() index */
+    void *r_virtbase[2];         /* mmapped address */
+    int io_mem;                  /* cpu_register_io_memory index */
+    pcibus_t e_phys;             /* emulated base address */
+    pcibus_t e_size;             /* emulated size of region in bytes */
+    uint32_t msix_offset;        /* offset of the MSI-X table in the BAR */
+    int vfiofd;                  /* see vfio_resource_read/write */
+} PCIResource;
+
+typedef struct INTx {            /* state for INTx interrupt handling */
+    bool pending;                /* presumably: raised, awaiting EOI - confirm */
+    uint8_t pin;                 /* presumably the PCI interrupt pin - confirm */
+    bool irqfd_enabled;
+    EventNotifier notifier;
+    ioapic_eoi_client eoi_client;
+} INTx;
+
+struct VFIODevice;               /* forward declaration for MSIVector.vdev */
+
+typedef struct MSIVector {       /* one entry of VFIODevice.msi_vectors */
+    EventNotifier notifier;
+    struct VFIODevice *vdev;     /* device this vector belongs to */
+    int vector;                  /* presumably the vector index - confirm */
+} MSIVector;
+
+enum {                           /* values of VFIODevice.interrupt */
+    INT_NONE = 0,
+    INT_INTx = 1,
+    INT_MSI  = 2,
+    INT_MSIX = 3,
+};
+
+typedef struct VFIODevice {      /* state of one assigned device */
+    PCIDevice pdev;              /* emulated PCI device (DO_UPCAST target) */
+    PCIHostDevice host;          /* "host" property: host PCI address */
+    PCIResource resources[PCI_NUM_REGIONS - 1]; /* No ROM */
+    INTx intx;
+    MSIVector *msi_vectors;      /* presumably nr_vectors entries - confirm */
+    int nr_vectors;
+    int interrupt;               /* active mode, one of INT_* */
+    int vfiofd;                  /* vfio device fd, see get_vfio_fd */
+    int uiommufd;                /* uiommu fd, see get_uiommu_fd */
+    char *vfiofd_name;           /* "vfiofd" property */
+    char *uiommufd_name;         /* "uiommufd" property; NULL if we opened it */
+} VFIODevice;
+
+#endif /* __VFIO_H__ */