Patchwork [RFC,3/5] VFIO: Base framework for new VFIO driver

login
register
mail settings
Submitter Alex Williamson
Date Sept. 1, 2011, 7:50 p.m.
Message ID <20110901195043.2391.31843.stgit@s20.home>
Download mbox | patch
Permalink /patch/112971/
State New
Headers show

Comments

Alex Williamson - Sept. 1, 2011, 7:50 p.m.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 drivers/Kconfig             |    2 
 drivers/Makefile            |    1 
 drivers/vfio/Kconfig        |    5 
 drivers/vfio/Makefile       |    3 
 drivers/vfio/vfio_device.c  |  109 +++++
 drivers/vfio/vfio_iommu.c   |   81 ++++
 drivers/vfio/vfio_main.c    |  879 +++++++++++++++++++++++++++++++++++++++++++
 drivers/vfio/vfio_private.h |   82 ++++
 8 files changed, 1162 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vfio/Kconfig
 create mode 100644 drivers/vfio/Makefile
 create mode 100644 drivers/vfio/vfio_device.c
 create mode 100644 drivers/vfio/vfio_iommu.c
 create mode 100644 drivers/vfio/vfio_main.c
 create mode 100644 drivers/vfio/vfio_private.h
Konrad Rzeszutek Wilk - Sept. 7, 2011, 2:52 p.m.
> +static long vfio_iommu_unl_ioctl(struct file *filep,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct vfio_iommu *viommu = filep->private_data;
> +	struct vfio_dma_map dm;
> +	int ret = -ENOSYS;
> +
> +	switch (cmd) {
> +	case VFIO_IOMMU_MAP_DMA:
> +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> +			return -EFAULT;
> +		ret = 0; // XXX - Do something

<chuckles>

> +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> +			ret = -EFAULT;
> +		break;
> +
> +	case VFIO_IOMMU_UNMAP_DMA:
> +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> +			return -EFAULT;
> +		ret = 0; // XXX - Do something
> +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> +			ret = -EFAULT;
> +		break;
> +	}
> +	return ret;
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static long vfio_iommu_compat_ioctl(struct file *filep,
> +				    unsigned int cmd, unsigned long arg)
> +{
> +	arg = (unsigned long)compat_ptr(arg);
> +	return vfio_iommu_unl_ioctl(filep, cmd, arg);
> +}
> +#endif	/* CONFIG_COMPAT */
> +
> +const struct file_operations vfio_iommu_fops = {
> +	.owner		= THIS_MODULE,
> +	.release	= vfio_iommu_release,
> +	.unlocked_ioctl	= vfio_iommu_unl_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= vfio_iommu_compat_ioctl,
> +#endif
> +};
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
.. snip..
> +int vfio_group_add_dev(struct device *dev, void *data)
> +{
> +	struct vfio_device_ops *ops = data;
> +	struct list_head *pos;
> +	struct vfio_group *vgroup = NULL;
> +	struct vfio_device *vdev = NULL;
> +	unsigned int group;
> +	int ret = 0, new_group = 0;

'new_group' should probably be 'bool'.

> +
> +	if (iommu_device_group(dev, &group))
> +		return 0;

-EEXIST?

> +
> +	mutex_lock(&vfio.group_lock);
> +
> +	list_for_each(pos, &vfio.group_list) {
> +		vgroup = list_entry(pos, struct vfio_group, next);
> +		if (vgroup->group == group)
> +			break;
> +		vgroup = NULL;
> +	}
> +
> +	if (!vgroup) {
> +		int id;
> +
> +		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +		vgroup = kzalloc(sizeof(*vgroup), GFP_KERNEL);
> +		if (!vgroup) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		vgroup->group = group;
> +		INIT_LIST_HEAD(&vgroup->device_list);
> +
> +		ret = idr_get_new(&vfio.idr, vgroup, &id);
> +		if (ret == 0 && id > MINORMASK) {
> +			idr_remove(&vfio.idr, id);
> +			kfree(vgroup);
> +			ret = -ENOSPC;
> +			goto out;
> +		}
> +
> +		vgroup->devt = MKDEV(MAJOR(vfio.devt), id);
> +		list_add(&vgroup->next, &vfio.group_list);
> +		device_create(vfio.class, NULL, vgroup->devt,
> +			      vgroup, "%u", group);
> +
> +		new_group = 1;
> +	} else {
> +		list_for_each(pos, &vgroup->device_list) {
> +			vdev = list_entry(pos, struct vfio_device, next);
> +			if (vdev->dev == dev)
> +				break;
> +			vdev = NULL;
> +		}
> +	}
> +
> +	if (!vdev) {
> +		/* Adding a device for a group that's already in use? */
> +		/* Maybe we should attach to the domain so others can't */
> +		BUG_ON(vgroup->container &&
> +		       vgroup->container->iommu &&
> +		       vgroup->container->iommu->refcnt);
> +
> +		vdev = ops->new(dev);
> +		if (IS_ERR(vdev)) {
> +			/* If we just created this vgroup, tear it down */
> +			if (new_group) {
> +				device_destroy(vfio.class, vgroup->devt);
> +				idr_remove(&vfio.idr, MINOR(vgroup->devt));
> +				list_del(&vgroup->next);
> +				kfree(vgroup);
> +			}
> +			ret = PTR_ERR(vdev);
> +			goto out;
> +		}
> +		list_add(&vdev->next, &vgroup->device_list);
> +		vdev->dev = dev;
> +		vdev->ops = ops;
> +		vdev->vfio = &vfio;
> +	}
> +out:
> +	mutex_unlock(&vfio.group_lock);
> +	return ret;
> +}
> +
> +void vfio_group_del_dev(struct device *dev)
> +{
> +	struct list_head *pos;
> +	struct vfio_container *vcontainer;
> +	struct vfio_group *vgroup = NULL;
> +	struct vfio_device *vdev = NULL;
> +	unsigned int group;
> +
> +	if (iommu_device_group(dev, &group))
> +		return;
> +
> +	mutex_lock(&vfio.group_lock);
> +
> +	list_for_each(pos, &vfio.group_list) {
> +		vgroup = list_entry(pos, struct vfio_group, next);
> +		if (vgroup->group == group)
> +			break;
> +		vgroup = NULL;
> +	}
> +
> +	if (!vgroup)
> +		goto out;
> +
> +	vcontainer = vgroup->container;
> +
> +	list_for_each(pos, &vgroup->device_list) {
> +		vdev = list_entry(pos, struct vfio_device, next);
> +		if (vdev->dev == dev)
> +			break;
> +		vdev = NULL;
> +	}
> +
> +	if (!vdev)
> +		goto out;
> +
> +	/* XXX Did a device we're using go away? */
> +	BUG_ON(vdev->refcnt);
> +
> +	if (vcontainer && vcontainer->iommu) {
> +		iommu_detach_device(vcontainer->iommu->domain, vdev->dev);
> +		vfio_container_reset_read(vcontainer);
> +	}
> +
> +	list_del(&vdev->next);
> +	vdev->ops->free(vdev);
> +
> +	if (list_empty(&vgroup->device_list) && vgroup->refcnt == 0) {
> +		device_destroy(vfio.class, vgroup->devt);
> +		idr_remove(&vfio.idr, MINOR(vgroup->devt));
> +		list_del(&vgroup->next);
> +		kfree(vgroup);
> +	}
> +out:
> +	mutex_unlock(&vfio.group_lock);
> +}
> +
> +static int __vfio_group_viable(struct vfio_container *vcontainer)

Just return 'bool'

> +{
> +	struct list_head *gpos, *dpos;
> +
> +	list_for_each(gpos, &vfio.group_list) {
> +		struct vfio_group *vgroup;
> +		vgroup = list_entry(gpos, struct vfio_group, next);
> +		if (vgroup->container != vcontainer)
> +			continue;
> +
> +		list_for_each(dpos, &vgroup->device_list) {
> +			struct vfio_device *vdev;
> +			vdev = list_entry(dpos, struct vfio_device, next);
> +
> +			if (!vdev->dev->driver ||
> +			    vdev->dev->driver->owner != THIS_MODULE)
> +				return 0;
> +		}
> +	}
> +	return 1;
> +}
> +
> +static int __vfio_close_iommu(struct vfio_container *vcontainer)
> +{
> +	struct list_head *gpos, *dpos;
> +	struct vfio_iommu *viommu = vcontainer->iommu;
> +	struct vfio_group *vgroup;
> +	struct vfio_device *vdev;
> +
> +	if (!viommu)
> +		return 0;
> +
> +	if (viommu->refcnt)
> +		return -EBUSY;
> +
> +	list_for_each(gpos, &vfio.group_list) {
> +		vgroup = list_entry(gpos, struct vfio_group, next);
> +		if (vgroup->container != vcontainer)
> +			continue;
> +
> +		list_for_each(dpos, &vgroup->device_list) {
> +			vdev = list_entry(dpos, struct vfio_device, next);
> +			iommu_detach_device(viommu->domain, vdev->dev);
> +			vdev->iommu = NULL;
> +		}
> +	}
> +	iommu_domain_free(viommu->domain);
> +	kfree(viommu);
> +	vcontainer->iommu = NULL;
> +	return 0;
> +}
> +
> +static int __vfio_open_iommu(struct vfio_container *vcontainer)
> +{
> +	struct list_head *gpos, *dpos;
> +	struct vfio_iommu *viommu;
> +	struct vfio_group *vgroup;
> +	struct vfio_device *vdev;
> +
> +	if (!__vfio_group_viable(vcontainer))
> +		return -EBUSY;
> +
> +	viommu = kzalloc(sizeof(*viommu), GFP_KERNEL);
> +	if (!viommu)
> +		return -ENOMEM;
> +
> +	viommu->domain = iommu_domain_alloc();
> +	if (!viommu->domain) {
> +		kfree(viommu);
> +		return -EFAULT;
> +	}
> +
> +	viommu->vfio = &vfio;
> +	vcontainer->iommu = viommu;
> +

No need for
  mutex_lock(&vfio.group_lock);

Ah, you already hold the lock when using this function.

> +	list_for_each(gpos, &vfio.group_list) {
> +		vgroup = list_entry(gpos, struct vfio_group, next);
> +		if (vgroup->container != vcontainer)
> +			continue;
> +
> +		list_for_each(dpos, &vgroup->device_list) {
> +			int ret;
> +
> +			vdev = list_entry(dpos, struct vfio_device, next);
> +
> +			ret = iommu_attach_device(viommu->domain, vdev->dev);
> +			if (ret) {
> +				__vfio_close_iommu(vcontainer);
> +				return ret;
> +			}
> +			vdev->iommu = viommu;
> +		}
> +	}
> +
> +	if (!allow_unsafe_intrs &&
> +	    !iommu_domain_has_cap(viommu->domain, IOMMU_CAP_INTR_REMAP)) {
> +		__vfio_close_iommu(vcontainer);
> +		return -EFAULT;
> +	}
> +
> +	return 0;
> +}
> +
> +static int vfio_group_merge(struct vfio_group *vgroup, int fd)
> +{
> +	struct vfio_group *vgroup2;
> +	struct iommu_domain *domain;
> +	struct list_head *pos;
> +	struct file *file;
> +	int ret = 0;
> +
> +	mutex_lock(&vfio.group_lock);
> +
> +	file = fget(fd);
> +	if (!file) {
> +		ret = -EBADF;
> +		goto out_noput;
> +	}
> +	if (file->f_op != &vfio_group_fops) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	vgroup2 = file->private_data;
> +	if (!vgroup2 || vgroup2 == vgroup || vgroup2->mm != vgroup->mm ||
> +	    (vgroup2->container->iommu && vgroup2->container->iommu->refcnt)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (!vgroup->container->iommu) {
> +		ret = __vfio_open_iommu(vgroup->container);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	if (!vgroup2->container->iommu) {
> +		ret = __vfio_open_iommu(vgroup2->container);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	if (iommu_domain_has_cap(vgroup->container->iommu->domain,
> +				 IOMMU_CAP_CACHE_COHERENCY) !=
> +	    iommu_domain_has_cap(vgroup2->container->iommu->domain,
> +				 IOMMU_CAP_CACHE_COHERENCY)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	ret = __vfio_close_iommu(vgroup2->container);
> +	if (ret)
> +		goto out;
> +
> +	domain = vgroup->container->iommu->domain;
> +
> +	list_for_each(pos, &vgroup2->device_list) {
> +		struct vfio_device *vdev;
> +
> +		vdev = list_entry(pos, struct vfio_device, next);
> +
> +		ret = iommu_attach_device(domain, vdev->dev);
> +		if (ret) {
> +			list_for_each(pos, &vgroup2->device_list) {
> +				struct vfio_device *vdev2;
> +
> +				vdev2 = list_entry(pos,
> +						   struct vfio_device, next);
> +				if (vdev2 == vdev)
> +					break;
> +
> +				iommu_detach_device(domain, vdev2->dev);
> +				vdev2->iommu = NULL;
> +			}
> +			goto out;
> +		}
> +		vdev->iommu = vgroup->container->iommu;
> +	}
> +
> +	kfree(vgroup2->container->read_buf);
> +	kfree(vgroup2->container);
> +
> +	vgroup2->container = vgroup->container;
> +	vgroup->container->refcnt++;
> +	vfio_container_reset_read(vgroup->container);
> +
> +out:
> +	fput(file);
> +out_noput:
> +	mutex_unlock(&vfio.group_lock);
> +	return ret;
> +}
> +
> +static int vfio_group_unmerge(struct vfio_group *vgroup, int fd)
> +{
> +	struct vfio_group *vgroup2;
> +	struct vfio_container *vcontainer2;
> +	struct vfio_device *vdev;
> +	struct list_head *pos;
> +	struct file *file;
> +	int ret = 0;
> +
> +	vcontainer2 = kzalloc(sizeof(*vcontainer2), GFP_KERNEL);
> +	if (!vcontainer2)
> +		return -ENOMEM;
> +
> +	mutex_lock(&vfio.group_lock);
> +
> +	file = fget(fd);
> +	if (!file) {
> +		ret = -EBADF;
> +		goto out_noput;
> +	}
> +	if (file->f_op != &vfio_group_fops) {

Hm, I think scripts/checkpatch.pl will not like that, but as
you said - it is RFC.

> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	vgroup2 = file->private_data;
> +	if (!vgroup2 || vgroup2 == vgroup ||
> +	    vgroup2->container != vgroup->container) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	list_for_each(pos, &vgroup2->device_list) {
> +		vdev = list_entry(pos, struct vfio_device, next);
> +		if (vdev->refcnt) {
> +			ret = -EBUSY;
> +			goto out;
> +		}
> +	}
> +
> +	list_for_each(pos, &vgroup2->device_list) {
> +		vdev = list_entry(pos, struct vfio_device, next);
> +		iommu_detach_device(vgroup->container->iommu->domain,
> +				    vdev->dev);
> +		vdev->iommu = NULL;
> +	}
> +
> +	vgroup2->container = vcontainer2;
> +	vcontainer2->refcnt++;
> +	vgroup->container->refcnt--;
> +	vfio_container_reset_read(vgroup->container);
> +out:
> +	fput(file);
> +out_noput:
> +	if (ret)
> +		kfree(vcontainer2);
> +	mutex_unlock(&vfio.group_lock);
> +	return ret;
> +}
> +
> +static int vfio_group_get_iommu_fd(struct vfio_group *vgroup)
> +{
> +	int ret = 0;
> +	struct vfio_iommu *viommu;
> +
> +	mutex_lock(&vfio.group_lock);
> +
> +	if (!vgroup->container->iommu) {
> +		ret = __vfio_open_iommu(vgroup->container);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	viommu = vgroup->container->iommu;
> +
> +	if (!viommu->file) {
> +		viommu->file = anon_inode_getfile("vfio-iommu",
> +						  &vfio_iommu_fops,
> +						  viommu, O_RDWR);
> +		if (IS_ERR(viommu->file)) {
> +			ret = PTR_ERR(viommu->file);
> +			viommu->file = NULL;
> +			goto out;
> +		}
> +	}
> +	ret = get_unused_fd();
> +	if (ret < 0)
> +		goto out;
> +
> +	fd_install(ret, viommu->file);
> +
> +	vgroup->container->iommu->refcnt++;
> +out:
> +	mutex_unlock(&vfio.group_lock);
> +	return ret;
> +}
> +
> +static int vfio_group_get_device_fd(struct vfio_group *vgroup, char *buf)
> +{
> +	struct vfio_container *vcontainer = vgroup->container;
> +	struct list_head *gpos, *dpos;
> +	int ret = -ENODEV;
> +
> +	mutex_lock(&vfio.group_lock);
> +
> +	if (!vcontainer->iommu) {
> +		ret = __vfio_open_iommu(vcontainer);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	list_for_each(gpos, &vfio.group_list) {
> +		vgroup = list_entry(gpos, struct vfio_group, next);
> +		if (vgroup->container != vcontainer)
> +			continue;
> +
> +		list_for_each(dpos, &vgroup->device_list) {
> +			struct vfio_device *vdev;
> +			char buf2[MAX_PATH];
> +
> +			vdev = list_entry(dpos, struct vfio_device, next);
> +
> +			snprintf(buf2, MAX_PATH, "%s", dev_name(vdev->dev));
> +
> +			if (!strncmp(buf, buf2, MAX_PATH)) {
> +				if (!vdev->file) {
> +					vdev->file = anon_inode_getfile(
> +							"vfio-device",
> +							&vfio_device_fops,
> +							vdev, O_RDWR);
> +					if (IS_ERR(vdev->file)) {
> +						ret = PTR_ERR(vdev->file);
> +						vdev->file = NULL;
> +						goto out;
> +					}
> +				}
> +				ret = get_unused_fd();
> +				if (ret < 0)
> +					goto out;
> +
> +				fd_install(ret, vdev->file);
> +
> +				vdev->refcnt++;
> +				vcontainer->iommu->refcnt++;
> +				goto out;
> +			}
> +		}
> +	}
> +out:
> +	mutex_unlock(&vfio.group_lock);
> +	return ret;
> +}
> +
> +static long vfio_group_unl_ioctl(struct file *filep,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct vfio_group *vgroup = filep->private_data;
> +
> +	if (vgroup->mm != current->mm)
> +		return -EIO;
> +
> +	switch (cmd) {
> +	case VFIO_GROUP_MERGE:
> +	case VFIO_GROUP_UNMERGE:
> +		{
> +			int fd;
> +		
> +			if (get_user(fd, (int __user *)arg))
> +				return -EFAULT;
> +			if (fd < 0)
> +				return -EINVAL;
> +
> +			if (cmd == VFIO_GROUP_MERGE)
> +				return vfio_group_merge(vgroup, fd);
> +			else
> +				return vfio_group_unmerge(vgroup, fd);
> +		}
> +	case VFIO_GROUP_GET_IOMMU_FD:
> +		return vfio_group_get_iommu_fd(vgroup);
> +	case VFIO_GROUP_GET_DEVICE_FD:
> +		{
> +			char *buf;
> +			int ret;
> +
> +			buf = strndup_user((const char __user *)arg, MAX_PATH);
> +			if (IS_ERR(buf))
> +				return PTR_ERR(buf);
> +
> +			ret = vfio_group_get_device_fd(vgroup, buf);
> +			kfree(buf);
> +			return ret;
> +		}
> +	}
> +	return -ENOSYS;
> +}
> +
> +
> +#ifdef CONFIG_COMPAT
> +static long vfio_group_compat_ioctl(struct file *filep,
> +				    unsigned int cmd, unsigned long arg)
> +{
> +	arg = (unsigned long)compat_ptr(arg);
> +	return vfio_group_unl_ioctl(filep, cmd, arg);
> +}
> +#endif	/* CONFIG_COMPAT */
> +
> +static int vfio_group_open(struct inode *inode, struct file *filep)
> +{
> +	struct vfio_group *vgroup;
> +	int ret = 0;
> +
> +	mutex_lock(&vfio.group_lock);
> +
> +	vgroup = idr_find(&vfio.idr, iminor(inode));
> +
> +	if (!vgroup) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
> +
> +	if (!vgroup->refcnt) {
> +		struct vfio_container *vcontainer;
> +		vcontainer = kzalloc(sizeof(*vcontainer), GFP_KERNEL);
> +		if (!vcontainer) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +		vgroup->container = vcontainer;
> +		vgroup->mm = current->mm;
> +	} else if (current->mm != vgroup->mm) {
> +		ret = -EBUSY;
> +		goto out;
> +	}
> +	filep->private_data = vgroup;
> +	vgroup->refcnt++;
> +	vgroup->container->refcnt++;
> +out:
> +	mutex_unlock(&vfio.group_lock);
> +
> +	return ret;
> +}
> +
> +static int vfio_group_release(struct inode *inode, struct file *filep)
> +{
> +	struct vfio_group *vgroup = filep->private_data;
> +	struct vfio_container *vcontainer = vgroup->container;
> +	struct list_head *pos;
> +	int ret = 0;
> +
> +	mutex_lock(&vfio.group_lock);
> +
> +	if (vgroup->refcnt > 1) {
> +		vgroup->refcnt--;
> +		vcontainer->refcnt--;
> +		goto out;
> +	}
> +
> +	list_for_each(pos, &vgroup->device_list) {
> +		struct vfio_device *vdev;
> +		vdev = list_entry(pos, struct vfio_device, next);
> +		if (vdev->refcnt) {
> +			ret = -EBUSY;
> +			goto out;
> +		}
> +	}
> +
> +	/* Merged group? */
> +	if (vcontainer->refcnt > 1) {
> +		if (vcontainer->iommu) {
> +			list_for_each(pos, &vgroup->device_list) {
> +				struct vfio_device *vdev;
> +				vdev = list_entry(pos,
> +						  struct vfio_device, next);
> +				iommu_detach_device(vcontainer->iommu->domain,
> +						    vdev->dev);
> +				vdev->iommu = NULL;
> +			}
> +		}
> +		vcontainer->refcnt--;
> +		vfio_container_reset_read(vcontainer);
> +	} else {
> +		if (vcontainer->iommu && vcontainer->iommu->refcnt) {
> +			ret = -EBUSY;
> +			goto out;
> +		}
> +
> +		ret = __vfio_close_iommu(vcontainer);
> +		if (ret)
> +			goto out;
> +
> +		kfree(vcontainer->read_buf);
> +		kfree(vcontainer);
> +	}
> +
> +	vgroup->refcnt--;
> +	vgroup->mm = NULL;
> +	vgroup->container = NULL;
> +
> +	/* Possible we had the group open while device members were removed */
> +	if (list_empty(&vgroup->device_list)) {
> +		device_destroy(vfio.class, vgroup->devt);
> +		idr_remove(&vfio.idr, MINOR(vgroup->devt));
> +		list_del(&vgroup->next);
> +		kfree(vgroup);
> +	}
> +out:
> +	mutex_unlock(&vfio.group_lock);
> +	return 0;
> +}
> +
> +static int __vfio_container_create_read_buf(struct vfio_container *vcontainer)
> +{
> +	struct list_head *gpos, *dpos;
> +	struct vfio_group *vgroup;
> +	struct vfio_device *vdev;
> +	int off = 0;
> +	char *buf;
> +
> +	buf = kzalloc(MAX_PATH, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	list_for_each(gpos, &vfio.group_list) {
> +		vgroup = list_entry(gpos, struct vfio_group, next);
> +		if (vgroup->container != vcontainer)
> +			continue;
> +
> +		off += snprintf(buf + off, MAX_PATH,
> +				"group: %u\n", vgroup->group);
> +		buf = krealloc(buf, off + MAX_PATH, GFP_KERNEL);
> +		if (!buf)
> +			return -ENOMEM;
> +		memset(buf + off, 0, MAX_PATH);
> +
> +		list_for_each(dpos, &vgroup->device_list) {
> +			vdev = list_entry(dpos, struct vfio_device, next);
> +
> +			off += snprintf(buf + off, MAX_PATH,
> +					"device: %s\n", dev_name(vdev->dev));
> +			buf = krealloc(buf, off + MAX_PATH, GFP_KERNEL);
> +			if (!buf)
> +				return -ENOMEM;
> +			memset(buf + off, 0, MAX_PATH);
> +		}
> +	}
> +	buf = krealloc(buf, off + 1, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	vcontainer->read_buf = buf;
> +	return 0;
> +}
> +
> +static ssize_t vfio_group_read(struct file *filep, char __user *buf,
> +			       size_t count, loff_t *ppos)
> +{
> +	struct vfio_group *vgroup = filep->private_data;
> +	struct vfio_container *vcontainer;
> +	ssize_t ret = 0;
> +
> +	mutex_lock(&vfio.group_lock);
> +
> +	vcontainer = vgroup->container;
> +
> +	if (!vcontainer) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (!vcontainer->read_buf) {
> +		ret = __vfio_container_create_read_buf(vcontainer);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	if (*ppos >= strlen(vcontainer->read_buf) + 1) {
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	if (*ppos + count > strlen(vcontainer->read_buf) + 1)
> +		count = strlen(vcontainer->read_buf) + 1 - *ppos;
> +
> +	if (copy_to_user(buf, vcontainer->read_buf + *ppos, count)) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	*ppos += count;
> +	ret = count;
> +out:
> +	mutex_unlock(&vfio.group_lock);
> +	return ret;
> +}
> +
> +static const struct file_operations vfio_group_fops = {
> +	.owner		= THIS_MODULE,
> +	.open		= vfio_group_open,
> +	.release	= vfio_group_release,
> +	.read		= vfio_group_read,
> +	.unlocked_ioctl	= vfio_group_unl_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= vfio_group_compat_ioctl,
> +#endif
> +};
> +
> +static void vfio_class_release(struct kref *kref)
> +{
> +	class_destroy(vfio.class);
> +	vfio.class = NULL;
> +}
> +
> +static char *vfio_devnode(struct device *dev, mode_t *mode)
> +{
> +	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
> +}
> +
> +static int __init vfio_init(void)
> +{
> +	int ret;
> +
> +	idr_init(&vfio.idr);
> +	mutex_init(&vfio.group_lock);
> +	INIT_LIST_HEAD(&vfio.group_list);
> +
> +	kref_init(&vfio.kref);
> +	vfio.class = class_create(THIS_MODULE, "vfio");
> +	if (IS_ERR(vfio.class)) {
> +		ret = PTR_ERR(vfio.class);
> +		goto err_class;
> +	}
> +
> +	vfio.class->devnode = vfio_devnode;
> +
> +	/* FIXME - how many minors to allocate... all of them! */
> +	ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
> +	if (ret)
> +		goto err_chrdev;
> +
> +	cdev_init(&vfio.cdev, &vfio_group_fops);
> +	ret = cdev_add(&vfio.cdev, vfio.devt, MINORMASK);
> +	if (ret)
> +		goto err_cdev;
> +
> +	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
> +
> +	return 0;
> +
> +err_cdev:
> +	unregister_chrdev_region(vfio.devt, MINORMASK);
> +err_chrdev:
> +	kref_put(&vfio.kref, vfio_class_release);
> +err_class:
> +	return ret;
> +}
> +
> +static void __exit vfio_cleanup(void)
> +{
> +	struct list_head *gpos, *gppos;
> +
> +	list_for_each_safe(gpos, gppos, &vfio.group_list) {
> +		struct vfio_group *vgroup;
> +		struct list_head *dpos, *dppos;
> +
> +		vgroup = list_entry(gpos, struct vfio_group, next);
> +
> +		list_for_each_safe(dpos, dppos, &vgroup->device_list) {
> +			struct vfio_device *vdev;
> +
> +			vdev = list_entry(dpos, struct vfio_device, next);
> +			vfio_group_del_dev(vdev->dev);
> +		}
> +	}
> +
> +	idr_destroy(&vfio.idr);
> +	cdev_del(&vfio.cdev);
> +	unregister_chrdev_region(vfio.devt, MINORMASK);
> +	kref_put(&vfio.kref, vfio_class_release);
> +}
> +
> +module_init(vfio_init);
> +module_exit(vfio_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/drivers/vfio/vfio_private.h b/drivers/vfio/vfio_private.h
> new file mode 100644
> index 0000000..2cc300c
> --- /dev/null
> +++ b/drivers/vfio/vfio_private.h
> @@ -0,0 +1,82 @@
> +/*
> + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio:
> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> + * Author: Tom Lyon, pugs@cisco.com
> + */
> +
> +#include <linux/cdev.h>
> +#include <linux/device.h>
> +#include <linux/file.h>
> +#include <linux/fs.h>
> +#include <linux/idr.h>
> +#include <linux/iommu.h>
> +#include <linux/list.h>
> +#include <linux/mm.h>
> +#include <linux/mutex.h>
> +
> +#ifndef VFIO_PRIVATE_H
> +#define VFIO_PRIVATE_H
> +
> +extern const struct file_operations vfio_iommu_fops;
> +extern const struct file_operations vfio_device_fops;
> +
> +struct vfio {
> +	dev_t			devt;
> +	struct cdev		cdev;
> +	struct list_head	group_list;
> +	struct mutex		group_lock;
> +	struct kref		kref;
> +	struct class		*class;
> +	struct idr		idr;
> +};
> +
> +struct vfio_device_ops {
> +	struct vfio_device	*(* new)(struct device *);
> +	void			(* free)(struct vfio_device *);
> +	struct file_operations	fops;
> +};
> +
> +struct vfio_iommu {
> +	struct iommu_domain	*domain;
> +	struct vfio		*vfio;
> +	int			refcnt;
> +	struct file		*file;
> +};
> +
> +struct vfio_device {
> +	struct device		*dev;
> +	struct list_head	next;
> +	struct file		*file;
> +	struct vfio_device_ops	*ops;
> +	struct vfio		*vfio;
> +	struct vfio_iommu	*iommu;
> +	int			refcnt;
> +};
> +
> +struct vfio_container {
> +	struct vfio_iommu	*iommu;
> +	char			*read_buf;
> +	int			refcnt;
> +};
> +
> +struct vfio_group {
> +	dev_t			devt;
> +	unsigned int		group;
> +	int			refcnt;
> +	struct mm_struct	*mm;
> +	struct vfio_container	*container;
> +	struct list_head	device_list;
> +	struct list_head	next;
> +};
> +
> +extern int vfio_group_add_dev(struct device *dev, void *data);
> +extern void vfio_group_del_dev(struct device *dev);
> +
> +#endif /* VFIO_PRIVATE_H */
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alex Williamson - Sept. 19, 2011, 4:42 p.m.
Sorry for the delay, just getting back from LPC and some time off...

On Wed, 2011-09-07 at 10:52 -0400, Konrad Rzeszutek Wilk wrote:
> > +static long vfio_iommu_unl_ioctl(struct file *filep,
> > +				 unsigned int cmd, unsigned long arg)
> > +{
> > +	struct vfio_iommu *viommu = filep->private_data;
> > +	struct vfio_dma_map dm;
> > +	int ret = -ENOSYS;
> > +
> > +	switch (cmd) {
> > +	case VFIO_IOMMU_MAP_DMA:
> > +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> > +			return -EFAULT;
> > +		ret = 0; // XXX - Do something
> 
> <chuckles>

Truly an RFC ;)

> > +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> > +			ret = -EFAULT;
> > +		break;
> > +
> > +	case VFIO_IOMMU_UNMAP_DMA:
> > +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> > +			return -EFAULT;
> > +		ret = 0; // XXX - Do something
> > +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> > +			ret = -EFAULT;
> > +		break;
> > +	}
> > +	return ret;
> > +}
> > +
> > +#ifdef CONFIG_COMPAT
> > +static long vfio_iommu_compat_ioctl(struct file *filep,
> > +				    unsigned int cmd, unsigned long arg)
> > +{
> > +	arg = (unsigned long)compat_ptr(arg);
> > +	return vfio_iommu_unl_ioctl(filep, cmd, arg);
> > +}
> > +#endif	/* CONFIG_COMPAT */
> > +
> > +const struct file_operations vfio_iommu_fops = {
> > +	.owner		= THIS_MODULE,
> > +	.release	= vfio_iommu_release,
> > +	.unlocked_ioctl	= vfio_iommu_unl_ioctl,
> > +#ifdef CONFIG_COMPAT
> > +	.compat_ioctl	= vfio_iommu_compat_ioctl,
> > +#endif
> > +};
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> .. snip..
> > +int vfio_group_add_dev(struct device *dev, void *data)
> > +{
> > +	struct vfio_device_ops *ops = data;
> > +	struct list_head *pos;
> > +	struct vfio_group *vgroup = NULL;
> > +	struct vfio_device *vdev = NULL;
> > +	unsigned int group;
> > +	int ret = 0, new_group = 0;
> 
> 'new_group' should probably be 'bool'.

ok

> > +
> > +	if (iommu_device_group(dev, &group))
> > +		return 0;
> 
> -EEXIST?

I think I made this return 0 because it's called from device add
notifiers and from walking device lists.  It's ok for it to fail, not all
devices have to be backed by an iommu, they just won't show up in vfio.
Maybe I should leave that to the leaf callers though.  EINVAL is
probably more appropriate.

> > +
> > +	mutex_lock(&vfio.group_lock);
> > +
> > +	list_for_each(pos, &vfio.group_list) {
> > +		vgroup = list_entry(pos, struct vfio_group, next);
> > +		if (vgroup->group == group)
> > +			break;
> > +		vgroup = NULL;
> > +	}
> > +
> > +	if (!vgroup) {
> > +		int id;
> > +
> > +		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +		vgroup = kzalloc(sizeof(*vgroup), GFP_KERNEL);
> > +		if (!vgroup) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +
> > +		vgroup->group = group;
> > +		INIT_LIST_HEAD(&vgroup->device_list);
> > +
> > +		ret = idr_get_new(&vfio.idr, vgroup, &id);
> > +		if (ret == 0 && id > MINORMASK) {
> > +			idr_remove(&vfio.idr, id);
> > +			kfree(vgroup);
> > +			ret = -ENOSPC;
> > +			goto out;
> > +		}
> > +
> > +		vgroup->devt = MKDEV(MAJOR(vfio.devt), id);
> > +		list_add(&vgroup->next, &vfio.group_list);
> > +		device_create(vfio.class, NULL, vgroup->devt,
> > +			      vgroup, "%u", group);
> > +
> > +		new_group = 1;
> > +	} else {
> > +		list_for_each(pos, &vgroup->device_list) {
> > +			vdev = list_entry(pos, struct vfio_device, next);
> > +			if (vdev->dev == dev)
> > +				break;
> > +			vdev = NULL;
> > +		}
> > +	}
> > +
> > +	if (!vdev) {
> > +		/* Adding a device for a group that's already in use? */
> > +		/* Maybe we should attach to the domain so others can't */
> > +		BUG_ON(vgroup->container &&
> > +		       vgroup->container->iommu &&
> > +		       vgroup->container->iommu->refcnt);
> > +
> > +		vdev = ops->new(dev);
> > +		if (IS_ERR(vdev)) {
> > +			/* If we just created this vgroup, tear it down */
> > +			if (new_group) {
> > +				device_destroy(vfio.class, vgroup->devt);
> > +				idr_remove(&vfio.idr, MINOR(vgroup->devt));
> > +				list_del(&vgroup->next);
> > +				kfree(vgroup);
> > +			}
> > +			ret = PTR_ERR(vdev);
> > +			goto out;
> > +		}
> > +		list_add(&vdev->next, &vgroup->device_list);
> > +		vdev->dev = dev;
> > +		vdev->ops = ops;
> > +		vdev->vfio = &vfio;
> > +	}
> > +out:
> > +	mutex_unlock(&vfio.group_lock);
> > +	return ret;
> > +}
> > +
> > +void vfio_group_del_dev(struct device *dev)
> > +{
> > +	struct list_head *pos;
> > +	struct vfio_container *vcontainer;
> > +	struct vfio_group *vgroup = NULL;
> > +	struct vfio_device *vdev = NULL;
> > +	unsigned int group;
> > +
> > +	if (iommu_device_group(dev, &group))
> > +		return;
> > +
> > +	mutex_lock(&vfio.group_lock);
> > +
> > +	list_for_each(pos, &vfio.group_list) {
> > +		vgroup = list_entry(pos, struct vfio_group, next);
> > +		if (vgroup->group == group)
> > +			break;
> > +		vgroup = NULL;
> > +	}
> > +
> > +	if (!vgroup)
> > +		goto out;
> > +
> > +	vcontainer = vgroup->container;
> > +
> > +	list_for_each(pos, &vgroup->device_list) {
> > +		vdev = list_entry(pos, struct vfio_device, next);
> > +		if (vdev->dev == dev)
> > +			break;
> > +		vdev = NULL;
> > +	}
> > +
> > +	if (!vdev)
> > +		goto out;
> > +
> > +	/* XXX Did a device we're using go away? */
> > +	BUG_ON(vdev->refcnt);
> > +
> > +	if (vcontainer && vcontainer->iommu) {
> > +		iommu_detach_device(vcontainer->iommu->domain, vdev->dev);
> > +		vfio_container_reset_read(vcontainer);
> > +	}
> > +
> > +	list_del(&vdev->next);
> > +	vdev->ops->free(vdev);
> > +
> > +	if (list_empty(&vgroup->device_list) && vgroup->refcnt == 0) {
> > +		device_destroy(vfio.class, vgroup->devt);
> > +		idr_remove(&vfio.idr, MINOR(vgroup->devt));
> > +		list_del(&vgroup->next);
> > +		kfree(vgroup);
> > +	}
> > +out:
> > +	mutex_unlock(&vfio.group_lock);
> > +}
> > +
> > +static int __vfio_group_viable(struct vfio_container *vcontainer)
> 
> Just return 'bool'

Sure

> > +{
> > +	struct list_head *gpos, *dpos;
> > +
> > +	list_for_each(gpos, &vfio.group_list) {
> > +		struct vfio_group *vgroup;
> > +		vgroup = list_entry(gpos, struct vfio_group, next);
> > +		if (vgroup->container != vcontainer)
> > +			continue;
> > +
> > +		list_for_each(dpos, &vgroup->device_list) {
> > +			struct vfio_device *vdev;
> > +			vdev = list_entry(dpos, struct vfio_device, next);
> > +
> > +			if (!vdev->dev->driver ||
> > +			    vdev->dev->driver->owner != THIS_MODULE)
> > +				return 0;
> > +		}
> > +	}
> > +	return 1;
> > +}
> > +
> > +static int __vfio_close_iommu(struct vfio_container *vcontainer)
> > +{
> > +	struct list_head *gpos, *dpos;
> > +	struct vfio_iommu *viommu = vcontainer->iommu;
> > +	struct vfio_group *vgroup;
> > +	struct vfio_device *vdev;
> > +
> > +	if (!viommu)
> > +		return 0;
> > +
> > +	if (viommu->refcnt)
> > +		return -EBUSY;
> > +
> > +	list_for_each(gpos, &vfio.group_list) {
> > +		vgroup = list_entry(gpos, struct vfio_group, next);
> > +		if (vgroup->container != vcontainer)
> > +			continue;
> > +
> > +		list_for_each(dpos, &vgroup->device_list) {
> > +			vdev = list_entry(dpos, struct vfio_device, next);
> > +			iommu_detach_device(viommu->domain, vdev->dev);
> > +			vdev->iommu = NULL;
> > +		}
> > +	}
> > +	iommu_domain_free(viommu->domain);
> > +	kfree(viommu);
> > +	vcontainer->iommu = NULL;
> > +	return 0;
> > +}
> > +
> > +static int __vfio_open_iommu(struct vfio_container *vcontainer)
> > +{
> > +	struct list_head *gpos, *dpos;
> > +	struct vfio_iommu *viommu;
> > +	struct vfio_group *vgroup;
> > +	struct vfio_device *vdev;
> > +
> > +	if (!__vfio_group_viable(vcontainer))
> > +		return -EBUSY;
> > +
> > +	viommu = kzalloc(sizeof(*viommu), GFP_KERNEL);
> > +	if (!viommu)
> > +		return -ENOMEM;
> > +
> > +	viommu->domain = iommu_domain_alloc();
> > +	if (!viommu->domain) {
> > +		kfree(viommu);
> > +		return -EFAULT;
> > +	}
> > +
> > +	viommu->vfio = &vfio;
> > +	vcontainer->iommu = viommu;
> > +
> 
> No need for
>   mutex_lock(&vfio.group_lock);
> 
> Ah, you already hold the lock when using this function.

Right, just really simple, broad locking right now.

> > +	list_for_each(gpos, &vfio.group_list) {
> > +		vgroup = list_entry(gpos, struct vfio_group, next);
> > +		if (vgroup->container != vcontainer)
> > +			continue;
> > +
> > +		list_for_each(dpos, &vgroup->device_list) {
> > +			int ret;
> > +
> > +			vdev = list_entry(dpos, struct vfio_device, next);
> > +
> > +			ret = iommu_attach_device(viommu->domain, vdev->dev);
> > +			if (ret) {
> > +				__vfio_close_iommu(vcontainer);
> > +				return ret;
> > +			}
> > +			vdev->iommu = viommu;
> > +		}
> > +	}
> > +
> > +	if (!allow_unsafe_intrs &&
> > +	    !iommu_domain_has_cap(viommu->domain, IOMMU_CAP_INTR_REMAP)) {
> > +		__vfio_close_iommu(vcontainer);
> > +		return -EFAULT;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int vfio_group_merge(struct vfio_group *vgroup, int fd)
> > +{
> > +	struct vfio_group *vgroup2;
> > +	struct iommu_domain *domain;
> > +	struct list_head *pos;
> > +	struct file *file;
> > +	int ret = 0;
> > +
> > +	mutex_lock(&vfio.group_lock);
> > +
> > +	file = fget(fd);
> > +	if (!file) {
> > +		ret = -EBADF;
> > +		goto out_noput;
> > +	}
> > +	if (file->f_op != &vfio_group_fops) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	vgroup2 = file->private_data;
> > +	if (!vgroup2 || vgroup2 == vgroup || vgroup2->mm != vgroup->mm ||
> > +	    (vgroup2->container->iommu && vgroup2->container->iommu->refcnt)) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	if (!vgroup->container->iommu) {
> > +		ret = __vfio_open_iommu(vgroup->container);
> > +		if (ret)
> > +			goto out;
> > +	}
> > +
> > +	if (!vgroup2->container->iommu) {
> > +		ret = __vfio_open_iommu(vgroup2->container);
> > +		if (ret)
> > +			goto out;
> > +	}
> > +
> > +	if (iommu_domain_has_cap(vgroup->container->iommu->domain,
> > +				 IOMMU_CAP_CACHE_COHERENCY) !=
> > +	    iommu_domain_has_cap(vgroup2->container->iommu->domain,
> > +				 IOMMU_CAP_CACHE_COHERENCY)) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	ret = __vfio_close_iommu(vgroup2->container);
> > +	if (ret)
> > +		goto out;
> > +
> > +	domain = vgroup->container->iommu->domain;
> > +
> > +	list_for_each(pos, &vgroup2->device_list) {
> > +		struct vfio_device *vdev;
> > +
> > +		vdev = list_entry(pos, struct vfio_device, next);
> > +
> > +		ret = iommu_attach_device(domain, vdev->dev);
> > +		if (ret) {
> > +			list_for_each(pos, &vgroup2->device_list) {
> > +				struct vfio_device *vdev2;
> > +
> > +				vdev2 = list_entry(pos,
> > +						   struct vfio_device, next);
> > +				if (vdev2 == vdev)
> > +					break;
> > +
> > +				iommu_detach_device(domain, vdev2->dev);
> > +				vdev2->iommu = NULL;
> > +			}
> > +			goto out;
> > +		}
> > +		vdev->iommu = vgroup->container->iommu;
> > +	}
> > +
> > +	kfree(vgroup2->container->read_buf);
> > +	kfree(vgroup2->container);
> > +
> > +	vgroup2->container = vgroup->container;
> > +	vgroup->container->refcnt++;
> > +	vfio_container_reset_read(vgroup->container);
> > +
> > +out:
> > +	fput(file);
> > +out_noput:
> > +	mutex_unlock(&vfio.group_lock);
> > +	return ret;
> > +}
> > +
> > +static int vfio_group_unmerge(struct vfio_group *vgroup, int fd)
> > +{
> > +	struct vfio_group *vgroup2;
> > +	struct vfio_container *vcontainer2;
> > +	struct vfio_device *vdev;
> > +	struct list_head *pos;
> > +	struct file *file;
> > +	int ret = 0;
> > +
> > +	vcontainer2 = kzalloc(sizeof(*vcontainer2), GFP_KERNEL);
> > +	if (!vcontainer2)
> > +		return -ENOMEM;
> > +
> > +	mutex_lock(&vfio.group_lock);
> > +
> > +	file = fget(fd);
> > +	if (!file) {
> > +		ret = -EBADF;
> > +		goto out_noput;
> > +	}
> > +	if (file->f_op != &vfio_group_fops) {
> 
> Hm, I think scripts/checkpatch.pl will not like that, but as
> you said - it is RFC.

Will check

Thanks for the review!

Alex

Patch

diff --git a/drivers/Kconfig b/drivers/Kconfig
index 3bb154d..5b5fffc 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -112,6 +112,8 @@  source "drivers/auxdisplay/Kconfig"
 
 source "drivers/uio/Kconfig"
 
+source "drivers/vfio/Kconfig"
+
 source "drivers/vlynq/Kconfig"
 
 source "drivers/xen/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 09f3232..6b17848 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -56,6 +56,7 @@  obj-$(CONFIG_ATM)		+= atm/
 obj-$(CONFIG_FUSION)		+= message/
 obj-y				+= firewire/
 obj-$(CONFIG_UIO)		+= uio/
+obj-$(CONFIG_VFIO)		+= vfio/
 obj-y				+= cdrom/
 obj-y				+= auxdisplay/
 obj-$(CONFIG_PCCARD)		+= pcmcia/
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
new file mode 100644
index 0000000..a150521
--- /dev/null
+++ b/drivers/vfio/Kconfig
@@ -0,0 +1,5 @@ 
+menuconfig VFIO
+	tristate "Non-Privileged User Space driver"
+	depends on IOMMU_API
+	help
+	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
new file mode 100644
index 0000000..5eaa074
--- /dev/null
+++ b/drivers/vfio/Makefile
@@ -0,0 +1,3 @@ 
+obj-$(CONFIG_VFIO) := vfio.o
+
+vfio-y := vfio_main.o vfio_iommu.o vfio_device.o
diff --git a/drivers/vfio/vfio_device.c b/drivers/vfio/vfio_device.c
new file mode 100644
index 0000000..101cbbf
--- /dev/null
+++ b/drivers/vfio/vfio_device.c
@@ -0,0 +1,109 @@ 
+/*
+ * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+/*
+ * VFIO device module: Common device handling and callouts to other drivers
+ */
+
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/eventfd.h>
+#include <linux/uaccess.h>
+#include <linux/compat.h>
+#include <linux/vfio.h>
+
+#include "vfio_private.h"
+
+/*
+ * Release handler for a VFIO device file.  Drops one reference on the
+ * device and one on the IOMMU context it is attached to.
+ */
+static int vfio_device_release(struct inode *inode, struct file *filep)
+{
+	struct vfio_device *device = filep->private_data;
+	struct mutex *lock = &device->vfio->group_lock;
+
+	/* Both counters are modified under the global group lock. */
+	mutex_lock(lock);
+	device->iommu->refcnt--;
+	device->refcnt--;
+	mutex_unlock(lock);
+
+	return 0;
+}
+
+/*
+ * ioctl entry point for a VFIO device file.  No command is handled in
+ * common code yet, so everything is forwarded to the bus driver's
+ * unlocked_ioctl callback.  Returns -EINVAL when the driver provides none.
+ */
+static long vfio_device_unl_ioctl(struct file *filep,
+				  unsigned int cmd, unsigned long arg)
+{
+	struct vfio_device *device = filep->private_data;
+	int ret;
+
+	/* TBD - what can we handle as common device ioctls? */
+	if (!device->ops->fops.unlocked_ioctl)
+		return -EINVAL;
+
+	ret = device->ops->fops.unlocked_ioctl(filep, cmd, arg);
+	return ret;
+}
+
+/*
+ * read() on a device file: delegated to the bus driver's read callback,
+ * or -EINVAL when the driver does not implement one.
+ */
+static ssize_t vfio_device_read(struct file *filep, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	struct vfio_device *device = filep->private_data;
+
+	if (!device->ops->fops.read)
+		return -EINVAL;
+
+	return device->ops->fops.read(filep, buf, count, ppos);
+}
+
+/*
+ * write() on a device file: delegated to the bus driver's write callback,
+ * or -EINVAL when the driver does not implement one.
+ */
+static ssize_t vfio_device_write(struct file *filep, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	struct vfio_device *device = filep->private_data;
+
+	if (!device->ops->fops.write)
+		return -EINVAL;
+
+	return device->ops->fops.write(filep, buf, count, ppos);
+}
+
+/*
+ * mmap() on a device file: delegated to the bus driver's mmap callback,
+ * or -EINVAL when the driver does not implement one.
+ */
+static int vfio_device_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+	struct vfio_device *device = filep->private_data;
+
+	if (!device->ops->fops.mmap)
+		return -EINVAL;
+
+	return device->ops->fops.mmap(filep, vma);
+}
+	
+#ifdef CONFIG_COMPAT
+/*
+ * Compat ioctl entry point: converts the argument with compat_ptr() and
+ * reuses the native handler.
+ */
+static long vfio_device_compat_ioctl(struct file *filep,
+				     unsigned int cmd, unsigned long arg)
+{
+	unsigned long native_arg = (unsigned long)compat_ptr(arg);
+
+	return vfio_device_unl_ioctl(filep, cmd, native_arg);
+}
+#endif	/* CONFIG_COMPAT */
+
+/*
+ * File operations for VFIO device files.  Each handler looks up the
+ * vfio_device in filep->private_data; read/write/ioctl/mmap forward to
+ * the callbacks supplied by the bus driver in vdev->ops->fops.
+ */
+const struct file_operations vfio_device_fops = {
+	.owner		= THIS_MODULE,
+	.release	= vfio_device_release,
+	.read		= vfio_device_read,
+	.write		= vfio_device_write,
+	.unlocked_ioctl	= vfio_device_unl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= vfio_device_compat_ioctl,
+#endif
+	.mmap		= vfio_device_mmap,
+};
diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
new file mode 100644
index 0000000..1a6f321
--- /dev/null
+++ b/drivers/vfio/vfio_iommu.c
@@ -0,0 +1,81 @@ 
+/*
+ * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+/*
+ * VFIO iommu module: iommu fd callbacks
+ */
+
+#include <linux/compat.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+#include "vfio_private.h"
+
+/*
+ * Release handler for a VFIO iommu file: drops one reference on the
+ * IOMMU context under the global group lock.
+ */
+static int vfio_iommu_release(struct inode *inode, struct file *filep)
+{
+	struct vfio_iommu *ctx = filep->private_data;
+	struct mutex *lock = &ctx->vfio->group_lock;
+
+	mutex_lock(lock);
+	ctx->refcnt--;
+	mutex_unlock(lock);
+
+	return 0;
+}
+
+/*
+ * ioctl entry point for a VFIO iommu file.
+ *
+ * VFIO_IOMMU_MAP_DMA and VFIO_IOMMU_UNMAP_DMA currently only round-trip a
+ * struct vfio_dma_map through kernel memory; the actual mapping work is
+ * still to be written (see the XXX markers).
+ *
+ * Returns 0 on success, -EFAULT if the user buffer cannot be copied, and
+ * -ENOSYS for any other command.
+ */
+static long vfio_iommu_unl_ioctl(struct file *filep,
+				 unsigned int cmd, unsigned long arg)
+{
+	void __user *uarg = (void __user *)arg;
+	struct vfio_dma_map dm;
+	int ret = -ENOSYS;
+
+	switch (cmd) {
+	case VFIO_IOMMU_MAP_DMA:
+		if (copy_from_user(&dm, uarg, sizeof(dm)))
+			return -EFAULT;
+		ret = 0; /* XXX - Do something */
+		if (!ret && copy_to_user(uarg, &dm, sizeof(dm)))
+			ret = -EFAULT;
+		break;
+
+	case VFIO_IOMMU_UNMAP_DMA:
+		if (copy_from_user(&dm, uarg, sizeof(dm)))
+			return -EFAULT;
+		ret = 0; /* XXX - Do something */
+		if (!ret && copy_to_user(uarg, &dm, sizeof(dm)))
+			ret = -EFAULT;
+		break;
+
+	default:
+		/* Unknown command: fall out with -ENOSYS. */
+		break;
+	}
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat ioctl entry point: converts the argument with compat_ptr() and
+ * reuses the native handler.
+ */
+static long vfio_iommu_compat_ioctl(struct file *filep,
+				    unsigned int cmd, unsigned long arg)
+{
+	unsigned long native_arg = (unsigned long)compat_ptr(arg);
+
+	return vfio_iommu_unl_ioctl(filep, cmd, native_arg);
+}
+#endif	/* CONFIG_COMPAT */
+
+/*
+ * File operations for the VFIO iommu file; filep->private_data holds the
+ * struct vfio_iommu the file refers to.
+ */
+const struct file_operations vfio_iommu_fops = {
+	.owner		= THIS_MODULE,
+	.release	= vfio_iommu_release,
+	.unlocked_ioctl	= vfio_iommu_unl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= vfio_iommu_compat_ioctl,
+#endif
+};
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
new file mode 100644
index 0000000..7f05692
--- /dev/null
+++ b/drivers/vfio/vfio_main.c
@@ -0,0 +1,879 @@ 
+/*
+ * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+/*
+ * VFIO main module: IOMMU group framework
+ */
+
+#include <linux/cdev.h>
+#include <linux/compat.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/iommu.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+#include "vfio_private.h"
+
+#define DRIVER_VERSION	"0.2"
+#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
+#define DRIVER_DESC	"VFIO - User Level meta-driver"
+
+#define MAX_PATH	256
+
+static int allow_unsafe_intrs;
+module_param(allow_unsafe_intrs, int, 0);
+MODULE_PARM_DESC(allow_unsafe_intrs,
+        "Allow use of IOMMUs which do not support interrupt remapping");
+
+static struct vfio vfio;
+static const struct file_operations vfio_group_fops;
+
+/*
+ * Discard the container's cached read buffer.  vfio_group_read() rebuilds
+ * it when it finds read_buf NULL.
+ */
+static inline void vfio_container_reset_read(struct vfio_container *vcontainer)
+{
+	kfree(vcontainer->read_buf);
+	vcontainer->read_buf = NULL;
+}
+
+/*
+ * Start tracking @dev under VFIO.
+ *
+ * @dev:  device being added
+ * @data: struct vfio_device_ops supplied by the bus driver
+ *
+ * Finds the vfio_group matching the device's IOMMU group, creating the
+ * group (minor number, character device node, list entry) when this is
+ * its first device, then asks the bus driver to allocate a vfio_device
+ * via ops->new() and links it into the group.  A device that is already
+ * tracked is left untouched.
+ *
+ * Returns 0 on success or when the device has no IOMMU group, otherwise a
+ * negative errno.  A group created by this call is torn down again if any
+ * later step fails.
+ */
+int vfio_group_add_dev(struct device *dev, void *data)
+{
+	struct vfio_device_ops *ops = data;
+	struct list_head *pos;
+	struct vfio_group *vgroup = NULL;
+	struct vfio_device *vdev = NULL;
+	unsigned int group;
+	int ret = 0, new_group = 0;
+
+	/* Devices without an IOMMU group are silently ignored. */
+	if (iommu_device_group(dev, &group))
+		return 0;
+
+	mutex_lock(&vfio.group_lock);
+
+	list_for_each(pos, &vfio.group_list) {
+		vgroup = list_entry(pos, struct vfio_group, next);
+		if (vgroup->group == group)
+			break;
+		vgroup = NULL;
+	}
+
+	if (!vgroup) {
+		struct device *gdev;
+		int id;
+
+		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		vgroup = kzalloc(sizeof(*vgroup), GFP_KERNEL);
+		if (!vgroup) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		vgroup->group = group;
+		INIT_LIST_HEAD(&vgroup->device_list);
+
+		/* On failure @id is not valid, so bail out before using it. */
+		ret = idr_get_new(&vfio.idr, vgroup, &id);
+		if (ret) {
+			kfree(vgroup);
+			goto out;
+		}
+		if (id > MINORMASK) {
+			idr_remove(&vfio.idr, id);
+			kfree(vgroup);
+			ret = -ENOSPC;
+			goto out;
+		}
+
+		vgroup->devt = MKDEV(MAJOR(vfio.devt), id);
+
+		/* Only publish the group once its device node exists. */
+		gdev = device_create(vfio.class, NULL, vgroup->devt,
+				     vgroup, "%u", group);
+		if (IS_ERR(gdev)) {
+			idr_remove(&vfio.idr, id);
+			kfree(vgroup);
+			ret = PTR_ERR(gdev);
+			goto out;
+		}
+		list_add(&vgroup->next, &vfio.group_list);
+
+		new_group = 1;
+	} else {
+		list_for_each(pos, &vgroup->device_list) {
+			vdev = list_entry(pos, struct vfio_device, next);
+			if (vdev->dev == dev)
+				break;
+			vdev = NULL;
+		}
+	}
+
+	if (!vdev) {
+		/* Adding a device for a group that's already in use? */
+		/* Maybe we should attach to the domain so others can't */
+		BUG_ON(vgroup->container &&
+		       vgroup->container->iommu &&
+		       vgroup->container->iommu->refcnt);
+
+		vdev = ops->new(dev);
+		if (IS_ERR(vdev)) {
+			/* If we just created this vgroup, tear it down */
+			if (new_group) {
+				device_destroy(vfio.class, vgroup->devt);
+				idr_remove(&vfio.idr, MINOR(vgroup->devt));
+				list_del(&vgroup->next);
+				kfree(vgroup);
+			}
+			ret = PTR_ERR(vdev);
+			goto out;
+		}
+		list_add(&vdev->next, &vgroup->device_list);
+		vdev->dev = dev;
+		vdev->ops = ops;
+		vdev->vfio = &vfio;
+	}
+out:
+	mutex_unlock(&vfio.group_lock);
+	return ret;
+}
+
+/*
+ * Stop tracking @dev under VFIO.
+ *
+ * Looks up the device's group and its vfio_device under the global group
+ * lock.  When found, the device is detached from the container's IOMMU
+ * domain (if the group has a container with an open iommu), unlinked from
+ * the group and handed to the bus driver's free() callback.  A group left
+ * with no devices and no open references is destroyed as well.  Devices
+ * with no IOMMU group, or unknown to VFIO, are ignored.
+ */
+void vfio_group_del_dev(struct device *dev)
+{
+	struct list_head *pos;
+	struct vfio_container *vcontainer;
+	struct vfio_group *vgroup = NULL;
+	struct vfio_device *vdev = NULL;
+	unsigned int group;
+
+	if (iommu_device_group(dev, &group))
+		return;
+
+	mutex_lock(&vfio.group_lock);
+
+	list_for_each(pos, &vfio.group_list) {
+		vgroup = list_entry(pos, struct vfio_group, next);
+		if (vgroup->group == group)
+			break;
+		vgroup = NULL;
+	}
+
+	if (!vgroup)
+		goto out;
+
+	vcontainer = vgroup->container;
+
+	list_for_each(pos, &vgroup->device_list) {
+		vdev = list_entry(pos, struct vfio_device, next);
+		if (vdev->dev == dev)
+			break;
+		vdev = NULL;
+	}
+
+	if (!vdev)
+		goto out;
+
+	/* XXX Did a device we're using go away? */
+	BUG_ON(vdev->refcnt);
+
+	if (vcontainer && vcontainer->iommu) {
+		iommu_detach_device(vcontainer->iommu->domain, vdev->dev);
+		/* The cached device listing is stale once a device leaves. */
+		vfio_container_reset_read(vcontainer);
+	}
+
+	list_del(&vdev->next);
+	vdev->ops->free(vdev);
+
+	/* Last device gone and nobody has the group open: drop the group. */
+	if (list_empty(&vgroup->device_list) && vgroup->refcnt == 0) {
+		device_destroy(vfio.class, vgroup->devt);
+		idr_remove(&vfio.idr, MINOR(vgroup->devt));
+		list_del(&vgroup->next);
+		kfree(vgroup);
+	}
+out:
+	mutex_unlock(&vfio.group_lock);
+}
+
+/*
+ * Check whether every device in every group of @vcontainer is bound to a
+ * driver owned by this module.  Returns 1 if so, 0 as soon as a device
+ * with no driver or a foreign driver is found.
+ */
+static int __vfio_group_viable(struct vfio_container *vcontainer)
+{
+	struct vfio_group *grp;
+	struct vfio_device *member;
+
+	list_for_each_entry(grp, &vfio.group_list, next) {
+		if (grp->container != vcontainer)
+			continue;
+
+		list_for_each_entry(member, &grp->device_list, next) {
+			struct device_driver *drv = member->dev->driver;
+
+			if (!drv || drv->owner != THIS_MODULE)
+				return 0;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Tear down the IOMMU context of @vcontainer.
+ *
+ * Detaches every device of every group in the container from the domain,
+ * then frees the domain and the vfio_iommu.  Returns 0 on success or when
+ * there is nothing to close, -EBUSY while the context is still referenced.
+ */
+static int __vfio_close_iommu(struct vfio_container *vcontainer)
+{
+	struct vfio_iommu *ctx = vcontainer->iommu;
+	struct vfio_group *grp;
+	struct vfio_device *member;
+
+	if (!ctx)
+		return 0;
+	if (ctx->refcnt)
+		return -EBUSY;
+
+	list_for_each_entry(grp, &vfio.group_list, next) {
+		if (grp->container != vcontainer)
+			continue;
+
+		list_for_each_entry(member, &grp->device_list, next) {
+			iommu_detach_device(ctx->domain, member->dev);
+			member->iommu = NULL;
+		}
+	}
+
+	vcontainer->iommu = NULL;
+	iommu_domain_free(ctx->domain);
+	kfree(ctx);
+	return 0;
+}
+
+/*
+ * Create the IOMMU context for @vcontainer and attach all of its devices.
+ *
+ * Returns 0 on success; -EBUSY if the container is not viable, -ENOMEM or
+ * -EFAULT if the context or domain cannot be allocated, the error from
+ * iommu_attach_device() if an attach fails, and -EFAULT if the domain
+ * lacks interrupt remapping and allow_unsafe_intrs is not set.  On any
+ * failure after the domain is allocated, the context is closed again.
+ */
+static int __vfio_open_iommu(struct vfio_container *vcontainer)
+{
+	struct list_head *gpos, *dpos;
+	struct vfio_iommu *viommu;
+	struct vfio_group *vgroup;
+	struct vfio_device *vdev;
+
+	if (!__vfio_group_viable(vcontainer))
+		return -EBUSY;
+
+	viommu = kzalloc(sizeof(*viommu), GFP_KERNEL);
+	if (!viommu)
+		return -ENOMEM;
+
+	viommu->domain = iommu_domain_alloc();
+	if (!viommu->domain) {
+		kfree(viommu);
+		return -EFAULT;
+	}
+
+	viommu->vfio = &vfio;
+	/* Set before attaching so __vfio_close_iommu() can undo on error. */
+	vcontainer->iommu = viommu;
+
+	list_for_each(gpos, &vfio.group_list) {
+		vgroup = list_entry(gpos, struct vfio_group, next);
+		if (vgroup->container != vcontainer)
+			continue;
+
+		list_for_each(dpos, &vgroup->device_list) {
+			int ret;
+
+			vdev = list_entry(dpos, struct vfio_device, next);
+
+			ret = iommu_attach_device(viommu->domain, vdev->dev);
+			if (ret) {
+				__vfio_close_iommu(vcontainer);
+				return ret;
+			}
+			vdev->iommu = viommu;
+		}
+	}
+
+	if (!allow_unsafe_intrs &&
+	    !iommu_domain_has_cap(viommu->domain, IOMMU_CAP_INTR_REMAP)) {
+		__vfio_close_iommu(vcontainer);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+/*
+ * Merge the group behind file descriptor @fd into @vgroup's container.
+ *
+ * @fd must refer to a different VFIO group file opened by the same mm
+ * whose iommu context has no references.  Both containers get an iommu
+ * context opened if they lack one so their cache-coherency capability can
+ * be compared; the second group's context is then closed and its devices
+ * are attached to @vgroup's domain.  On success the second group's old
+ * container is freed and it shares @vgroup's container.
+ *
+ * Returns 0 on success, -EBADF for a bad descriptor, -EINVAL for an
+ * unsuitable group, or the error from opening/closing/attaching.
+ */
+static int vfio_group_merge(struct vfio_group *vgroup, int fd)
+{
+	struct vfio_group *vgroup2;
+	struct iommu_domain *domain;
+	struct list_head *pos;
+	struct file *file;
+	int ret = 0;
+
+	mutex_lock(&vfio.group_lock);
+
+	file = fget(fd);
+	if (!file) {
+		ret = -EBADF;
+		goto out_noput;
+	}
+	/* Only accept descriptors that are VFIO group files. */
+	if (file->f_op != &vfio_group_fops) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	vgroup2 = file->private_data;
+	if (!vgroup2 || vgroup2 == vgroup || vgroup2->mm != vgroup->mm ||
+	    (vgroup2->container->iommu && vgroup2->container->iommu->refcnt)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!vgroup->container->iommu) {
+		ret = __vfio_open_iommu(vgroup->container);
+		if (ret)
+			goto out;
+	}
+
+	if (!vgroup2->container->iommu) {
+		ret = __vfio_open_iommu(vgroup2->container);
+		if (ret)
+			goto out;
+	}
+
+	if (iommu_domain_has_cap(vgroup->container->iommu->domain,
+				 IOMMU_CAP_CACHE_COHERENCY) !=
+	    iommu_domain_has_cap(vgroup2->container->iommu->domain,
+				 IOMMU_CAP_CACHE_COHERENCY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = __vfio_close_iommu(vgroup2->container);
+	if (ret)
+		goto out;
+
+	domain = vgroup->container->iommu->domain;
+
+	list_for_each(pos, &vgroup2->device_list) {
+		struct vfio_device *vdev;
+
+		vdev = list_entry(pos, struct vfio_device, next);
+
+		ret = iommu_attach_device(domain, vdev->dev);
+		if (ret) {
+			/* Undo the attaches done so far, then give up. */
+			list_for_each(pos, &vgroup2->device_list) {
+				struct vfio_device *vdev2;
+
+				vdev2 = list_entry(pos,
+						   struct vfio_device, next);
+				if (vdev2 == vdev)
+					break;
+
+				iommu_detach_device(domain, vdev2->dev);
+				vdev2->iommu = NULL;
+			}
+			goto out;
+		}
+		vdev->iommu = vgroup->container->iommu;
+	}
+
+	/* NOTE(review): frees the old container unconditionally - confirm it
+	 * cannot still be shared with a third, previously merged group. */
+	kfree(vgroup2->container->read_buf);
+	kfree(vgroup2->container);
+
+	vgroup2->container = vgroup->container;
+	vgroup->container->refcnt++;
+	vfio_container_reset_read(vgroup->container);
+
+out:
+	fput(file);
+out_noput:
+	mutex_unlock(&vfio.group_lock);
+	return ret;
+}
+
+/*
+ * Split the group behind file descriptor @fd out of @vgroup's container.
+ *
+ * @fd must refer to a different VFIO group file that currently shares
+ * @vgroup's container, and none of its devices may be in use.  Its
+ * devices are detached from the shared domain and the group is given a
+ * fresh, empty container allocated up front.
+ *
+ * Returns 0 on success, -ENOMEM, -EBADF, -EINVAL or -EBUSY on failure;
+ * the pre-allocated container is freed on every error path.
+ */
+static int vfio_group_unmerge(struct vfio_group *vgroup, int fd)
+{
+	struct vfio_group *vgroup2;
+	struct vfio_container *vcontainer2;
+	struct vfio_device *vdev;
+	struct list_head *pos;
+	struct file *file;
+	int ret = 0;
+
+	/* Allocate before taking the lock. */
+	vcontainer2 = kzalloc(sizeof(*vcontainer2), GFP_KERNEL);
+	if (!vcontainer2)
+		return -ENOMEM;
+
+	mutex_lock(&vfio.group_lock);
+
+	file = fget(fd);
+	if (!file) {
+		ret = -EBADF;
+		goto out_noput;
+	}
+	if (file->f_op != &vfio_group_fops) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	vgroup2 = file->private_data;
+	if (!vgroup2 || vgroup2 == vgroup ||
+	    vgroup2->container != vgroup->container) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* First pass: refuse if any device of the group is still open. */
+	list_for_each(pos, &vgroup2->device_list) {
+		vdev = list_entry(pos, struct vfio_device, next);
+		if (vdev->refcnt) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
+	/* Second pass: detach everything from the shared domain. */
+	list_for_each(pos, &vgroup2->device_list) {
+		vdev = list_entry(pos, struct vfio_device, next);
+		iommu_detach_device(vgroup->container->iommu->domain,
+				    vdev->dev);
+		vdev->iommu = NULL;
+	}
+
+	vgroup2->container = vcontainer2;
+	vcontainer2->refcnt++;
+	vgroup->container->refcnt--;
+	vfio_container_reset_read(vgroup->container);
+out:
+	fput(file);
+out_noput:
+	if (ret)
+		kfree(vcontainer2);
+	mutex_unlock(&vfio.group_lock);
+	return ret;
+}
+
+/*
+ * Hand out a file descriptor for the iommu context of @vgroup's container.
+ *
+ * Opens the iommu context if the container has none, creates the
+ * anonymous "vfio-iommu" file on first use and caches it in viommu->file,
+ * then installs it in a new descriptor and bumps the iommu refcnt.
+ *
+ * Returns the new descriptor, or a negative errno.
+ */
+static int vfio_group_get_iommu_fd(struct vfio_group *vgroup)
+{
+	int ret = 0;
+	struct vfio_iommu *viommu;
+
+	mutex_lock(&vfio.group_lock);
+
+	if (!vgroup->container->iommu) {
+		ret = __vfio_open_iommu(vgroup->container);
+		if (ret)
+			goto out;
+	}
+
+	viommu = vgroup->container->iommu;
+
+	if (!viommu->file) {
+		viommu->file = anon_inode_getfile("vfio-iommu",
+						  &vfio_iommu_fops,
+						  viommu, O_RDWR);
+		if (IS_ERR(viommu->file)) {
+			ret = PTR_ERR(viommu->file);
+			viommu->file = NULL;
+			goto out;
+		}
+	}
+	ret = get_unused_fd();
+	if (ret < 0)
+		goto out;
+
+	/* NOTE(review): the cached file is installed on every call without
+	 * an extra file reference being taken here - confirm the struct file
+	 * refcounting is as intended when this is called more than once. */
+	fd_install(ret, viommu->file);
+
+	vgroup->container->iommu->refcnt++;
+out:
+	mutex_unlock(&vfio.group_lock);
+	return ret;
+}
+
+/*
+ * Hand out a file descriptor for the device named @buf.
+ *
+ * @vgroup: group the request was issued on
+ * @buf:    NUL-terminated device name, compared against dev_name()
+ *
+ * Opens the container's iommu context if needed, then searches every
+ * group sharing @vgroup's container for a device with a matching name.
+ * The anonymous "vfio-device" file is created on first use and cached in
+ * vdev->file; on success the device and iommu refcnts are bumped.
+ *
+ * Returns the new descriptor, -ENODEV if no device matches, or another
+ * negative errno on failure.
+ */
+static int vfio_group_get_device_fd(struct vfio_group *vgroup, char *buf)
+{
+	struct vfio_container *vcontainer = vgroup->container;
+	struct list_head *gpos, *dpos;
+	int ret;
+
+	mutex_lock(&vfio.group_lock);
+
+	if (!vcontainer->iommu) {
+		ret = __vfio_open_iommu(vcontainer);
+		if (ret)
+			goto out;
+	}
+
+	/*
+	 * Set the "not found" result only now: a successful open above
+	 * leaves ret at 0, which would otherwise be returned as if it were
+	 * descriptor 0 when no device matches.
+	 */
+	ret = -ENODEV;
+
+	list_for_each(gpos, &vfio.group_list) {
+		vgroup = list_entry(gpos, struct vfio_group, next);
+		if (vgroup->container != vcontainer)
+			continue;
+
+		list_for_each(dpos, &vgroup->device_list) {
+			struct vfio_device *vdev;
+			char buf2[MAX_PATH];
+
+			vdev = list_entry(dpos, struct vfio_device, next);
+
+			snprintf(buf2, MAX_PATH, "%s", dev_name(vdev->dev));
+
+			if (strncmp(buf, buf2, MAX_PATH))
+				continue;
+
+			if (!vdev->file) {
+				vdev->file = anon_inode_getfile("vfio-device",
+							&vfio_device_fops,
+							vdev, O_RDWR);
+				if (IS_ERR(vdev->file)) {
+					ret = PTR_ERR(vdev->file);
+					vdev->file = NULL;
+					goto out;
+				}
+			}
+			ret = get_unused_fd();
+			if (ret < 0)
+				goto out;
+
+			fd_install(ret, vdev->file);
+
+			vdev->refcnt++;
+			vcontainer->iommu->refcnt++;
+			goto out;
+		}
+	}
+out:
+	mutex_unlock(&vfio.group_lock);
+	return ret;
+}
+
+/*
+ * ioctl entry point for a VFIO group file.
+ *
+ * Only the mm that opened the group may issue ioctls (-EIO otherwise).
+ * MERGE/UNMERGE read an int file descriptor from user space;
+ * GET_IOMMU_FD takes no argument; GET_DEVICE_FD reads a device name of at
+ * most MAX_PATH bytes.  Unknown commands return -ENOSYS.
+ */
+static long vfio_group_unl_ioctl(struct file *filep,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct vfio_group *vgroup = filep->private_data;
+
+	if (vgroup->mm != current->mm)
+		return -EIO;
+
+	if (cmd == VFIO_GROUP_MERGE || cmd == VFIO_GROUP_UNMERGE) {
+		int fd;
+
+		if (get_user(fd, (int __user *)arg))
+			return -EFAULT;
+		if (fd < 0)
+			return -EINVAL;
+
+		if (cmd == VFIO_GROUP_MERGE)
+			return vfio_group_merge(vgroup, fd);
+		return vfio_group_unmerge(vgroup, fd);
+	}
+
+	if (cmd == VFIO_GROUP_GET_IOMMU_FD)
+		return vfio_group_get_iommu_fd(vgroup);
+
+	if (cmd == VFIO_GROUP_GET_DEVICE_FD) {
+		char *name;
+		int ret;
+
+		name = strndup_user((const char __user *)arg, MAX_PATH);
+		if (IS_ERR(name))
+			return PTR_ERR(name);
+
+		ret = vfio_group_get_device_fd(vgroup, name);
+		kfree(name);
+		return ret;
+	}
+
+	return -ENOSYS;
+}
+
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat ioctl entry point: converts the argument with compat_ptr() and
+ * reuses the native handler.
+ */
+static long vfio_group_compat_ioctl(struct file *filep,
+				    unsigned int cmd, unsigned long arg)
+{
+	unsigned long native_arg = (unsigned long)compat_ptr(arg);
+
+	return vfio_group_unl_ioctl(filep, cmd, native_arg);
+}
+#endif	/* CONFIG_COMPAT */
+
+/*
+ * open() handler for a group character device.
+ *
+ * Looks the group up by minor number.  The first opener gets a fresh
+ * container allocated and becomes the owning mm; later opens succeed only
+ * from that same mm.  Each successful open takes a reference on the group
+ * and on its container.
+ *
+ * Returns 0 on success, -ENODEV if the minor has no group, -ENOMEM if the
+ * container cannot be allocated, -EBUSY if another mm owns the group.
+ */
+static int vfio_group_open(struct inode *inode, struct file *filep)
+{
+	struct vfio_group *vgroup;
+	int ret = 0;
+
+	mutex_lock(&vfio.group_lock);
+
+	vgroup = idr_find(&vfio.idr, iminor(inode));
+
+	if (!vgroup) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (!vgroup->refcnt) {
+		struct vfio_container *vcontainer;
+		vcontainer = kzalloc(sizeof(*vcontainer), GFP_KERNEL);
+		if (!vcontainer) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		vgroup->container = vcontainer;
+		vgroup->mm = current->mm;
+	} else if (current->mm != vgroup->mm) {
+		ret = -EBUSY;
+		goto out;
+	}
+	filep->private_data = vgroup;
+	vgroup->refcnt++;
+	vgroup->container->refcnt++;
+out:
+	mutex_unlock(&vfio.group_lock);
+
+	return ret;
+}
+
+/*
+ * release() handler for a group character device.
+ *
+ * A non-final close only drops the group and container references.  The
+ * final close bails out if any device of the group is still referenced.
+ * Otherwise it either detaches the group's devices from a shared (merged)
+ * container, or closes the iommu context and frees a container it owns
+ * alone.  Finally the group is reset, and destroyed if it has no devices
+ * left.
+ *
+ * Note that ret is assigned on the busy/error paths but the function
+ * always returns 0.
+ */
+static int vfio_group_release(struct inode *inode, struct file *filep)
+{
+	struct vfio_group *vgroup = filep->private_data;
+	struct vfio_container *vcontainer = vgroup->container;
+	struct list_head *pos;
+	int ret = 0;
+
+	mutex_lock(&vfio.group_lock);
+
+	if (vgroup->refcnt > 1) {
+		vgroup->refcnt--;
+		vcontainer->refcnt--;
+		goto out;
+	}
+
+	list_for_each(pos, &vgroup->device_list) {
+		struct vfio_device *vdev;
+		vdev = list_entry(pos, struct vfio_device, next);
+		if (vdev->refcnt) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
+	/* Merged group? */
+	if (vcontainer->refcnt > 1) {
+		if (vcontainer->iommu) {
+			list_for_each(pos, &vgroup->device_list) {
+				struct vfio_device *vdev;
+				vdev = list_entry(pos,
+						  struct vfio_device, next);
+				iommu_detach_device(vcontainer->iommu->domain,
+						    vdev->dev);
+				vdev->iommu = NULL;
+			}
+		}
+		vcontainer->refcnt--;
+		vfio_container_reset_read(vcontainer);
+	} else {
+		if (vcontainer->iommu && vcontainer->iommu->refcnt) {
+			ret = -EBUSY;
+			goto out;
+		}
+
+		ret = __vfio_close_iommu(vcontainer);
+		if (ret)
+			goto out;
+
+		kfree(vcontainer->read_buf);
+		kfree(vcontainer);
+	}
+
+	vgroup->refcnt--;
+	vgroup->mm = NULL;
+	vgroup->container = NULL;
+
+	/* Possible we had the group open while device members were removed */
+	if (list_empty(&vgroup->device_list)) {
+		device_destroy(vfio.class, vgroup->devt);
+		idr_remove(&vfio.idr, MINOR(vgroup->devt));
+		list_del(&vgroup->next);
+		kfree(vgroup);
+	}
+out:
+	mutex_unlock(&vfio.group_lock);
+	return 0;
+}
+
+/*
+ * Build the text returned by read() on a group file and store it in
+ * vcontainer->read_buf.
+ *
+ * The text holds one "group: N" line per group sharing @vcontainer, each
+ * followed by one "device: NAME" line per device in that group.  The
+ * buffer grows by MAX_PATH after every line so the next line always has
+ * MAX_PATH zeroed bytes available, and is trimmed to its final size at
+ * the end.
+ *
+ * Returns 0 on success or -ENOMEM; nothing is leaked and read_buf is left
+ * untouched on failure.
+ */
+static int __vfio_container_create_read_buf(struct vfio_container *vcontainer)
+{
+	struct list_head *gpos, *dpos;
+	struct vfio_group *vgroup;
+	struct vfio_device *vdev;
+	int off = 0;
+	char *buf, *tmp;
+
+	buf = kzalloc(MAX_PATH, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	list_for_each(gpos, &vfio.group_list) {
+		vgroup = list_entry(gpos, struct vfio_group, next);
+		if (vgroup->container != vcontainer)
+			continue;
+
+		/*
+		 * scnprintf() returns what was actually written, so off can
+		 * never run past the space that has been allocated.
+		 */
+		off += scnprintf(buf + off, MAX_PATH,
+				 "group: %u\n", vgroup->group);
+		/* Keep the old pointer so it can be freed if krealloc fails. */
+		tmp = krealloc(buf, off + MAX_PATH, GFP_KERNEL);
+		if (!tmp)
+			goto err_free;
+		buf = tmp;
+		memset(buf + off, 0, MAX_PATH);
+
+		list_for_each(dpos, &vgroup->device_list) {
+			vdev = list_entry(dpos, struct vfio_device, next);
+
+			off += scnprintf(buf + off, MAX_PATH,
+					 "device: %s\n", dev_name(vdev->dev));
+			tmp = krealloc(buf, off + MAX_PATH, GFP_KERNEL);
+			if (!tmp)
+				goto err_free;
+			buf = tmp;
+			memset(buf + off, 0, MAX_PATH);
+		}
+	}
+
+	/* Trim to the text plus its terminating NUL. */
+	tmp = krealloc(buf, off + 1, GFP_KERNEL);
+	if (!tmp)
+		goto err_free;
+
+	vcontainer->read_buf = tmp;
+	return 0;
+
+err_free:
+	kfree(buf);
+	return -ENOMEM;
+}
+
+/*
+ * read() handler for a group file: returns the container's group/device
+ * listing, including its terminating NUL, building it first if no cached
+ * copy exists.
+ *
+ * Returns the number of bytes copied, 0 at end of data, -EINVAL if the
+ * group has no container, -EFAULT on a bad user buffer, or the error from
+ * building the listing.
+ */
+static ssize_t vfio_group_read(struct file *filep, char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct vfio_group *vgroup = filep->private_data;
+	struct vfio_container *vcontainer;
+	ssize_t ret = 0;
+	size_t avail;
+
+	mutex_lock(&vfio.group_lock);
+
+	vcontainer = vgroup->container;
+	if (!vcontainer) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!vcontainer->read_buf) {
+		ret = __vfio_container_create_read_buf(vcontainer);
+		if (ret)
+			goto out;
+	}
+
+	/* Readable size: the text plus its terminating NUL. */
+	avail = strlen(vcontainer->read_buf) + 1;
+
+	/* At or past the end: report EOF (ret is still 0 here). */
+	if (*ppos >= avail)
+		goto out;
+
+	if (*ppos + count > avail)
+		count = avail - *ppos;
+
+	if (copy_to_user(buf, vcontainer->read_buf + *ppos, count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = count;
+	*ppos += count;
+out:
+	mutex_unlock(&vfio.group_lock);
+	return ret;
+}
+
+/*
+ * File operations for the per-group character devices.  Also used by
+ * merge/unmerge to check that a passed-in descriptor is a group file.
+ */
+static const struct file_operations vfio_group_fops = {
+	.owner		= THIS_MODULE,
+	.open		= vfio_group_open,
+	.release	= vfio_group_release,
+	.read		= vfio_group_read,
+	.unlocked_ioctl	= vfio_group_unl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= vfio_group_compat_ioctl,
+#endif
+};
+
+/* kref release callback for vfio.kref: destroys the vfio device class. */
+static void vfio_class_release(struct kref *kref)
+{
+	class_destroy(vfio.class);
+	vfio.class = NULL;
+}
+
+/*
+ * Class devnode callback: names each device node "vfio/<device name>".
+ * The string is allocated with kasprintf(); @mode is left untouched.
+ */
+static char *vfio_devnode(struct device *dev, mode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
+}
+
+/*
+ * Module init: sets up the global state (idr, lock, group list, kref),
+ * creates the "vfio" class, reserves a character device region and
+ * registers the group cdev.  Each failure unwinds the steps completed
+ * before it and returns the error.
+ */
+static int __init vfio_init(void)
+{
+	int ret;
+
+	idr_init(&vfio.idr);
+	mutex_init(&vfio.group_lock);
+	INIT_LIST_HEAD(&vfio.group_list);
+
+	kref_init(&vfio.kref);
+	vfio.class = class_create(THIS_MODULE, "vfio");
+	if (IS_ERR(vfio.class)) {
+		ret = PTR_ERR(vfio.class);
+		goto err_class;
+	}
+
+	vfio.class->devnode = vfio_devnode;
+
+	/* FIXME - how many minors to allocate... all of them! */
+	ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
+	if (ret)
+		goto err_chrdev;
+
+	cdev_init(&vfio.cdev, &vfio_group_fops);
+	ret = cdev_add(&vfio.cdev, vfio.devt, MINORMASK);
+	if (ret)
+		goto err_cdev;
+
+	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
+
+	return 0;
+
+err_cdev:
+	unregister_chrdev_region(vfio.devt, MINORMASK);
+err_chrdev:
+	/* Dropping the initial kref destroys the class. */
+	kref_put(&vfio.kref, vfio_class_release);
+err_class:
+	return ret;
+}
+
+/*
+ * Module exit: removes every tracked device (which also destroys groups
+ * that become empty and unreferenced), then releases the idr, the cdev,
+ * the character device region and the class.
+ */
+static void __exit vfio_cleanup(void)
+{
+	struct list_head *gpos, *gppos;
+
+	list_for_each_safe(gpos, gppos, &vfio.group_list) {
+		struct vfio_group *vgroup;
+		struct list_head *dpos, *dppos;
+
+		vgroup = list_entry(gpos, struct vfio_group, next);
+
+		list_for_each_safe(dpos, dppos, &vgroup->device_list) {
+			struct vfio_device *vdev;
+			int last;
+
+			vdev = list_entry(dpos, struct vfio_device, next);
+
+			/*
+			 * Removing the last device may free vgroup, and the
+			 * list head lives inside it.  Note whether this is
+			 * the last entry first, then leave the loop without
+			 * touching the head again.
+			 */
+			last = list_is_last(dpos, &vgroup->device_list);
+			vfio_group_del_dev(vdev->dev);
+			if (last)
+				break;
+		}
+	}
+
+	idr_destroy(&vfio.idr);
+	cdev_del(&vfio.cdev);
+	unregister_chrdev_region(vfio.devt, MINORMASK);
+	kref_put(&vfio.kref, vfio_class_release);
+}
+
+module_init(vfio_init);
+module_exit(vfio_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/vfio_private.h b/drivers/vfio/vfio_private.h
new file mode 100644
index 0000000..2cc300c
--- /dev/null
+++ b/drivers/vfio/vfio_private.h
@@ -0,0 +1,82 @@ 
+/*
+ * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/iommu.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+
+#ifndef VFIO_PRIVATE_H
+#define VFIO_PRIVATE_H
+
+extern const struct file_operations vfio_iommu_fops;
+extern const struct file_operations vfio_device_fops;
+
+/* Global VFIO state; a single instance lives in vfio_main.c. */
+struct vfio {
+	dev_t			devt;		/* base of the chrdev region */
+	struct cdev		cdev;		/* cdev serving all group minors */
+	struct list_head	group_list;	/* all vfio_group, via ->next */
+	struct mutex		group_lock;	/* taken around group/device list and refcnt updates */
+	struct kref		kref;		/* final put destroys the class */
+	struct class		*class;		/* "vfio" device class */
+	struct idr		idr;		/* minor number -> vfio_group */
+};
+
+/* Callbacks a bus driver passes to vfio_group_add_dev(). */
+struct vfio_device_ops {
+	/* Allocate the vfio_device for a device; checked with IS_ERR(). */
+	struct vfio_device	*(* new)(struct device *);
+	/* Release a vfio_device once it is unlinked from its group. */
+	void			(* free)(struct vfio_device *);
+	/* read/write/mmap/unlocked_ioctl are forwarded to from device files. */
+	struct file_operations	fops;
+};
+
+/* IOMMU context of a container. */
+struct vfio_iommu {
+	struct iommu_domain	*domain;	/* domain the devices attach to */
+	struct vfio		*vfio;		/* back pointer to global state */
+	int			refcnt;		/* bumped per iommu/device fd handed out */
+	struct file		*file;		/* cached "vfio-iommu" anon file */
+};
+
+/* One device tracked by VFIO; allocated by the bus driver's new(). */
+struct vfio_device {
+	struct device		*dev;		/* underlying device */
+	struct list_head	next;		/* entry in vfio_group.device_list */
+	struct file		*file;		/* cached "vfio-device" anon file */
+	struct vfio_device_ops	*ops;		/* bus driver callbacks */
+	struct vfio		*vfio;		/* back pointer to global state */
+	struct vfio_iommu	*iommu;		/* context attached to, or NULL */
+	int			refcnt;		/* bumped per device fd handed out */
+};
+
+/* State shared by one group, or by several merged groups. */
+struct vfio_container {
+	struct vfio_iommu	*iommu;		/* NULL until an iommu is opened */
+	char			*read_buf;	/* cached group/device listing, or NULL */
+	int			refcnt;		/* group opens plus merged groups */
+};
+
+/* One IOMMU group exposed as a character device. */
+struct vfio_group {
+	dev_t			devt;		/* device number of the group node */
+	unsigned int		group;		/* id from iommu_device_group() */
+	int			refcnt;		/* number of opens of the group file */
+	struct mm_struct	*mm;		/* mm of the first opener; others get -EBUSY */
+	struct vfio_container	*container;	/* set while the group is open */
+	struct list_head	device_list;	/* vfio_device entries, via ->next */
+	struct list_head	next;		/* entry in vfio.group_list */
+};
+
+extern int vfio_group_add_dev(struct device *dev, void *data);
+extern void vfio_group_del_dev(struct device *dev);
+
+#endif /* VFIO_PRIVATE_H */