
[RFC] vfio: VFIO Driver core framework

Message ID 20111103195452.21259.93021.stgit@bling.home
State New

Commit Message

Alex Williamson Nov. 3, 2011, 8:12 p.m. UTC
VFIO provides a secure, IOMMU based interface for user space
drivers, including device assignment to virtual machines.
This provides the base management of IOMMU groups, devices,
and IOMMU objects.  See Documentation/vfio.txt included in
this patch for user and kernel API description.

Note, this implements the new API discussed at KVM Forum
2011, as represented by the driver version 0.2.  It's hoped
that this provides a modular enough interface to support PCI
and non-PCI userspace drivers across various architectures
and IOMMU implementations.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

Fingers crossed, this is the last RFC for VFIO, but we need
the iommu group support before this can go upstream
(http://lkml.indiana.edu/hypermail/linux/kernel/1110.2/02303.html),
hoping this helps push that along.

Since the last posting, this version completely modularizes
the device backends and better defines the APIs between the
core VFIO code and the device backends.  I expect that we
might also adopt a modular IOMMU interface as iommu_ops learns
about different types of hardware.  Also many, many cleanups.
Check the complete git history for details:

git://github.com/awilliam/linux-vfio.git vfio-ng

(matching qemu tree: git://github.com/awilliam/qemu-vfio.git)

This version, along with the supporting VFIO PCI backend can
be found here:

git://github.com/awilliam/linux-vfio.git vfio-next-20111103

I've held off on implementing a kernel->user signaling
mechanism for now since the previous netlink version produced
too many gag reflexes.  It's easy enough to set a bit in the
group flags to indicate such support in the future, so I
think we can move ahead without it.

Appreciate any feedback or suggestions.  Thanks,

Alex

 Documentation/ioctl/ioctl-number.txt |    1 
 Documentation/vfio.txt               |  304 +++++++++
 MAINTAINERS                          |    8 
 drivers/Kconfig                      |    2 
 drivers/Makefile                     |    1 
 drivers/vfio/Kconfig                 |    8 
 drivers/vfio/Makefile                |    3 
 drivers/vfio/vfio_iommu.c            |  530 ++++++++++++++++
 drivers/vfio/vfio_main.c             | 1151 ++++++++++++++++++++++++++++++++++
 drivers/vfio/vfio_private.h          |   34 +
 include/linux/vfio.h                 |  155 +++++
 11 files changed, 2197 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/vfio.txt
 create mode 100644 drivers/vfio/Kconfig
 create mode 100644 drivers/vfio/Makefile
 create mode 100644 drivers/vfio/vfio_iommu.c
 create mode 100644 drivers/vfio/vfio_main.c
 create mode 100644 drivers/vfio/vfio_private.h
 create mode 100644 include/linux/vfio.h

Comments

Aaron Fabbri Nov. 9, 2011, 4:17 a.m. UTC | #1
I'm going to send out chunks of comments as I go over this stuff.  Below
I've covered the documentation file and vfio_iommu.c.  More comments coming
soon...

On 11/3/11 1:12 PM, "Alex Williamson" <alex.williamson@redhat.com> wrote:

> VFIO provides a secure, IOMMU based interface for user space
> drivers, including device assignment to virtual machines.
> This provides the base management of IOMMU groups, devices,
> and IOMMU objects.  See Documentation/vfio.txt included in
> this patch for user and kernel API description.
> 
> Note, this implements the new API discussed at KVM Forum
> 2011, as represented by the drvier version 0.2.  It's hoped
> that this provides a modular enough interface to support PCI
> and non-PCI userspace drivers across various architectures
> and IOMMU implementations.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
<snip>
> +
> +Groups, Devices, IOMMUs, oh my
> +-----------------------------------------------------------------------------
> --
> +
> +A fundamental component of VFIO is the notion of IOMMU groups.  IOMMUs
> +can't always distinguish transactions from each individual device in
> +the system.  Sometimes this is because of the IOMMU design, such as with
> +PEs, other times it's caused by the I/O topology, for instance a

Can you define this acronym the first time you use it, i.e.

+ PEs (partitionable endpoints), ...

> +PCIe-to-PCI bridge masking all devices behind it.  We call the sets of
> +devices created by these restictions IOMMU groups (or just "groups" for

restrictions

> +this document).
> +
> +The IOMMU cannot distiguish transactions between the individual devices

distinguish

> +within the group, therefore the group is the basic unit of ownership for
> +a userspace process.  Because of this, groups are also the primary
> +interface to both devices and IOMMU domains in VFIO.
> +
<snip>
> +file descriptor referencing the same internal IOMMU object from either
> +X or Y).  Merged groups can be dissolved either explictly with UNMERGE

explicitly

<snip>
> +
> +Device tree devices also invlude ioctls for further defining the

include

<snip>
> diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
> new file mode 100644
> index 0000000..029dae3
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu.c
<snip>
> +static struct dma_map_page *vfio_find_dma(struct vfio_iommu *iommu,
> +                      dma_addr_t start, size_t size)
> +{
> +    struct list_head *pos;
> +    struct dma_map_page *mlp;
> +
> +    list_for_each(pos, &iommu->dm_list) {
> +        mlp = list_entry(pos, struct dma_map_page, list);
> +        if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> +                   start, size))
> +            return mlp;
> +    }
> +    return NULL;
> +}
> +

This function below should be static.

> +int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
> +                size_t size, struct dma_map_page *mlp)
> +{
> +    struct dma_map_page *split;
> +    int npage_lo, npage_hi;
> +
> +    /* Existing dma region is completely covered, unmap all */
> +    if (start <= mlp->daddr &&
> +        start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> +        vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> +        list_del(&mlp->list);
> +        npage_lo = mlp->npage;
> +        kfree(mlp);
> +        return npage_lo;
> +    }
> +
> +    /* Overlap low address of existing range */
> +    if (start <= mlp->daddr) {
> +        size_t overlap;
> +
> +        overlap = start + size - mlp->daddr;
> +        npage_lo = overlap >> PAGE_SHIFT;
> +        npage_hi = mlp->npage - npage_lo;

npage_hi not used.. Delete this line ^

> +
> +        vfio_dma_unmap(iommu, mlp->daddr, npage_lo, mlp->rdwr);
> +        mlp->daddr += overlap;
> +        mlp->vaddr += overlap;
> +        mlp->npage -= npage_lo;
> +        return npage_lo;
> +    }
> +
> +    /* Overlap high address of existing range */
> +    if (start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> +        size_t overlap;
> +
> +        overlap = mlp->daddr + NPAGE_TO_SIZE(mlp->npage) - start;
> +        npage_hi = overlap >> PAGE_SHIFT;
> +        npage_lo = mlp->npage - npage_hi;
> +
> +        vfio_dma_unmap(iommu, start, npage_hi, mlp->rdwr);
> +        mlp->npage -= npage_hi;
> +        return npage_hi;
> +    }
> +
> +    /* Split existing */
> +    npage_lo = (start - mlp->daddr) >> PAGE_SHIFT;
> +    npage_hi = mlp->npage - (size >> PAGE_SHIFT) - npage_lo;
> +
> +    split = kzalloc(sizeof *split, GFP_KERNEL);
> +    if (!split)
> +        return -ENOMEM;
> +
> +    vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, mlp->rdwr);
> +
> +    mlp->npage = npage_lo;
> +
> +    split->npage = npage_hi;
> +    split->daddr = start + size;
> +    split->vaddr = mlp->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
> +    split->rdwr = mlp->rdwr;
> +    list_add(&split->list, &iommu->dm_list);
> +    return size >> PAGE_SHIFT;
> +}
> +

Function should be static.

> +int vfio_dma_unmap_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> +{
> +    int ret = 0;
> +    size_t npage = dmp->size >> PAGE_SHIFT;
> +    struct list_head *pos, *n;
> +
> +    if (dmp->dmaaddr & ~PAGE_MASK)
> +        return -EINVAL;
> +    if (dmp->size & ~PAGE_MASK)
> +        return -EINVAL;
> +
> +    mutex_lock(&iommu->dgate);
> +
> +    list_for_each_safe(pos, n, &iommu->dm_list) {
> +        struct dma_map_page *mlp;
> +
> +        mlp = list_entry(pos, struct dma_map_page, list);
> +        if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> +                   dmp->dmaaddr, dmp->size)) {
> +            ret = vfio_remove_dma_overlap(iommu, dmp->dmaaddr,
> +                              dmp->size, mlp);
> +            if (ret > 0)
> +                npage -= NPAGE_TO_SIZE(ret);

Why NPAGE_TO_SIZE here?

> +            if (ret < 0 || npage == 0)
> +                break;
> +        }
> +    }
> +    mutex_unlock(&iommu->dgate);
> +    return ret > 0 ? 0 : ret;
> +}
> +

Function should be static.

> +int vfio_dma_map_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> +{
> +    int npage;
> +    struct dma_map_page *mlp, *mmlp = NULL;
> +    dma_addr_t daddr = dmp->dmaaddr;
Alex Williamson Nov. 9, 2011, 4:41 a.m. UTC | #2
On Tue, 2011-11-08 at 20:17 -0800, Aaron Fabbri wrote:
> I'm going to send out chunks of comments as I go over this stuff.  Below
> I've covered the documentation file and vfio_iommu.c.  More comments coming
> soon...
> 
> On 11/3/11 1:12 PM, "Alex Williamson" <alex.williamson@redhat.com> wrote:
> 
> > VFIO provides a secure, IOMMU based interface for user space
> > drivers, including device assignment to virtual machines.
> > This provides the base management of IOMMU groups, devices,
> > and IOMMU objects.  See Documentation/vfio.txt included in
> > this patch for user and kernel API description.
> > 
> > Note, this implements the new API discussed at KVM Forum
> > 2011, as represented by the drvier version 0.2.  It's hoped
> > that this provides a modular enough interface to support PCI
> > and non-PCI userspace drivers across various architectures
> > and IOMMU implementations.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> <snip>
> > +
> > +Groups, Devices, IOMMUs, oh my
> > +-----------------------------------------------------------------------------
> > --
> > +
> > +A fundamental component of VFIO is the notion of IOMMU groups.  IOMMUs
> > +can't always distinguish transactions from each individual device in
> > +the system.  Sometimes this is because of the IOMMU design, such as with
> > +PEs, other times it's caused by the I/O topology, for instance a
> 
> Can you define this acronym the first time you use it, i.e.
> 
> + PEs (partitionable endpoints), ...

It was actually up in the <snip>:

... POWER systems with Partitionable Endpoints (PEs) ...

I tried to make sure I defined them, but let me know if anything else is
missing/non-obvious.

> > +PCIe-to-PCI bridge masking all devices behind it.  We call the sets of
> > +devices created by these restictions IOMMU groups (or just "groups" for
> 
> restrictions

Ugh, lost w/o a spell checker.  Fixed all these.

> > diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
> > new file mode 100644
> > index 0000000..029dae3
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_iommu.c
> <snip>
> > +static struct dma_map_page *vfio_find_dma(struct vfio_iommu *iommu,
> > +                      dma_addr_t start, size_t size)
> > +{
> > +    struct list_head *pos;
> > +    struct dma_map_page *mlp;
> > +
> > +    list_for_each(pos, &iommu->dm_list) {
> > +        mlp = list_entry(pos, struct dma_map_page, list);
> > +        if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> > +                   start, size))
> > +            return mlp;
> > +    }
> > +    return NULL;
> > +}
> > +
> 
> This function below should be static.

Fixed

> > +int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
> > +                size_t size, struct dma_map_page *mlp)
> > +{
> > +    struct dma_map_page *split;
> > +    int npage_lo, npage_hi;
> > +
> > +    /* Existing dma region is completely covered, unmap all */
> > +    if (start <= mlp->daddr &&
> > +        start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> > +        vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> > +        list_del(&mlp->list);
> > +        npage_lo = mlp->npage;
> > +        kfree(mlp);
> > +        return npage_lo;
> > +    }
> > +
> > +    /* Overlap low address of existing range */
> > +    if (start <= mlp->daddr) {
> > +        size_t overlap;
> > +
> > +        overlap = start + size - mlp->daddr;
> > +        npage_lo = overlap >> PAGE_SHIFT;
> > +        npage_hi = mlp->npage - npage_lo;
> 
> npage_hi not used.. Delete this line ^

Yep, and npage_lo in the next block.  I was setting them just for
symmetry, but they can be removed now.
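
For reference, a minimal sketch of the trimmed low-overlap branch once the
unused assignment is dropped (illustrative only, not the actual respin):

    /* Overlap low address of existing range */
    if (start <= mlp->daddr) {
        size_t overlap = start + size - mlp->daddr;

        npage_lo = overlap >> PAGE_SHIFT;

        vfio_dma_unmap(iommu, mlp->daddr, npage_lo, mlp->rdwr);
        mlp->daddr += overlap;
        mlp->vaddr += overlap;
        mlp->npage -= npage_lo;
        return npage_lo;
    }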

> > +
> > +        vfio_dma_unmap(iommu, mlp->daddr, npage_lo, mlp->rdwr);
> > +        mlp->daddr += overlap;
> > +        mlp->vaddr += overlap;
> > +        mlp->npage -= npage_lo;
> > +        return npage_lo;
> > +    }
> > +
> > +    /* Overlap high address of existing range */
> > +    if (start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> > +        size_t overlap;
> > +
> > +        overlap = mlp->daddr + NPAGE_TO_SIZE(mlp->npage) - start;
> > +        npage_hi = overlap >> PAGE_SHIFT;
> > +        npage_lo = mlp->npage - npage_hi;
> > +
> > +        vfio_dma_unmap(iommu, start, npage_hi, mlp->rdwr);
> > +        mlp->npage -= npage_hi;
> > +        return npage_hi;
> > +    }
> > +
> > +    /* Split existing */
> > +    npage_lo = (start - mlp->daddr) >> PAGE_SHIFT;
> > +    npage_hi = mlp->npage - (size >> PAGE_SHIFT) - npage_lo;
> > +
> > +    split = kzalloc(sizeof *split, GFP_KERNEL);
> > +    if (!split)
> > +        return -ENOMEM;
> > +
> > +    vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, mlp->rdwr);
> > +
> > +    mlp->npage = npage_lo;
> > +
> > +    split->npage = npage_hi;
> > +    split->daddr = start + size;
> > +    split->vaddr = mlp->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
> > +    split->rdwr = mlp->rdwr;
> > +    list_add(&split->list, &iommu->dm_list);
> > +    return size >> PAGE_SHIFT;
> > +}
> > +
> 
> Function should be static.

Fixed

> > +int vfio_dma_unmap_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> > +{
> > +    int ret = 0;
> > +    size_t npage = dmp->size >> PAGE_SHIFT;
> > +    struct list_head *pos, *n;
> > +
> > +    if (dmp->dmaaddr & ~PAGE_MASK)
> > +        return -EINVAL;
> > +    if (dmp->size & ~PAGE_MASK)
> > +        return -EINVAL;
> > +
> > +    mutex_lock(&iommu->dgate);
> > +
> > +    list_for_each_safe(pos, n, &iommu->dm_list) {
> > +        struct dma_map_page *mlp;
> > +
> > +        mlp = list_entry(pos, struct dma_map_page, list);
> > +        if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> > +                   dmp->dmaaddr, dmp->size)) {
> > +            ret = vfio_remove_dma_overlap(iommu, dmp->dmaaddr,
> > +                              dmp->size, mlp);
> > +            if (ret > 0)
> > +                npage -= NPAGE_TO_SIZE(ret);
> 
> Why NPAGE_TO_SIZE here?

Looks like a bug, I'll change and test.
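
For clarity, the corrected accounting presumably just subtracts the returned
page count directly (a sketch, pending the retest mentioned above):

    ret = vfio_remove_dma_overlap(iommu, dmp->dmaaddr,
                                  dmp->size, mlp);
    if (ret > 0)
        npage -= ret;    /* ret is already a page count */
    if (ret < 0 || npage == 0)
        break;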

> > +            if (ret < 0 || npage == 0)
> > +                break;
> > +        }
> > +    }
> > +    mutex_unlock(&iommu->dgate);
> > +    return ret > 0 ? 0 : ret;
> > +}
> > +
> 
> Function should be static.

Fixed.

> > +int vfio_dma_map_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> > +{
> > +    int npage;
> > +    struct dma_map_page *mlp, *mmlp = NULL;
> > +    dma_addr_t daddr = dmp->dmaaddr;
> 

Thanks!

Alex
Christian Benvenuti (benve) Nov. 9, 2011, 8:11 a.m. UTC | #3
I have not gone through the whole patch yet, but here are
my first comments/questions about the code in vfio_main.c
(and pci/vfio_pci.c).

> -----Original Message-----

> From: Alex Williamson [mailto:alex.williamson@redhat.com]

> Sent: Thursday, November 03, 2011 1:12 PM

> To: chrisw@sous-sol.org; aik@au1.ibm.com; pmac@au1.ibm.com;

> dwg@au1.ibm.com; joerg.roedel@amd.com; agraf@suse.de; Christian

> Benvenuti (benve); Aaron Fabbri (aafabbri); B08248@freescale.com;

> B07421@freescale.com; avi@redhat.com; konrad.wilk@oracle.com;

> kvm@vger.kernel.org; qemu-devel@nongnu.org; iommu@lists.linux-

> foundation.org; linux-pci@vger.kernel.org

> Subject: [RFC PATCH] vfio: VFIO Driver core framework


<snip>

> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c

> new file mode 100644

> index 0000000..6169356

> --- /dev/null

> +++ b/drivers/vfio/vfio_main.c

> @@ -0,0 +1,1151 @@

> +/*

> + * VFIO framework

> + *

> + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.

> + *     Author: Alex Williamson <alex.williamson@redhat.com>

> + *

> + * This program is free software; you can redistribute it and/or

> modify

> + * it under the terms of the GNU General Public License version 2 as

> + * published by the Free Software Foundation.

> + *

> + * Derived from original vfio:

> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.

> + * Author: Tom Lyon, pugs@cisco.com

> + */

> +

> +#include <linux/cdev.h>

> +#include <linux/compat.h>

> +#include <linux/device.h>

> +#include <linux/file.h>

> +#include <linux/anon_inodes.h>

> +#include <linux/fs.h>

> +#include <linux/idr.h>

> +#include <linux/iommu.h>

> +#include <linux/mm.h>

> +#include <linux/module.h>

> +#include <linux/slab.h>

> +#include <linux/string.h>

> +#include <linux/uaccess.h>

> +#include <linux/vfio.h>

> +#include <linux/wait.h>

> +

> +#include "vfio_private.h"

> +

> +#define DRIVER_VERSION	"0.2"

> +#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"

> +#define DRIVER_DESC	"VFIO - User Level meta-driver"

> +

> +static int allow_unsafe_intrs;

> +module_param(allow_unsafe_intrs, int, 0);

> +MODULE_PARM_DESC(allow_unsafe_intrs,

> +        "Allow use of IOMMUs which do not support interrupt

> remapping");

> +

> +static struct vfio {

> +	dev_t			devt;

> +	struct cdev		cdev;

> +	struct list_head	group_list;

> +	struct mutex		lock;

> +	struct kref		kref;

> +	struct class		*class;

> +	struct idr		idr;

> +	wait_queue_head_t	release_q;

> +} vfio;

> +

> +static const struct file_operations vfio_group_fops;

> +extern const struct file_operations vfio_iommu_fops;

> +

> +struct vfio_group {

> +	dev_t			devt;

> +	unsigned int		groupid;


This groupid is returned by the device_group callback you recently added
with a separate (not yet in tree) IOMMU patch.
Is it correct to say that the scope of this ID is the bus the iommu
belongs to (but you use it as if it were global)?
I believe there is nothing right now to ensure the uniqueness of such
an ID across bus types (assuming there will be other bus drivers in the
future besides vfio-pci).
If that's the case, shouldn't the vfio.group_list global list and the
__vfio_lookup_dev routine be changed to account for the bus too?
Oops, I just saw the error message in vfio_group_add_dev about the group id conflict.
Is that warning related to what I mentioned above?
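
For illustration, a bus-aware lookup could be as simple as comparing the
group's bus as well (rough sketch, the helper name is made up):

    static struct vfio_group *__vfio_find_group(struct device *dev,
                                                unsigned int groupid)
    {
        struct list_head *pos;

        list_for_each(pos, &vfio.group_list) {
            struct vfio_group *group;

            group = list_entry(pos, struct vfio_group, group_next);
            /* groupids may only be unique within a bus_type */
            if (group->groupid == groupid && group->bus == dev->bus)
                return group;
        }
        return NULL;
    }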

> +	struct bus_type		*bus;

> +	struct vfio_iommu	*iommu;

> +	struct list_head	device_list;

> +	struct list_head	iommu_next;

> +	struct list_head	group_next;

> +	int			refcnt;

> +};

> +

> +struct vfio_device {

> +	struct device			*dev;

> +	const struct vfio_device_ops	*ops;

> +	struct vfio_iommu		*iommu;


I wonder if you need to have the 'iommu' field here.
vfio_device.iommu is always set and reset together with
vfio_group.iommu.
Given that a vfio_device instance is always linked to a vfio_group
instance, do we need this duplication? Is this duplication there
because you do not want the double dereference device->group->iommu?
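
If the field were dropped, the double dereference could also be hidden behind
a trivial accessor (hypothetical helper, just to show the idea):

    static inline struct vfio_iommu *vfio_device_iommu(struct vfio_device *device)
    {
        return device->group ? device->group->iommu : NULL;
    }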

> +	struct vfio_group		*group;

> +	struct list_head		device_next;

> +	bool				attached;

> +	int				refcnt;

> +	void				*device_data;

> +};

> +

> +/*

> + * Helper functions called under vfio.lock

> + */

> +

> +/* Return true if any devices within a group are opened */

> +static bool __vfio_group_devs_inuse(struct vfio_group *group)

> +{

> +	struct list_head *pos;

> +

> +	list_for_each(pos, &group->device_list) {

> +		struct vfio_device *device;

> +

> +		device = list_entry(pos, struct vfio_device, device_next);

> +		if (device->refcnt)

> +			return true;

> +	}

> +	return false;

> +}

> +

> +/* Return true if any of the groups attached to an iommu are opened.

> + * We can only tear apart merged groups when nothing is left open. */

> +static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu)

> +{

> +	struct list_head *pos;

> +

> +	list_for_each(pos, &iommu->group_list) {

> +		struct vfio_group *group;

> +

> +		group = list_entry(pos, struct vfio_group, iommu_next);

> +		if (group->refcnt)

> +			return true;

> +	}

> +	return false;

> +}

> +

> +/* An iommu is "in use" if it has a file descriptor open or if any of

> + * the groups assigned to the iommu have devices open. */

> +static bool __vfio_iommu_inuse(struct vfio_iommu *iommu)

> +{

> +	struct list_head *pos;

> +

> +	if (iommu->refcnt)

> +		return true;

> +

> +	list_for_each(pos, &iommu->group_list) {

> +		struct vfio_group *group;

> +

> +		group = list_entry(pos, struct vfio_group, iommu_next);

> +

> +		if (__vfio_group_devs_inuse(group))

> +			return true;

> +	}

> +	return false;

> +}


I looked at how you take care of ref counts ...

This is how the tree of vfio_iommu/vfio_group/vfio_device data
structures is organized (I'll use just iommu/group/dev to make
the graph smaller):

            iommu
           /     \
          /       \ 
    group   ...     group
    /  \           /  \   
   /    \         /    \
dev  ..  dev   dev  ..  dev

This is how you get a file descriptor for the three kind of objects:

- group : open /dev/vfio/xxx for group xxx
- iommu : group ioctl VFIO_GROUP_GET_IOMMU_FD
- device: group ioctl VFIO_GROUP_GET_DEVICE_FD

Given the above topology, I would assume that:

(1) an iommu is 'inuse' if : a) iommu refcnt > 0, or
                             b) any of its groups is 'inuse'

(2) a  group is 'inuse' if : a) group refcnt > 0, or
                             b) any of its devices is 'inuse'

(3) a device is 'inuse' if : a) device refcnt > 0

You have coded the 'inuse' logic with these three routines:

    __vfio_iommu_inuse, which implements (1) above

and
    __vfio_iommu_groups_inuse
    __vfio_group_devs_inuse

which are used by __vfio_iommu_inuse.
Why don't you check the group refcnt in __vfio_iommu_groups_inuse?

Would it make sense (and make the code more readable) to structure the
nested refcnt/inuse check like this?
(The numbers (1)(2)(3) refer to the three 'inuse' conditions above)

   (1)__vfio_iommu_inuse
   |
   +-> check iommu refcnt
   +-> __vfio_iommu_groups_inuse
       |
       +->LOOP: (2)__vfio_iommu_group_inuse<--MISSING
                |
                +-> check group refcnt<--MISSING
                +-> __vfio_group_devs_inuse()
                    |
                    +-> LOOP: (3)__vfio_group_dev_inuse<--MISSING
                              |
                              +-> check device refcnt
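
In code, the missing middle level might look roughly like this (sketch only,
the helper name is made up to match the diagram above):

    static bool __vfio_iommu_group_inuse(struct vfio_group *group)
    {
        if (group->refcnt)
            return true;

        return __vfio_group_devs_inuse(group);
    }

    static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu)
    {
        struct list_head *pos;

        list_for_each(pos, &iommu->group_list) {
            struct vfio_group *group;

            group = list_entry(pos, struct vfio_group, iommu_next);
            if (__vfio_iommu_group_inuse(group))
                return true;
        }
        return false;
    }

__vfio_iommu_inuse() would then just be the iommu refcnt check plus a call
to the loop above.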

> +static void __vfio_group_set_iommu(struct vfio_group *group,

> +				   struct vfio_iommu *iommu)

> +{

> +	struct list_head *pos;

> +

> +	if (group->iommu)

> +		list_del(&group->iommu_next);

> +	if (iommu)

> +		list_add(&group->iommu_next, &iommu->group_list);

> +

> +	group->iommu = iommu;


If you remove the vfio_device.iommu field (as suggested above in a previous
comment), the block below would not be needed anymore.

> +	list_for_each(pos, &group->device_list) {

> +		struct vfio_device *device;

> +

> +		device = list_entry(pos, struct vfio_device, device_next);

> +		device->iommu = iommu;

> +	}

> +}

> +

> +static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,

> +				    struct vfio_device *device)

> +{

> +	BUG_ON(!iommu->domain && device->attached);

> +

> +	if (!iommu->domain || !device->attached)

> +		return;

> +

> +	iommu_detach_device(iommu->domain, device->dev);

> +	device->attached = false;

> +}

> +

> +static void __vfio_iommu_detach_group(struct vfio_iommu *iommu,

> +				      struct vfio_group *group)

> +{

> +	struct list_head *pos;

> +

> +	list_for_each(pos, &group->device_list) {

> +		struct vfio_device *device;

> +

> +		device = list_entry(pos, struct vfio_device, device_next);

> +		__vfio_iommu_detach_dev(iommu, device);

> +	}

> +}

> +

> +static int __vfio_iommu_attach_dev(struct vfio_iommu *iommu,

> +				   struct vfio_device *device)

> +{

> +	int ret;

> +

> +	BUG_ON(device->attached);

> +

> +	if (!iommu || !iommu->domain)

> +		return -EINVAL;

> +

> +	ret = iommu_attach_device(iommu->domain, device->dev);

> +	if (!ret)

> +		device->attached = true;

> +

> +	return ret;

> +}

> +

> +static int __vfio_iommu_attach_group(struct vfio_iommu *iommu,

> +				     struct vfio_group *group)

> +{

> +	struct list_head *pos;

> +

> +	list_for_each(pos, &group->device_list) {

> +		struct vfio_device *device;

> +		int ret;

> +

> +		device = list_entry(pos, struct vfio_device, device_next);

> +		ret = __vfio_iommu_attach_dev(iommu, device);

> +		if (ret) {

> +			__vfio_iommu_detach_group(iommu, group);

> +			return ret;

> +		}

> +	}

> +	return 0;

> +}

> +

> +/* The iommu is viable, ie. ready to be configured, when all the

> devices

> + * for all the groups attached to the iommu are bound to their vfio

> device

> + * drivers (ex. vfio-pci).  This sets the device_data private data

> pointer. */

> +static bool __vfio_iommu_viable(struct vfio_iommu *iommu)

> +{

> +	struct list_head *gpos, *dpos;

> +

> +	list_for_each(gpos, &iommu->group_list) {

> +		struct vfio_group *group;

> +		group = list_entry(gpos, struct vfio_group, iommu_next);

> +

> +		list_for_each(dpos, &group->device_list) {

> +			struct vfio_device *device;

> +			device = list_entry(dpos,

> +					    struct vfio_device, device_next);

> +

> +			if (!device->device_data)

> +				return false;

> +		}

> +	}

> +	return true;

> +}

> +

> +static void __vfio_close_iommu(struct vfio_iommu *iommu)

> +{

> +	struct list_head *pos;

> +

> +	if (!iommu->domain)

> +		return;

> +

> +	list_for_each(pos, &iommu->group_list) {

> +		struct vfio_group *group;

> +		group = list_entry(pos, struct vfio_group, iommu_next);

> +

> +		__vfio_iommu_detach_group(iommu, group);

> +	}

> +

> +	vfio_iommu_unmapall(iommu);

> +

> +	iommu_domain_free(iommu->domain);

> +	iommu->domain = NULL;

> +	iommu->mm = NULL;

> +}

> +

> +/* Open the IOMMU.  This gates all access to the iommu or device file

> + * descriptors and sets current->mm as the exclusive user. */


Given the fn vfio_group_open (ie, 1st object, 2nd operation), I would have
called this one __vfio_iommu_open (instead of __vfio_open_iommu).
Is it named __vfio_open_iommu to avoid a conflict with the namespace in vfio_iommu.c?      

> +static int __vfio_open_iommu(struct vfio_iommu *iommu)

> +{

> +	struct list_head *pos;

> +	int ret;

> +

> +	if (!__vfio_iommu_viable(iommu))

> +		return -EBUSY;

> +

> +	if (iommu->domain)

> +		return -EINVAL;

> +

> +	iommu->domain = iommu_domain_alloc(iommu->bus);

> +	if (!iommu->domain)

> +		return -EFAULT;

> +

> +	list_for_each(pos, &iommu->group_list) {

> +		struct vfio_group *group;

> +		group = list_entry(pos, struct vfio_group, iommu_next);

> +

> +		ret = __vfio_iommu_attach_group(iommu, group);

> +		if (ret) {

> +			__vfio_close_iommu(iommu);

> +			return ret;

> +		}

> +	}

> +

> +	if (!allow_unsafe_intrs &&

> +	    !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {

> +		__vfio_close_iommu(iommu);

> +		return -EFAULT;

> +	}

> +

> +	iommu->cache = (iommu_domain_has_cap(iommu->domain,

> +					     IOMMU_CAP_CACHE_COHERENCY) != 0);

> +	iommu->mm = current->mm;

> +

> +	return 0;

> +}

> +

> +/* Actively try to tear down the iommu and merged groups.  If there

> are no

> + * open iommu or device fds, we close the iommu.  If we close the

> iommu and

> + * there are also no open group fds, we can futher dissolve the group

> to

> + * iommu association and free the iommu data structure. */

> +static int __vfio_try_dissolve_iommu(struct vfio_iommu *iommu)

> +{

> +

> +	if (__vfio_iommu_inuse(iommu))

> +		return -EBUSY;

> +

> +	__vfio_close_iommu(iommu);

> +

> +	if (!__vfio_iommu_groups_inuse(iommu)) {

> +		struct list_head *pos, *ppos;

> +

> +		list_for_each_safe(pos, ppos, &iommu->group_list) {

> +			struct vfio_group *group;

> +

> +			group = list_entry(pos, struct vfio_group,

> iommu_next);

> +			__vfio_group_set_iommu(group, NULL);

> +		}

> +

> +

> +		kfree(iommu);

> +	}

> +

> +	return 0;

> +}

> +

> +static struct vfio_device *__vfio_lookup_dev(struct device *dev)

> +{

> +	struct list_head *gpos;

> +	unsigned int groupid;

> +

> +	if (iommu_device_group(dev, &groupid))

> +		return NULL;

> +

> +	list_for_each(gpos, &vfio.group_list) {

> +		struct vfio_group *group;

> +		struct list_head *dpos;

> +

> +		group = list_entry(gpos, struct vfio_group, group_next);

> +

> +		if (group->groupid != groupid)

> +			continue;

> +

> +		list_for_each(dpos, &group->device_list) {

> +			struct vfio_device *device;

> +

> +			device = list_entry(dpos,

> +					    struct vfio_device, device_next);

> +

> +			if (device->dev == dev)

> +				return device;

> +		}

> +	}

> +	return NULL;

> +}

> +

> +/* All release paths simply decrement the refcnt, attempt to teardown

> + * the iommu and merged groups, and wakeup anything that might be

> + * waiting if we successfully dissolve anything. */

> +static int vfio_do_release(int *refcnt, struct vfio_iommu *iommu)

> +{

> +	bool wake;

> +

> +	mutex_lock(&vfio.lock);

> +

> +	(*refcnt)--;

> +	wake = (__vfio_try_dissolve_iommu(iommu) == 0);

> +

> +	mutex_unlock(&vfio.lock);

> +

> +	if (wake)

> +		wake_up(&vfio.release_q);

> +

> +	return 0;

> +}

> +

> +/*

> + * Device fops - passthrough to vfio device driver w/ device_data

> + */

> +static int vfio_device_release(struct inode *inode, struct file

> *filep)

> +{

> +	struct vfio_device *device = filep->private_data;

> +

> +	vfio_do_release(&device->refcnt, device->iommu);

> +

> +	device->ops->put(device->device_data);

> +

> +	return 0;

> +}

> +

> +static long vfio_device_unl_ioctl(struct file *filep,

> +				  unsigned int cmd, unsigned long arg)

> +{

> +	struct vfio_device *device = filep->private_data;

> +

> +	return device->ops->ioctl(device->device_data, cmd, arg);

> +}

> +

> +static ssize_t vfio_device_read(struct file *filep, char __user *buf,

> +				size_t count, loff_t *ppos)

> +{

> +	struct vfio_device *device = filep->private_data;

> +

> +	return device->ops->read(device->device_data, buf, count, ppos);

> +}

> +

> +static ssize_t vfio_device_write(struct file *filep, const char __user

> *buf,

> +				 size_t count, loff_t *ppos)

> +{

> +	struct vfio_device *device = filep->private_data;

> +

> +	return device->ops->write(device->device_data, buf, count, ppos);

> +}

> +

> +static int vfio_device_mmap(struct file *filep, struct vm_area_struct

> *vma)

> +{

> +	struct vfio_device *device = filep->private_data;

> +

> +	return device->ops->mmap(device->device_data, vma);

> +}

> +

> +#ifdef CONFIG_COMPAT

> +static long vfio_device_compat_ioctl(struct file *filep,

> +				     unsigned int cmd, unsigned long arg)

> +{

> +	arg = (unsigned long)compat_ptr(arg);

> +	return vfio_device_unl_ioctl(filep, cmd, arg);

> +}

> +#endif	/* CONFIG_COMPAT */

> +

> +const struct file_operations vfio_device_fops = {

> +	.owner		= THIS_MODULE,

> +	.release	= vfio_device_release,

> +	.read		= vfio_device_read,

> +	.write		= vfio_device_write,

> +	.unlocked_ioctl	= vfio_device_unl_ioctl,

> +#ifdef CONFIG_COMPAT

> +	.compat_ioctl	= vfio_device_compat_ioctl,

> +#endif

> +	.mmap		= vfio_device_mmap,

> +};

> +

> +/*

> + * Group fops

> + */

> +static int vfio_group_open(struct inode *inode, struct file *filep)

> +{

> +	struct vfio_group *group;

> +	int ret = 0;

> +

> +	mutex_lock(&vfio.lock);

> +

> +	group = idr_find(&vfio.idr, iminor(inode));

> +

> +	if (!group) {

> +		ret = -ENODEV;

> +		goto out;

> +	}

> +

> +	filep->private_data = group;

> +

> +	if (!group->iommu) {

> +		struct vfio_iommu *iommu;

> +

> +		iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);

> +		if (!iommu) {

> +			ret = -ENOMEM;

> +			goto out;

> +		}

> +		INIT_LIST_HEAD(&iommu->group_list);

> +		INIT_LIST_HEAD(&iommu->dm_list);

> +		mutex_init(&iommu->dgate);

> +		iommu->bus = group->bus;

> +		__vfio_group_set_iommu(group, iommu);

> +	}

> +	group->refcnt++;

> +

> +out:

> +	mutex_unlock(&vfio.lock);

> +

> +	return ret;

> +}

> +

> +static int vfio_group_release(struct inode *inode, struct file *filep)

> +{

> +	struct vfio_group *group = filep->private_data;

> +

> +	return vfio_do_release(&group->refcnt, group->iommu);

> +}

> +

> +/* Attempt to merge the group pointed to by fd into group.  The merge-

> ee

> + * group must not have an iommu or any devices open because we cannot

> + * maintain that context across the merge.  The merge-er group can be

> + * in use. */

> +static int vfio_group_merge(struct vfio_group *group, int fd)


The documentation in vfio.txt explains clearly the logic implemented by
the merge/unmerge group ioctls.
However, what you are doing is not merging groups, but rather adding/removing
groups to/from iommus (and creating flat lists of groups).
For example, when you do

  merge(A,B)

you actually mean to say "merge B into the list of groups assigned to the
same iommu as group A".
For the same reason, you do not really need to provide the group you want
to unmerge from, which means that instead of

  unmerge(A,B) 

you would just need

  unmerge(B)

I understand the reason why it is not a real merge/unmerge (ie, to keep the
original groups so that you can unmerge later) ... however I just wonder if
it wouldn't be more natural to implement the VFIO_IOMMU_ADD_GROUP/DEL_GROUP
iommu ioctls instead? (the relationships between the data structure would
remain the same)
I guess you already discarded this option for some reason, right? What was
the reason?
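
Purely to make the alternative concrete (these ioctls do not exist in the
patch; names and numbers below are made up, and the fragment follows the
existing pass-by-pointer convention for the int argument):

    /* hypothetical replacement for VFIO_GROUP_MERGE/UNMERGE */
    #define VFIO_IOMMU_ADD_GROUP    _IOW(';', 114, int)    /* arg: group fd */
    #define VFIO_IOMMU_DEL_GROUP    _IOW(';', 115, int)    /* arg: group fd */

    /* userspace view: attach group B to the iommu backing group A ... */
    iommu_fd = ioctl(group_a_fd, VFIO_GROUP_GET_IOMMU_FD);
    ioctl(iommu_fd, VFIO_IOMMU_ADD_GROUP, &group_b_fd);
    /* ... and later detach it again, no second group needed */
    ioctl(iommu_fd, VFIO_IOMMU_DEL_GROUP, &group_b_fd);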

> +{

> +	struct vfio_group *new;

> +	struct vfio_iommu *old_iommu;

> +	struct file *file;

> +	int ret = 0;

> +	bool opened = false;

> +

> +	mutex_lock(&vfio.lock);

> +

> +	file = fget(fd);

> +	if (!file) {

> +		ret = -EBADF;

> +		goto out_noput;

> +	}

> +

> +	/* Sanity check, is this really our fd? */

> +	if (file->f_op != &vfio_group_fops) {

> +		ret = -EINVAL;

> +		goto out;

> +	}

> +

> +	new = file->private_data;

> +

> +	if (!new || new == group || !new->iommu ||

> +	    new->iommu->domain || new->bus != group->bus) {

> +		ret = -EINVAL;

> +		goto out;

> +	}

> +

> +	/* We need to attach all the devices to each domain separately

> +	 * in order to validate that the capabilities match for both.  */

> +	ret = __vfio_open_iommu(new->iommu);

> +	if (ret)

> +		goto out;

> +

> +	if (!group->iommu->domain) {

> +		ret = __vfio_open_iommu(group->iommu);

> +		if (ret)

> +			goto out;

> +		opened = true;

> +	}

> +

> +	/* If cache coherency doesn't match we'd potentialy need to

> +	 * remap existing iommu mappings in the merge-er domain.

> +	 * Poor return to bother trying to allow this currently. */

> +	if (iommu_domain_has_cap(group->iommu->domain,

> +				 IOMMU_CAP_CACHE_COHERENCY) !=

> +	    iommu_domain_has_cap(new->iommu->domain,

> +				 IOMMU_CAP_CACHE_COHERENCY)) {

> +		__vfio_close_iommu(new->iommu);

> +		if (opened)

> +			__vfio_close_iommu(group->iommu);

> +		ret = -EINVAL;

> +		goto out;

> +	}

> +

> +	/* Close the iommu for the merge-ee and attach all its devices

> +	 * to the merge-er iommu. */

> +	__vfio_close_iommu(new->iommu);

> +

> +	ret = __vfio_iommu_attach_group(group->iommu, new);

> +	if (ret)

> +		goto out;

> +

> +	/* set_iommu unlinks new from the iommu, so save a pointer to it

> */

> +	old_iommu = new->iommu;

> +	__vfio_group_set_iommu(new, group->iommu);

> +	kfree(old_iommu);

> +

> +out:

> +	fput(file);

> +out_noput:

> +	mutex_unlock(&vfio.lock);

> +	return ret;

> +}

> +

> +/* Unmerge the group pointed to by fd from group. */

> +static int vfio_group_unmerge(struct vfio_group *group, int fd)

> +{

> +	struct vfio_group *new;

> +	struct vfio_iommu *new_iommu;

> +	struct file *file;

> +	int ret = 0;

> +

> +	/* Since the merge-out group is already opened, it needs to

> +	 * have an iommu struct associated with it. */

> +	new_iommu = kzalloc(sizeof(*new_iommu), GFP_KERNEL);

> +	if (!new_iommu)

> +		return -ENOMEM;

> +

> +	INIT_LIST_HEAD(&new_iommu->group_list);

> +	INIT_LIST_HEAD(&new_iommu->dm_list);

> +	mutex_init(&new_iommu->dgate);

> +	new_iommu->bus = group->bus;

> +

> +	mutex_lock(&vfio.lock);

> +

> +	file = fget(fd);

> +	if (!file) {

> +		ret = -EBADF;

> +		goto out_noput;

> +	}

> +

> +	/* Sanity check, is this really our fd? */

> +	if (file->f_op != &vfio_group_fops) {

> +		ret = -EINVAL;

> +		goto out;

> +	}

> +

> +	new = file->private_data;

> +	if (!new || new == group || new->iommu != group->iommu) {

> +		ret = -EINVAL;

> +		goto out;

> +	}

> +

> +	/* We can't merge-out a group with devices still in use. */

> +	if (__vfio_group_devs_inuse(new)) {

> +		ret = -EBUSY;

> +		goto out;

> +	}

> +

> +	__vfio_iommu_detach_group(group->iommu, new);

> +	__vfio_group_set_iommu(new, new_iommu);

> +

> +out:

> +	fput(file);

> +out_noput:

> +	if (ret)

> +		kfree(new_iommu);

> +	mutex_unlock(&vfio.lock);

> +	return ret;

> +}

> +

> +/* Get a new iommu file descriptor.  This will open the iommu, setting

> + * the current->mm ownership if it's not already set. */

> +static int vfio_group_get_iommu_fd(struct vfio_group *group)

> +{

> +	int ret = 0;

> +

> +	mutex_lock(&vfio.lock);

> +

> +	if (!group->iommu->domain) {

> +		ret = __vfio_open_iommu(group->iommu);

> +		if (ret)

> +			goto out;

> +	}

> +

> +	ret = anon_inode_getfd("[vfio-iommu]", &vfio_iommu_fops,

> +			       group->iommu, O_RDWR);

> +	if (ret < 0)

> +		goto out;

> +

> +	group->iommu->refcnt++;

> +out:

> +	mutex_unlock(&vfio.lock);

> +	return ret;

> +}

> +

> +/* Get a new device file descriptor.  This will open the iommu,

> setting

> + * the current->mm ownership if it's not already set.  It's difficult

> to

> + * specify the requirements for matching a user supplied buffer to a

> + * device, so we use a vfio driver callback to test for a match.  For

> + * PCI, dev_name(dev) is unique, but other drivers may require

> including

> + * a parent device string. */

> +static int vfio_group_get_device_fd(struct vfio_group *group, char

> *buf)

> +{

> +	struct vfio_iommu *iommu = group->iommu;

> +	struct list_head *gpos;

> +	int ret = -ENODEV;

> +

> +	mutex_lock(&vfio.lock);

> +

> +	if (!iommu->domain) {

> +		ret = __vfio_open_iommu(iommu);

> +		if (ret)

> +			goto out;

> +	}

> +

> +	list_for_each(gpos, &iommu->group_list) {

> +		struct list_head *dpos;

> +

> +		group = list_entry(gpos, struct vfio_group, iommu_next);

> +

> +		list_for_each(dpos, &group->device_list) {

> +			struct vfio_device *device;

> +

> +			device = list_entry(dpos,

> +					    struct vfio_device, device_next);

> +

> +			if (device->ops->match(device->dev, buf)) {

> +				struct file *file;

> +

> +				if (device->ops->get(device->device_data)) {

> +					ret = -EFAULT;

> +					goto out;

> +				}

> +

> +				/* We can't use anon_inode_getfd(), like above

> +				 * because we need to modify the f_mode flags

> +				 * directly to allow more than just ioctls */

> +				ret = get_unused_fd();

> +				if (ret < 0) {

> +					device->ops->put(device->device_data);

> +					goto out;

> +				}

> +

> +				file = anon_inode_getfile("[vfio-device]",

> +							  &vfio_device_fops,

> +							  device, O_RDWR);

> +				if (IS_ERR(file)) {

> +					put_unused_fd(ret);

> +					ret = PTR_ERR(file);

> +					device->ops->put(device->device_data);

> +					goto out;

> +				}

> +

> +				/* Todo: add an anon_inode interface to do

> +				 * this.  Appears to be missing by lack of

> +				 * need rather than explicitly prevented.

> +				 * Now there's need. */

> +				file->f_mode |= (FMODE_LSEEK |

> +						 FMODE_PREAD |

> +						 FMODE_PWRITE);

> +

> +				fd_install(ret, file);

> +

> +				device->refcnt++;

> +				goto out;

> +			}

> +		}

> +	}

> +out:

> +	mutex_unlock(&vfio.lock);

> +	return ret;

> +}

> +

> +static long vfio_group_unl_ioctl(struct file *filep,

> +				 unsigned int cmd, unsigned long arg)

> +{

> +	struct vfio_group *group = filep->private_data;

> +

> +	if (cmd == VFIO_GROUP_GET_FLAGS) {

> +		u64 flags = 0;

> +

> +		mutex_lock(&vfio.lock);

> +		if (__vfio_iommu_viable(group->iommu))

> +			flags |= VFIO_GROUP_FLAGS_VIABLE;

> +		mutex_unlock(&vfio.lock);

> +

> +		if (group->iommu->mm)

> +			flags |= VFIO_GROUP_FLAGS_MM_LOCKED;

> +

> +		return put_user(flags, (u64 __user *)arg);

> +	}

> +

> +	/* Below commands are restricted once the mm is set */

> +	if (group->iommu->mm && group->iommu->mm != current->mm)

> +		return -EPERM;

> +	if (cmd == VFIO_GROUP_MERGE || cmd == VFIO_GROUP_UNMERGE) {

> +		int fd;

> +

> +		if (get_user(fd, (int __user *)arg))

> +			return -EFAULT;

> +		if (fd < 0)

> +			return -EINVAL;

> +

> +		if (cmd == VFIO_GROUP_MERGE)

> +			return vfio_group_merge(group, fd);

> +		else

> +			return vfio_group_unmerge(group, fd);

> +	} else if (cmd == VFIO_GROUP_GET_IOMMU_FD) {

> +		return vfio_group_get_iommu_fd(group);

> +	} else if (cmd == VFIO_GROUP_GET_DEVICE_FD) {

> +		char *buf;

> +		int ret;

> +

> +		buf = strndup_user((const char __user *)arg, PAGE_SIZE);

> +		if (IS_ERR(buf))

> +			return PTR_ERR(buf);

> +

> +		ret = vfio_group_get_device_fd(group, buf);

> +		kfree(buf);

> +		return ret;

> +	}

> +

> +	return -ENOSYS;

> +}

> +

> +#ifdef CONFIG_COMPAT

> +static long vfio_group_compat_ioctl(struct file *filep,

> +				    unsigned int cmd, unsigned long arg)

> +{

> +	arg = (unsigned long)compat_ptr(arg);

> +	return vfio_group_unl_ioctl(filep, cmd, arg);

> +}

> +#endif	/* CONFIG_COMPAT */

> +

> +static const struct file_operations vfio_group_fops = {

> +	.owner		= THIS_MODULE,

> +	.open		= vfio_group_open,

> +	.release	= vfio_group_release,

> +	.unlocked_ioctl	= vfio_group_unl_ioctl,

> +#ifdef CONFIG_COMPAT

> +	.compat_ioctl	= vfio_group_compat_ioctl,

> +#endif

> +};

> +

> +/* iommu fd release hook */


Given vfio_device_release and
      vfio_group_release (ie, 1st object, 2nd operation), I was
going to suggest renaming the fn below to vfio_iommu_release, but
then I saw that name is already used in vfio_iommu.c ...
a bit confusing, but I guess it's ok then.

> +int vfio_release_iommu(struct vfio_iommu *iommu)

> +{

> +	return vfio_do_release(&iommu->refcnt, iommu);

> +}

> +

> +/*

> + * VFIO driver API

> + */

> +

> +/* Add a new device to the vfio framework with associated vfio driver

> + * callbacks.  This is the entry point for vfio drivers to register

> devices. */

> +int vfio_group_add_dev(struct device *dev, const struct

> vfio_device_ops *ops)

> +{

> +	struct list_head *pos;

> +	struct vfio_group *group = NULL;

> +	struct vfio_device *device = NULL;

> +	unsigned int groupid;

> +	int ret = 0;

> +	bool new_group = false;

> +

> +	if (!ops)

> +		return -EINVAL;

> +

> +	if (iommu_device_group(dev, &groupid))

> +		return -ENODEV;

> +

> +	mutex_lock(&vfio.lock);

> +

> +	list_for_each(pos, &vfio.group_list) {

> +		group = list_entry(pos, struct vfio_group, group_next);

> +		if (group->groupid == groupid)

> +			break;

> +		group = NULL;

> +	}

> +

> +	if (!group) {

> +		int minor;

> +

> +		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {

> +			ret = -ENOMEM;

> +			goto out;

> +		}

> +

> +		group = kzalloc(sizeof(*group), GFP_KERNEL);

> +		if (!group) {

> +			ret = -ENOMEM;

> +			goto out;

> +		}

> +

> +		group->groupid = groupid;

> +		INIT_LIST_HEAD(&group->device_list);

> +

> +		ret = idr_get_new(&vfio.idr, group, &minor);

> +		if (ret == 0 && minor > MINORMASK) {

> +			idr_remove(&vfio.idr, minor);

> +			kfree(group);

> +			ret = -ENOSPC;

> +			goto out;

> +		}

> +

> +		group->devt = MKDEV(MAJOR(vfio.devt), minor);

> +		device_create(vfio.class, NULL, group->devt,

> +			      group, "%u", groupid);

> +

> +		group->bus = dev->bus;

> +		list_add(&group->group_next, &vfio.group_list);

> +		new_group = true;

> +	} else {

> +		if (group->bus != dev->bus) {

> +			printk(KERN_WARNING

> +			       "Error: IOMMU group ID conflict.  Group ID %u

> "

> +				"on both bus %s and %s\n", groupid,

> +				group->bus->name, dev->bus->name);

> +			ret = -EFAULT;

> +			goto out;

> +		}

> +

> +		list_for_each(pos, &group->device_list) {

> +			device = list_entry(pos,

> +					    struct vfio_device, device_next);

> +			if (device->dev == dev)

> +				break;

> +			device = NULL;

> +		}

> +	}

> +

> +	if (!device) {

> +		if (__vfio_group_devs_inuse(group) ||

> +		    (group->iommu && group->iommu->refcnt)) {

> +			printk(KERN_WARNING

> +			       "Adding device %s to group %u while group is

> already in use!!\n",

> +			       dev_name(dev), group->groupid);

> +			/* XXX How to prevent other drivers from claiming? */


Here we are adding a device (not yet assigned to a vfio bus) to a group
that is already in use.
Given that it would not be acceptable for this device to get assigned
to a non-vfio driver, why not force such an assignment here then?
I am not sure though what the best way to do it would be.
What about something like this:

- when the vfio-pci bus driver processes the BUS_NOTIFY_ADD_DEVICE
  notification, it assigns the device a PCI ID that will make sure
  the vfio-pci probe routine gets invoked (and no other driver can
  therefore claim the device). That PCI ID would have to be added
  to the vfio_pci_driver's id_table (it would be the exception to the
  "only dynamic IDs" rule). Too hackish?

> +		}

> +

> +		device = kzalloc(sizeof(*device), GFP_KERNEL);

> +		if (!device) {

> +			/* If we just created this group, tear it down */

> +			if (new_group) {

> +				list_del(&group->group_next);

> +				device_destroy(vfio.class, group->devt);

> +				idr_remove(&vfio.idr, MINOR(group->devt));

> +				kfree(group);

> +			}

> +			ret = -ENOMEM;

> +			goto out;

> +		}

> +

> +		list_add(&device->device_next, &group->device_list);

> +		device->dev = dev;

> +		device->ops = ops;

> +		device->iommu = group->iommu; /* NULL if new */


Shouldn't you check the return code of __vfio_iommu_attach_dev?
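
i.e. something like the following, assuming the attach only matters when the
group's iommu already has a live domain (just a sketch of the idea):

    /* only attach now if there is a live domain; otherwise the device
     * gets attached later in __vfio_open_iommu() */
    if (group->iommu && group->iommu->domain) {
        ret = __vfio_iommu_attach_dev(group->iommu, device);
        if (ret) {
            list_del(&device->device_next);
            kfree(device);
            goto out;
        }
    }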

> +		__vfio_iommu_attach_dev(group->iommu, device);

> +	}

> +out:

> +	mutex_unlock(&vfio.lock);

> +	return ret;

> +}

> +EXPORT_SYMBOL_GPL(vfio_group_add_dev);

> +

> +/* Remove a device from the vfio framework */


This fn below does not return any error code. Ok ...
However, there are a number of error cases that you test, for example:
- device that does not belong to any group (according to the iommu API)
- device that belongs to a group but does not appear in the list
  of devices of the vfio_group structure.
Are the above two error checks just paranoia, or are those errors actually possible?
If they are possible, shouldn't we generate a warning (most probably
it would be a bug in the code)?
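
If they do point at bugs, a couple of warnings would at least make them
visible, e.g. (sketch, only the checks shown, surrounding code unchanged):

    if (WARN_ON(iommu_device_group(dev, &groupid)))
        return;
    ...
    if (WARN_ON(!group))
        goto out;
    ...
    if (WARN_ON(!device))
        goto out;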

> +void vfio_group_del_dev(struct device *dev)

> +{

> +	struct list_head *pos;

> +	struct vfio_group *group = NULL;

> +	struct vfio_device *device = NULL;

> +	unsigned int groupid;

> +

> +	if (iommu_device_group(dev, &groupid))

> +		return;

> +

> +	mutex_lock(&vfio.lock);

> +

> +	list_for_each(pos, &vfio.group_list) {

> +		group = list_entry(pos, struct vfio_group, group_next);

> +		if (group->groupid == groupid)

> +			break;

> +		group = NULL;

> +	}

> +

> +	if (!group)

> +		goto out;

> +

> +	list_for_each(pos, &group->device_list) {

> +		device = list_entry(pos, struct vfio_device, device_next);

> +		if (device->dev == dev)

> +			break;

> +		device = NULL;

> +	}

> +

> +	if (!device)

> +		goto out;

> +

> +	BUG_ON(device->refcnt);

> +

> +	if (device->attached)

> +		__vfio_iommu_detach_dev(group->iommu, device);

> +

> +	list_del(&device->device_next);

> +	kfree(device);

> +

> +	/* If this was the only device in the group, remove the group.

> +	 * Note that we intentionally unmerge empty groups here if the

> +	 * group fd isn't opened. */

> +	if (list_empty(&group->device_list) && group->refcnt == 0) {

> +		struct vfio_iommu *iommu = group->iommu;

> +

> +		if (iommu) {

> +			__vfio_group_set_iommu(group, NULL);

> +			__vfio_try_dissolve_iommu(iommu);

> +		}

> +

> +		device_destroy(vfio.class, group->devt);

> +		idr_remove(&vfio.idr, MINOR(group->devt));

> +		list_del(&group->group_next);

> +		kfree(group);

> +	}

> +out:

> +	mutex_unlock(&vfio.lock);

> +}

> +EXPORT_SYMBOL_GPL(vfio_group_del_dev);

> +

> +/* When a device is bound to a vfio device driver (ex. vfio-pci), this

> + * entry point is used to mark the device usable (viable).  The vfio

> + * device driver associates a private device_data struct with the

> device

> + * here, which will later be return for vfio_device_fops callbacks. */

> +int vfio_bind_dev(struct device *dev, void *device_data)

> +{

> +	struct vfio_device *device;

> +	int ret = -EINVAL;

> +

> +	BUG_ON(!device_data);

> +

> +	mutex_lock(&vfio.lock);

> +

> +	device = __vfio_lookup_dev(dev);

> +

> +	BUG_ON(!device);

> +

> +	ret = dev_set_drvdata(dev, device);

> +	if (!ret)

> +		device->device_data = device_data;

> +

> +	mutex_unlock(&vfio.lock);

> +	return ret;

> +}

> +EXPORT_SYMBOL_GPL(vfio_bind_dev);

> +

> +/* A device is only removeable if the iommu for the group is not in

> use. */

> +static bool vfio_device_removeable(struct vfio_device *device)

> +{

> +	bool ret = true;

> +

> +	mutex_lock(&vfio.lock);

> +

> +	if (device->iommu && __vfio_iommu_inuse(device->iommu))

> +		ret = false;

> +

> +	mutex_unlock(&vfio.lock);

> +	return ret;

> +}

> +

> +/* Notify vfio that a device is being unbound from the vfio device

> driver

> + * and return the device private device_data pointer.  If the group is

> + * in use, we need to block or take other measures to make it safe for

> + * the device to be removed from the iommu. */

> +void *vfio_unbind_dev(struct device *dev)

> +{

> +	struct vfio_device *device = dev_get_drvdata(dev);

> +	void *device_data;

> +

> +	BUG_ON(!device);

> +

> +again:

> +	if (!vfio_device_removeable(device)) {

> +		/* XXX signal for all devices in group to be removed or

> +		 * resort to killing the process holding the device fds.

> +		 * For now just block waiting for releases to wake us. */

> +		wait_event(vfio.release_q, vfio_device_removeable(device));


Any new idea/proposal on how to handle this situation?
The last one I remember was to leave the soft/hard/etc timeout handling in
userspace and implement it as a sort of policy. Is that one still the most
likely candidate solution?

> +	}

> +

> +	mutex_lock(&vfio.lock);

> +

> +	/* Need to re-check that the device is still removeable under

> lock. */

> +	if (device->iommu && __vfio_iommu_inuse(device->iommu)) {

> +		mutex_unlock(&vfio.lock);

> +		goto again;

> +	}

> +

> +	device_data = device->device_data;

> +

> +	device->device_data = NULL;

> +	dev_set_drvdata(dev, NULL);

> +

> +	mutex_unlock(&vfio.lock);

> +	return device_data;

> +}

> +EXPORT_SYMBOL_GPL(vfio_unbind_dev);

> +

> +/*

> + * Module/class support

> + */

> +static void vfio_class_release(struct kref *kref)

> +{

> +	class_destroy(vfio.class);

> +	vfio.class = NULL;

> +}

> +

> +static char *vfio_devnode(struct device *dev, mode_t *mode)

> +{

> +	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));

> +}

> +

> +static int __init vfio_init(void)

> +{

> +	int ret;

> +

> +	idr_init(&vfio.idr);

> +	mutex_init(&vfio.lock);

> +	INIT_LIST_HEAD(&vfio.group_list);

> +	init_waitqueue_head(&vfio.release_q);

> +

> +	kref_init(&vfio.kref);

> +	vfio.class = class_create(THIS_MODULE, "vfio");

> +	if (IS_ERR(vfio.class)) {

> +		ret = PTR_ERR(vfio.class);

> +		goto err_class;

> +	}

> +

> +	vfio.class->devnode = vfio_devnode;

> +

> +	/* FIXME - how many minors to allocate... all of them! */

> +	ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");

> +	if (ret)

> +		goto err_chrdev;

> +

> +	cdev_init(&vfio.cdev, &vfio_group_fops);

> +	ret = cdev_add(&vfio.cdev, vfio.devt, MINORMASK);

> +	if (ret)

> +		goto err_cdev;

> +

> +	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

> +

> +	return 0;

> +

> +err_cdev:

> +	unregister_chrdev_region(vfio.devt, MINORMASK);

> +err_chrdev:

> +	kref_put(&vfio.kref, vfio_class_release);

> +err_class:

> +	return ret;

> +}

> +

> +static void __exit vfio_cleanup(void)

> +{

> +	struct list_head *gpos, *gppos;

> +

> +	list_for_each_safe(gpos, gppos, &vfio.group_list) {

> +		struct vfio_group *group;

> +		struct list_head *dpos, *dppos;

> +

> +		group = list_entry(gpos, struct vfio_group, group_next);

> +

> +		list_for_each_safe(dpos, dppos, &group->device_list) {

> +			struct vfio_device *device;

> +

> +			device = list_entry(dpos,

> +					    struct vfio_device, device_next);

> +			vfio_group_del_dev(device->dev);

> +		}

> +	}

> +

> +	idr_destroy(&vfio.idr);

> +	cdev_del(&vfio.cdev);

> +	unregister_chrdev_region(vfio.devt, MINORMASK);

> +	kref_put(&vfio.kref, vfio_class_release);

> +}

> +

> +module_init(vfio_init);

> +module_exit(vfio_cleanup);

> +

> +MODULE_VERSION(DRIVER_VERSION);

> +MODULE_LICENSE("GPL v2");

> +MODULE_AUTHOR(DRIVER_AUTHOR);

> +MODULE_DESCRIPTION(DRIVER_DESC);

> diff --git a/drivers/vfio/vfio_private.h b/drivers/vfio/vfio_private.h

> new file mode 100644

> index 0000000..350ad67

> --- /dev/null

> +++ b/drivers/vfio/vfio_private.h

> @@ -0,0 +1,34 @@

> +/*

> + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.

> + *     Author: Alex Williamson <alex.williamson@redhat.com>

> + *

> + * This program is free software; you can redistribute it and/or

> modify

> + * it under the terms of the GNU General Public License version 2 as

> + * published by the Free Software Foundation.

> + *

> + * Derived from original vfio:

> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.

> + * Author: Tom Lyon, pugs@cisco.com

> + */

> +

> +#include <linux/list.h>

> +#include <linux/mutex.h>

> +

> +#ifndef VFIO_PRIVATE_H

> +#define VFIO_PRIVATE_H

> +

> +struct vfio_iommu {

> +	struct iommu_domain		*domain;

> +	struct bus_type			*bus;

> +	struct mutex			dgate;

> +	struct list_head		dm_list;

> +	struct mm_struct		*mm;

> +	struct list_head		group_list;

> +	int				refcnt;

> +	bool				cache;

> +};

> +

> +extern int vfio_release_iommu(struct vfio_iommu *iommu);

> +extern void vfio_iommu_unmapall(struct vfio_iommu *iommu);

> +

> +#endif /* VFIO_PRIVATE_H */

> diff --git a/include/linux/vfio.h b/include/linux/vfio.h

> new file mode 100644

> index 0000000..4269b08

> --- /dev/null

> +++ b/include/linux/vfio.h

> @@ -0,0 +1,155 @@

> +/*

> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.

> + * Author: Tom Lyon, pugs@cisco.com

> + *

> + * This program is free software; you may redistribute it and/or

> modify

> + * it under the terms of the GNU General Public License as published

> by

> + * the Free Software Foundation; version 2 of the License.

> + *

> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,

> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF

> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND

> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS

> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN

> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN

> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

> + * SOFTWARE.

> + *

> + * Portions derived from drivers/uio/uio.c:

> + * Copyright(C) 2005, Benedikt Spranger <b.spranger@linutronix.de>

> + * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>

> + * Copyright(C) 2006, Hans J. Koch <hjk@linutronix.de>

> + * Copyright(C) 2006, Greg Kroah-Hartman <greg@kroah.com>

> + *

> + * Portions derived from drivers/uio/uio_pci_generic.c:

> + * Copyright (C) 2009 Red Hat, Inc.

> + * Author: Michael S. Tsirkin <mst@redhat.com>

> + */

> +#include <linux/types.h>

> +

> +#ifndef VFIO_H

> +#define VFIO_H

> +

> +#ifdef __KERNEL__

> +

> +struct vfio_device_ops {

> +	bool			(*match)(struct device *, char *);

> +	int			(*get)(void *);

> +	void			(*put)(void *);

> +	ssize_t			(*read)(void *, char __user *,

> +					size_t, loff_t *);

> +	ssize_t			(*write)(void *, const char __user *,

> +					 size_t, loff_t *);

> +	long			(*ioctl)(void *, unsigned int, unsigned long);

> +	int			(*mmap)(void *, struct vm_area_struct *);

> +};

> +

> +extern int vfio_group_add_dev(struct device *device,

> +			      const struct vfio_device_ops *ops);

> +extern void vfio_group_del_dev(struct device *device);

> +extern int vfio_bind_dev(struct device *device, void *device_data);

> +extern void *vfio_unbind_dev(struct device *device);

> +

> +#endif /* __KERNEL__ */

> +

> +/*

> + * VFIO driver - allow mapping and use of certain devices

> + * in unprivileged user processes. (If IOMMU is present)

> + * Especially useful for Virtual Function parts of SR-IOV devices

> + */

> +

> +

> +/* Kernel & User level defines for ioctls */

> +

> +#define VFIO_GROUP_GET_FLAGS		_IOR(';', 100, __u64)

> + #define VFIO_GROUP_FLAGS_VIABLE	(1 << 0)

> + #define VFIO_GROUP_FLAGS_MM_LOCKED	(1 << 1)

> +#define VFIO_GROUP_MERGE		_IOW(';', 101, int)

> +#define VFIO_GROUP_UNMERGE		_IOW(';', 102, int)

> +#define VFIO_GROUP_GET_IOMMU_FD		_IO(';', 103)

> +#define VFIO_GROUP_GET_DEVICE_FD	_IOW(';', 104, char *)

> +

> +/*

> + * Structure for DMA mapping of user buffers

> + * vaddr, dmaaddr, and size must all be page aligned

> + */

> +struct vfio_dma_map {

> +	__u64	len;		/* length of structure */

> +	__u64	vaddr;		/* process virtual addr */

> +	__u64	dmaaddr;	/* desired and/or returned dma address */

> +	__u64	size;		/* size in bytes */

> +	__u64	flags;

> +#define	VFIO_DMA_MAP_FLAG_WRITE		(1 << 0) /* req writeable DMA mem */

> +};

> +

> +#define	VFIO_IOMMU_GET_FLAGS		_IOR(';', 105, __u64)

> + /* Does the IOMMU support mapping any IOVA to any virtual address? */

> + #define VFIO_IOMMU_FLAGS_MAP_ANY	(1 << 0)

> +#define	VFIO_IOMMU_MAP_DMA		_IOWR(';', 106, struct vfio_dma_map)

> +#define	VFIO_IOMMU_UNMAP_DMA		_IOWR(';', 107, struct vfio_dma_map)

> +

> +#define VFIO_DEVICE_GET_FLAGS		_IOR(';', 108, __u64)

> + #define VFIO_DEVICE_FLAGS_PCI		(1 << 0)

> + #define VFIO_DEVICE_FLAGS_DT		(1 << 1)

> + #define VFIO_DEVICE_FLAGS_RESET	(1 << 2)

> +#define VFIO_DEVICE_GET_NUM_REGIONS	_IOR(';', 109, int)

> +

> +struct vfio_region_info {

> +	__u32	len;		/* length of structure */

> +	__u32	index;		/* region number */

> +	__u64	size;		/* size in bytes of region */

> +	__u64	offset;		/* start offset of region */

> +	__u64	flags;

> +#define VFIO_REGION_INFO_FLAG_MMAP		(1 << 0)

> +#define VFIO_REGION_INFO_FLAG_RO		(1 << 1)

> +#define VFIO_REGION_INFO_FLAG_PHYS_VALID	(1 << 2)

> +	__u64	phys;		/* physical address of region */

> +};

> +

> +#define VFIO_DEVICE_GET_REGION_INFO	_IOWR(';', 110, struct vfio_region_info)

> +

> +#define VFIO_DEVICE_GET_NUM_IRQS	_IOR(';', 111, int)

> +

> +struct vfio_irq_info {

> +	__u32	len;		/* length of structure */

> +	__u32	index;		/* IRQ number */

> +	__u32	count;		/* number of individual IRQs */

> +	__u32	flags;

> +#define VFIO_IRQ_INFO_FLAG_LEVEL		(1 << 0)

> +};

> +

> +#define VFIO_DEVICE_GET_IRQ_INFO	_IOWR(';', 112, struct vfio_irq_info)

> +

> +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */

> +#define VFIO_DEVICE_SET_IRQ_EVENTFDS	_IOW(';', 113, int)

> +

> +/* Unmask IRQ index, arg[0] = index */

> +#define VFIO_DEVICE_UNMASK_IRQ		_IOW(';', 114, int)

> +

> +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */

> +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD	_IOW(';', 115, int)

> +

> +#define VFIO_DEVICE_RESET		_IO(';', 116)

> +

> +struct vfio_dtpath {

> +	__u32	len;		/* length of structure */

> +	__u32	index;

> +	__u64	flags;

> +#define VFIO_DTPATH_FLAGS_REGION	(1 << 0)

> +#define VFIO_DTPATH_FLAGS_IRQ		(1 << 1)

> +	char	*path;

> +};

> +#define VFIO_DEVICE_GET_DTPATH		_IOWR(';', 117, struct vfio_dtpath)

> +

> +struct vfio_dtindex {

> +	__u32	len;		/* length of structure */

> +	__u32	index;

> +	__u32	prop_type;

> +	__u32	prop_index;

> +	__u64	flags;

> +#define VFIO_DTINDEX_FLAGS_REGION	(1 << 0)

> +#define VFIO_DTINDEX_FLAGS_IRQ		(1 << 1)

> +};

> +#define VFIO_DEVICE_GET_DTINDEX		_IOWR(';', 118, struct vfio_dtindex)

> +

> +#endif /* VFIO_H */


/Chris
Alex Williamson Nov. 9, 2011, 6:02 p.m. UTC | #4
On Wed, 2011-11-09 at 02:11 -0600, Christian Benvenuti (benve) wrote:
> I have not gone through the whole patch yet, but here are
> my first comments/questions about the code in vfio_main.c
> (and pci/vfio_pci.c).

Thanks!  Comments inline...

> > -----Original Message-----
> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Thursday, November 03, 2011 1:12 PM
> > To: chrisw@sous-sol.org; aik@au1.ibm.com; pmac@au1.ibm.com;
> > dwg@au1.ibm.com; joerg.roedel@amd.com; agraf@suse.de; Christian
> > Benvenuti (benve); Aaron Fabbri (aafabbri); B08248@freescale.com;
> > B07421@freescale.com; avi@redhat.com; konrad.wilk@oracle.com;
> > kvm@vger.kernel.org; qemu-devel@nongnu.org; iommu@lists.linux-
> > foundation.org; linux-pci@vger.kernel.org
> > Subject: [RFC PATCH] vfio: VFIO Driver core framework
> 
> <snip>
> 
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > new file mode 100644
> > index 0000000..6169356
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -0,0 +1,1151 @@
> > +/*
> > + * VFIO framework
> > + *
> > + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> > + *     Author: Alex Williamson <alex.williamson@redhat.com>
> > + *
> > + * This program is free software; you can redistribute it and/or
> > modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + *
> > + * Derived from original vfio:
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs@cisco.com
> > + */
> > +
> > +#include <linux/cdev.h>
> > +#include <linux/compat.h>
> > +#include <linux/device.h>
> > +#include <linux/file.h>
> > +#include <linux/anon_inodes.h>
> > +#include <linux/fs.h>
> > +#include <linux/idr.h>
> > +#include <linux/iommu.h>
> > +#include <linux/mm.h>
> > +#include <linux/module.h>
> > +#include <linux/slab.h>
> > +#include <linux/string.h>
> > +#include <linux/uaccess.h>
> > +#include <linux/vfio.h>
> > +#include <linux/wait.h>
> > +
> > +#include "vfio_private.h"
> > +
> > +#define DRIVER_VERSION	"0.2"
> > +#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
> > +#define DRIVER_DESC	"VFIO - User Level meta-driver"
> > +
> > +static int allow_unsafe_intrs;
> > +module_param(allow_unsafe_intrs, int, 0);
> > +MODULE_PARM_DESC(allow_unsafe_intrs,
> > +        "Allow use of IOMMUs which do not support interrupt
> > remapping");
> > +
> > +static struct vfio {
> > +	dev_t			devt;
> > +	struct cdev		cdev;
> > +	struct list_head	group_list;
> > +	struct mutex		lock;
> > +	struct kref		kref;
> > +	struct class		*class;
> > +	struct idr		idr;
> > +	wait_queue_head_t	release_q;
> > +} vfio;
> > +
> > +static const struct file_operations vfio_group_fops;
> > +extern const struct file_operations vfio_iommu_fops;
> > +
> > +struct vfio_group {
> > +	dev_t			devt;
> > +	unsigned int		groupid;
> 
> This groupid is returned by the device_group callback you recently added
> with a separate (not yet in tree) IOMMU patch.
> Is it correct to say that the scope of this ID is the bus the iommu
> belongs to (but you use it as if it was global)?
> I believe there is nothing right now to ensure the uniqueness of such
> ID across bus types (assuming there will be other bus drivers in the
> future besides vfio-pci).
> If that's the case, the vfio.group_list global list and the __vfio_lookup_dev
> routine should be changed to account for the bus too?
> Oops, I just saw the error msg in vfio_group_add_dev about the group id conflict.
> Is that warning related to what I mentioned above?

Yeah, this is a concern, but I can't think of a system where we would
manifest a collision.  The IOMMU driver is expected to provide unique
groupids for all devices below them, but we could imagine a system that
implements two different bus_types, each with a different IOMMU driver
and we have no coordination between them.  Perhaps since we have
iommu_ops per bus, we should also expose the bus in the vfio group path,
ie. /dev/vfio/%s/%u, dev->bus->name, iommu_device_group(dev,..).  This
means userspace would need to do a readlink of the subsystem entry where
it finds the iommu_group to find the vfio group.  Reasonable?
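
To make that concrete, here's a rough userspace sketch of the lookup under
that proposal (the /dev/vfio/<bus>/<group> layout and the iommu_group sysfs
attribute are assumptions from the discussion above, not part of this patch):

/* Hypothetical: find the bus and IOMMU group of a PCI device, then open
 * the corresponding vfio group node under the per-bus directory. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include <fcntl.h>

static int open_vfio_group(const char *name)	/* e.g. "0000:06:0d.0" */
{
	char path[PATH_MAX], buf[PATH_MAX], *bus;
	unsigned int group;
	ssize_t len;
	FILE *f;

	/* readlink of the subsystem entry tells us which bus we're on */
	snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem", name);
	len = readlink(path, buf, sizeof(buf) - 1);
	if (len < 0)
		return -1;
	buf[len] = '\0';
	bus = strrchr(buf, '/');
	if (!bus)
		return -1;
	bus++;					/* "pci", for example */

	/* assumed sysfs attribute exposing iommu_device_group() */
	snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/iommu_group", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%u", &group) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);

	snprintf(path, sizeof(path), "/dev/vfio/%s/%u", bus, group);
	return open(path, O_RDWR);
}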

> > +	struct bus_type		*bus;
> > +	struct vfio_iommu	*iommu;
> > +	struct list_head	device_list;
> > +	struct list_head	iommu_next;
> > +	struct list_head	group_next;
> > +	int			refcnt;
> > +};
> > +
> > +struct vfio_device {
> > +	struct device			*dev;
> > +	const struct vfio_device_ops	*ops;
> > +	struct vfio_iommu		*iommu;
> 
> I wonder if you need to have the 'iommu' field here.
> vfio_device.iommu is always set and reset together with
> vfio_group.iommu.
> Given that a vfio_device instance is always linked to a vfio_group
> instance, do we need this duplication? Is this duplication there
> because you do not want the double dereference device->group->iommu?

I think that was my initial goal in duplicating the pointer on the
device.  I believe I was also at one point passing a vfio_device around
and needed the pointer.  We seem to be getting along fine w/o that and I
don't see any performance sensitive paths for getting from the device
to iommu, so I'll see about removing it.

> > +	struct vfio_group		*group;
> > +	struct list_head		device_next;
> > +	bool				attached;
> > +	int				refcnt;
> > +	void				*device_data;
> > +};
> > +
> > +/*
> > + * Helper functions called under vfio.lock
> > + */
> > +
> > +/* Return true if any devices within a group are opened */
> > +static bool __vfio_group_devs_inuse(struct vfio_group *group)
> > +{
> > +	struct list_head *pos;
> > +
> > +	list_for_each(pos, &group->device_list) {
> > +		struct vfio_device *device;
> > +
> > +		device = list_entry(pos, struct vfio_device, device_next);
> > +		if (device->refcnt)
> > +			return true;
> > +	}
> > +	return false;
> > +}
> > +
> > +/* Return true if any of the groups attached to an iommu are opened.
> > + * We can only tear apart merged groups when nothing is left open. */
> > +static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *pos;
> > +
> > +	list_for_each(pos, &iommu->group_list) {
> > +		struct vfio_group *group;
> > +
> > +		group = list_entry(pos, struct vfio_group, iommu_next);
> > +		if (group->refcnt)
> > +			return true;
> > +	}
> > +	return false;
> > +}
> > +
> > +/* An iommu is "in use" if it has a file descriptor open or if any of
> > + * the groups assigned to the iommu have devices open. */
> > +static bool __vfio_iommu_inuse(struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *pos;
> > +
> > +	if (iommu->refcnt)
> > +		return true;
> > +
> > +	list_for_each(pos, &iommu->group_list) {
> > +		struct vfio_group *group;
> > +
> > +		group = list_entry(pos, struct vfio_group, iommu_next);
> > +
> > +		if (__vfio_group_devs_inuse(group))
> > +			return true;
> > +	}
> > +	return false;
> > +}
> 
> I looked at how you take care of ref counts ...
> 
> This is how the tree of vfio_iommu/vfio_group/vfio_device data
> Structures is organized (I'll use just iommu/group/dev to make
> the graph smaller):
> 
>             iommu
>            /     \
>           /       \ 
>     group   ...     group
>     /  \           /  \   
>    /    \         /    \
> dev  ..  dev   dev  ..  dev
> 
> This is how you get a file descriptor for the three kind of objects:
> 
> - group : open /dev/vfio/xxx for group xxx
> - iommu : group ioctl VFIO_GROUP_GET_IOMMU_FD
> - device: group ioctl VFIO_GROUP_GET_DEVICE_FD
> 
> Given the above topology, I would assume that:
> 
> (1) an iommu is 'inuse' if : a) iommu refcnt > 0, or
>                              b) any of its groups is 'inuse'
> 
> (2) a  group is 'inuse' if : a) group refcnt > 0, or
>                              b) any of its devices is 'inuse'
> 
> (3) a device is 'inuse' if : a) device refcnt > 0

(2) is a bit debatable.  I've wrestled with this one for a while.  The
vfio_iommu serves two purposes.  First, it is the object we use for
managing iommu domains, which includes allocating domains and attaching
devices to domains.  Group objects aren't involved here; they just
manage the set of devices.  The second role is to manage merged groups,
because whether or not groups can be merged is a function of iommu
domain compatibility.

So if we look at "is the iommu in use?" ie. can I destroy the mapping
context, detach devices and free the domain, the reference count on the
group is irrelevant.  The user has to have a device or iommu file
descriptor opened somewhere, across the group or merged group, for that
context to be maintained.  A reasonable requirement, I think.

However, if we ask "is the group in use?" ie. can I not only destroy the
mappings above, but also automatically tear apart merged groups, then I
think we need to look at the group refcnt.

There's also a symmetry factor: the group is a benign entry point to
device access.  It's only when device or iommu access is granted that
the group gains any real power.  Therefore, shouldn't that power also be
removed when those access points are closed?

> You have coded the 'inuse' logic with these three routines:
> 
>     __vfio_iommu_inuse, which implements (1) above
> 
> and
>     __vfio_iommu_groups_inuse

Implements (2.a)

>     __vfio_group_devs_inuse

Implements (2.b)

> which are used by __vfio_iommu_inuse.
> Why don't you check the group refcnt in __vfio_iommu_groups_inuse?

Hopefully explained above, but open for discussion.

> Would it make sense (and the code more readable) to structure the
> nested refcnt/inuse check like this?
> (The numbers (1)(2)(3) refer to the three 'inuse' conditions above)
> 
>    (1)__vfio_iommu_inuse
>    |
>    +-> check iommu refcnt
>    +-> __vfio_iommu_groups_inuse
>        |
>        +->LOOP: (2)__vfio_iommu_group_inuse<--MISSING
>                 |
>                 +-> check group refcnt<--MISSING
>                 +-> __vfio_group_devs_inuse()
>                     |
>                     +-> LOOP: (3)__vfio_group_dev_inuse<--MISSING
>                               |
>                               +-> check device refcnt

We currently do:

   (1)__vfio_iommu_inuse
    |
    +-> check iommu refcnt
    +-> __vfio_group_devs_inuse
        |
        +->LOOP: (2.b)__vfio_group_devs_inuse
                  |
                  +-> LOOP: (3) check device refcnt

If that passes, the iommu context can be dissolved and we follow up
with:

    __vfio_iommu_groups_inuse
    |
    +-> LOOP: (2.a)__vfio_iommu_groups_inuse
               |
               +-> check group refcnt

If that passes, groups can also be umerged.

Is this right?

> > +static void __vfio_group_set_iommu(struct vfio_group *group,
> > +				   struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *pos;
> > +
> > +	if (group->iommu)
> > +		list_del(&group->iommu_next);
> > +	if (iommu)
> > +		list_add(&group->iommu_next, &iommu->group_list);
> > +
> > +	group->iommu = iommu;
> 
> If you remove the vfio_device.iommu field (as suggested above in a previous
> Comment), the block below would not be needed anymore.

Yep, I'll try removing that and see how it plays out.

> > +	list_for_each(pos, &group->device_list) {
> > +		struct vfio_device *device;
> > +
> > +		device = list_entry(pos, struct vfio_device, device_next);
> > +		device->iommu = iommu;
> > +	}
> > +}
> > +
> > +static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,
> > +				    struct vfio_device *device)
> > +{
> > +	BUG_ON(!iommu->domain && device->attached);
> > +
> > +	if (!iommu->domain || !device->attached)
> > +		return;
> > +
> > +	iommu_detach_device(iommu->domain, device->dev);
> > +	device->attached = false;
> > +}
> > +
> > +static void __vfio_iommu_detach_group(struct vfio_iommu *iommu,
> > +				      struct vfio_group *group)
> > +{
> > +	struct list_head *pos;
> > +
> > +	list_for_each(pos, &group->device_list) {
> > +		struct vfio_device *device;
> > +
> > +		device = list_entry(pos, struct vfio_device, device_next);
> > +		__vfio_iommu_detach_dev(iommu, device);
> > +	}
> > +}
> > +
> > +static int __vfio_iommu_attach_dev(struct vfio_iommu *iommu,
> > +				   struct vfio_device *device)
> > +{
> > +	int ret;
> > +
> > +	BUG_ON(device->attached);
> > +
> > +	if (!iommu || !iommu->domain)
> > +		return -EINVAL;
> > +
> > +	ret = iommu_attach_device(iommu->domain, device->dev);
> > +	if (!ret)
> > +		device->attached = true;
> > +
> > +	return ret;
> > +}
> > +
> > +static int __vfio_iommu_attach_group(struct vfio_iommu *iommu,
> > +				     struct vfio_group *group)
> > +{
> > +	struct list_head *pos;
> > +
> > +	list_for_each(pos, &group->device_list) {
> > +		struct vfio_device *device;
> > +		int ret;
> > +
> > +		device = list_entry(pos, struct vfio_device, device_next);
> > +		ret = __vfio_iommu_attach_dev(iommu, device);
> > +		if (ret) {
> > +			__vfio_iommu_detach_group(iommu, group);
> > +			return ret;
> > +		}
> > +	}
> > +	return 0;
> > +}
> > +
> > +/* The iommu is viable, ie. ready to be configured, when all the devices
> > + * for all the groups attached to the iommu are bound to their vfio device
> > + * drivers (ex. vfio-pci).  This sets the device_data private data pointer. */
> > +static bool __vfio_iommu_viable(struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *gpos, *dpos;
> > +
> > +	list_for_each(gpos, &iommu->group_list) {
> > +		struct vfio_group *group;
> > +		group = list_entry(gpos, struct vfio_group, iommu_next);
> > +
> > +		list_for_each(dpos, &group->device_list) {
> > +			struct vfio_device *device;
> > +			device = list_entry(dpos,
> > +					    struct vfio_device, device_next);
> > +
> > +			if (!device->device_data)
> > +				return false;
> > +		}
> > +	}
> > +	return true;
> > +}
> > +
> > +static void __vfio_close_iommu(struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *pos;
> > +
> > +	if (!iommu->domain)
> > +		return;
> > +
> > +	list_for_each(pos, &iommu->group_list) {
> > +		struct vfio_group *group;
> > +		group = list_entry(pos, struct vfio_group, iommu_next);
> > +
> > +		__vfio_iommu_detach_group(iommu, group);
> > +	}
> > +
> > +	vfio_iommu_unmapall(iommu);
> > +
> > +	iommu_domain_free(iommu->domain);
> > +	iommu->domain = NULL;
> > +	iommu->mm = NULL;
> > +}
> > +
> > +/* Open the IOMMU.  This gates all access to the iommu or device file
> > + * descriptors and sets current->mm as the exclusive user. */
> 
> Given the fn  vfio_group_open (ie, 1st object, 2nd operation), I would have
> called this one __vfio_iommu_open (instead of __vfio_open_iommu).
> Is it named __vfio_open_iommu to avoid a conflict with the namespace in vfio_iommu.c?      

I would have expected that too, I'll look at renaming these.

> > +static int __vfio_open_iommu(struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *pos;
> > +	int ret;
> > +
> > +	if (!__vfio_iommu_viable(iommu))
> > +		return -EBUSY;
> > +
> > +	if (iommu->domain)
> > +		return -EINVAL;
> > +
> > +	iommu->domain = iommu_domain_alloc(iommu->bus);
> > +	if (!iommu->domain)
> > +		return -EFAULT;
> > +
> > +	list_for_each(pos, &iommu->group_list) {
> > +		struct vfio_group *group;
> > +		group = list_entry(pos, struct vfio_group, iommu_next);
> > +
> > +		ret = __vfio_iommu_attach_group(iommu, group);
> > +		if (ret) {
> > +			__vfio_close_iommu(iommu);
> > +			return ret;
> > +		}
> > +	}
> > +
> > +	if (!allow_unsafe_intrs &&
> > +	    !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
> > +		__vfio_close_iommu(iommu);
> > +		return -EFAULT;
> > +	}
> > +
> > +	iommu->cache = (iommu_domain_has_cap(iommu->domain,
> > +					     IOMMU_CAP_CACHE_COHERENCY) != 0);
> > +	iommu->mm = current->mm;
> > +
> > +	return 0;
> > +}
> > +
> > +/* Actively try to tear down the iommu and merged groups.  If there are no
> > + * open iommu or device fds, we close the iommu.  If we close the iommu and
> > + * there are also no open group fds, we can further dissolve the group to
> > + * iommu association and free the iommu data structure. */
> > +static int __vfio_try_dissolve_iommu(struct vfio_iommu *iommu)
> > +{
> > +
> > +	if (__vfio_iommu_inuse(iommu))
> > +		return -EBUSY;
> > +
> > +	__vfio_close_iommu(iommu);
> > +
> > +	if (!__vfio_iommu_groups_inuse(iommu)) {
> > +		struct list_head *pos, *ppos;
> > +
> > +		list_for_each_safe(pos, ppos, &iommu->group_list) {
> > +			struct vfio_group *group;
> > +
> > +			group = list_entry(pos, struct vfio_group,
> > iommu_next);
> > +			__vfio_group_set_iommu(group, NULL);
> > +		}
> > +
> > +
> > +		kfree(iommu);
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static struct vfio_device *__vfio_lookup_dev(struct device *dev)
> > +{
> > +	struct list_head *gpos;
> > +	unsigned int groupid;
> > +
> > +	if (iommu_device_group(dev, &groupid))
> > +		return NULL;
> > +
> > +	list_for_each(gpos, &vfio.group_list) {
> > +		struct vfio_group *group;
> > +		struct list_head *dpos;
> > +
> > +		group = list_entry(gpos, struct vfio_group, group_next);
> > +
> > +		if (group->groupid != groupid)
> > +			continue;
> > +
> > +		list_for_each(dpos, &group->device_list) {
> > +			struct vfio_device *device;
> > +
> > +			device = list_entry(dpos,
> > +					    struct vfio_device, device_next);
> > +
> > +			if (device->dev == dev)
> > +				return device;
> > +		}
> > +	}
> > +	return NULL;
> > +}
> > +
> > +/* All release paths simply decrement the refcnt, attempt to teardown
> > + * the iommu and merged groups, and wakeup anything that might be
> > + * waiting if we successfully dissolve anything. */
> > +static int vfio_do_release(int *refcnt, struct vfio_iommu *iommu)
> > +{
> > +	bool wake;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	(*refcnt)--;
> > +	wake = (__vfio_try_dissolve_iommu(iommu) == 0);
> > +
> > +	mutex_unlock(&vfio.lock);
> > +
> > +	if (wake)
> > +		wake_up(&vfio.release_q);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Device fops - passthrough to vfio device driver w/ device_data
> > + */
> > +static int vfio_device_release(struct inode *inode, struct file
> > *filep)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	vfio_do_release(&device->refcnt, device->iommu);
> > +
> > +	device->ops->put(device->device_data);
> > +
> > +	return 0;
> > +}
> > +
> > +static long vfio_device_unl_ioctl(struct file *filep,
> > +				  unsigned int cmd, unsigned long arg)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	return device->ops->ioctl(device->device_data, cmd, arg);
> > +}
> > +
> > +static ssize_t vfio_device_read(struct file *filep, char __user *buf,
> > +				size_t count, loff_t *ppos)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	return device->ops->read(device->device_data, buf, count, ppos);
> > +}
> > +
> > +static ssize_t vfio_device_write(struct file *filep, const char __user
> > *buf,
> > +				 size_t count, loff_t *ppos)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	return device->ops->write(device->device_data, buf, count, ppos);
> > +}
> > +
> > +static int vfio_device_mmap(struct file *filep, struct vm_area_struct
> > *vma)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	return device->ops->mmap(device->device_data, vma);
> > +}
> > +
> > +#ifdef CONFIG_COMPAT
> > +static long vfio_device_compat_ioctl(struct file *filep,
> > +				     unsigned int cmd, unsigned long arg)
> > +{
> > +	arg = (unsigned long)compat_ptr(arg);
> > +	return vfio_device_unl_ioctl(filep, cmd, arg);
> > +}
> > +#endif	/* CONFIG_COMPAT */
> > +
> > +const struct file_operations vfio_device_fops = {
> > +	.owner		= THIS_MODULE,
> > +	.release	= vfio_device_release,
> > +	.read		= vfio_device_read,
> > +	.write		= vfio_device_write,
> > +	.unlocked_ioctl	= vfio_device_unl_ioctl,
> > +#ifdef CONFIG_COMPAT
> > +	.compat_ioctl	= vfio_device_compat_ioctl,
> > +#endif
> > +	.mmap		= vfio_device_mmap,
> > +};
> > +
> > +/*
> > + * Group fops
> > + */
> > +static int vfio_group_open(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_group *group;
> > +	int ret = 0;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	group = idr_find(&vfio.idr, iminor(inode));
> > +
> > +	if (!group) {
> > +		ret = -ENODEV;
> > +		goto out;
> > +	}
> > +
> > +	filep->private_data = group;
> > +
> > +	if (!group->iommu) {
> > +		struct vfio_iommu *iommu;
> > +
> > +		iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> > +		if (!iommu) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +		INIT_LIST_HEAD(&iommu->group_list);
> > +		INIT_LIST_HEAD(&iommu->dm_list);
> > +		mutex_init(&iommu->dgate);
> > +		iommu->bus = group->bus;
> > +		__vfio_group_set_iommu(group, iommu);
> > +	}
> > +	group->refcnt++;
> > +
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +
> > +	return ret;
> > +}
> > +
> > +static int vfio_group_release(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_group *group = filep->private_data;
> > +
> > +	return vfio_do_release(&group->refcnt, group->iommu);
> > +}
> > +
> > +/* Attempt to merge the group pointed to by fd into group.  The merge-ee
> > + * group must not have an iommu or any devices open because we cannot
> > + * maintain that context across the merge.  The merge-er group can be
> > + * in use. */
> > +static int vfio_group_merge(struct vfio_group *group, int fd)
> 
> The documentation in vfio.txt explains clearly the logic implemented by
> the merge/unmerge group ioctls.
> However, what you are doing is not merging groups, but rather adding/removing
> groups to/from iommus (and creating flat lists of groups).
> For example, when you do
> 
>   merge(A,B)
> 
> you actually mean to say "merge B to the list of groups assigned to the
> same iommu as group A".

It's actually a little more than that.  After you've merged B into A,
you can close the file descriptor for B and access all of the devices
for the merged group from A.

> For the same reason, you do not really need to provide the group you want
> to unmerge from, which means that instead of
> 
>   unmerge(A,B) 
> 
> you would just need
> 
>   unmerge(B)

Good point, we can avoid the awkward reference via file descriptor for
the unmerge.
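
If we drop the argument, the ioctl presumably collapses to something like
this (sketch only, not what's in this patch):

/* Hypothetical simplified form: the merge-ee unmerges itself */
#define VFIO_GROUP_UNMERGE	_IO(';', 102)

and userspace would just call ioctl(group_b_fd, VFIO_GROUP_UNMERGE) on the
group it wants pulled back out.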

> I understand the reason why it is not a real merge/unmerge (ie, to keep the
> original groups so that you can unmerge later)

Right, we still need to have visibility of the groups comprising the
merged group, but the abstraction provided to the user seems to be
deeper than you're thinking.

>  ... however I just wonder if
> it wouldn't be more natural to implement the VFIO_IOMMU_ADD_GROUP/DEL_GROUP
> iommu ioctls instead? (the relationships between the data structure would
> remain the same)
> I guess you already discarded this option for some reasons, right? What was
> the reason?

It's a possibility; I'm not sure it was discussed or really what
advantage it provides.  It seems like we'd logically lose the ability to
access devices from other groups, whether that's good or bad, I don't
know.  I think the notion of "merge" promotes the idea that the groups
are peers and an iommu_add/del feels a bit more hierarchical.
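
For reference, here's a minimal userspace sketch of the merge model as it
stands in this patch; the group numbers, device name and sizes are made up
for illustration:

/* Merge group 27 into group 26, then reach the shared iommu and one of
 * group 27's devices through group 26's file descriptor. */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int merged_group_example(void)
{
	struct vfio_dma_map map;
	int a, b, iommu, dev;
	void *buf;

	a = open("/dev/vfio/26", O_RDWR);	/* merge-er */
	b = open("/dev/vfio/27", O_RDWR);	/* merge-ee: nothing open yet */
	if (a < 0 || b < 0 || ioctl(a, VFIO_GROUP_MERGE, &b))
		return -1;

	iommu = ioctl(a, VFIO_GROUP_GET_IOMMU_FD);
	dev = ioctl(a, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
	if (iommu < 0 || dev < 0)
		return -1;

	/* vaddr, dmaaddr and size must all be page aligned */
	if (posix_memalign(&buf, 4096, 1024 * 1024))
		return -1;

	memset(&map, 0, sizeof(map));
	map.len = sizeof(map);
	map.vaddr = (__u64)(unsigned long)buf;
	map.dmaaddr = 0;
	map.size = 1024 * 1024;
	map.flags = VFIO_DMA_MAP_FLAG_WRITE;

	return ioctl(iommu, VFIO_IOMMU_MAP_DMA, &map);
}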

> > +{
> > +	struct vfio_group *new;
> > +	struct vfio_iommu *old_iommu;
> > +	struct file *file;
> > +	int ret = 0;
> > +	bool opened = false;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	file = fget(fd);
> > +	if (!file) {
> > +		ret = -EBADF;
> > +		goto out_noput;
> > +	}
> > +
> > +	/* Sanity check, is this really our fd? */
> > +	if (file->f_op != &vfio_group_fops) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	new = file->private_data;
> > +
> > +	if (!new || new == group || !new->iommu ||
> > +	    new->iommu->domain || new->bus != group->bus) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* We need to attach all the devices to each domain separately
> > +	 * in order to validate that the capabilities match for both.  */
> > +	ret = __vfio_open_iommu(new->iommu);
> > +	if (ret)
> > +		goto out;
> > +
> > +	if (!group->iommu->domain) {
> > +		ret = __vfio_open_iommu(group->iommu);
> > +		if (ret)
> > +			goto out;
> > +		opened = true;
> > +	}
> > +
> > +	/* If cache coherency doesn't match we'd potentially need to
> > +	 * remap existing iommu mappings in the merge-er domain.
> > +	 * Poor return to bother trying to allow this currently. */
> > +	if (iommu_domain_has_cap(group->iommu->domain,
> > +				 IOMMU_CAP_CACHE_COHERENCY) !=
> > +	    iommu_domain_has_cap(new->iommu->domain,
> > +				 IOMMU_CAP_CACHE_COHERENCY)) {
> > +		__vfio_close_iommu(new->iommu);
> > +		if (opened)
> > +			__vfio_close_iommu(group->iommu);
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* Close the iommu for the merge-ee and attach all its devices
> > +	 * to the merge-er iommu. */
> > +	__vfio_close_iommu(new->iommu);
> > +
> > +	ret = __vfio_iommu_attach_group(group->iommu, new);
> > +	if (ret)
> > +		goto out;
> > +
> > +	/* set_iommu unlinks new from the iommu, so save a pointer to it */
> > +	old_iommu = new->iommu;
> > +	__vfio_group_set_iommu(new, group->iommu);
> > +	kfree(old_iommu);
> > +
> > +out:
> > +	fput(file);
> > +out_noput:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Unmerge the group pointed to by fd from group. */
> > +static int vfio_group_unmerge(struct vfio_group *group, int fd)
> > +{
> > +	struct vfio_group *new;
> > +	struct vfio_iommu *new_iommu;
> > +	struct file *file;
> > +	int ret = 0;
> > +
> > +	/* Since the merge-out group is already opened, it needs to
> > +	 * have an iommu struct associated with it. */
> > +	new_iommu = kzalloc(sizeof(*new_iommu), GFP_KERNEL);
> > +	if (!new_iommu)
> > +		return -ENOMEM;
> > +
> > +	INIT_LIST_HEAD(&new_iommu->group_list);
> > +	INIT_LIST_HEAD(&new_iommu->dm_list);
> > +	mutex_init(&new_iommu->dgate);
> > +	new_iommu->bus = group->bus;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	file = fget(fd);
> > +	if (!file) {
> > +		ret = -EBADF;
> > +		goto out_noput;
> > +	}
> > +
> > +	/* Sanity check, is this really our fd? */
> > +	if (file->f_op != &vfio_group_fops) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	new = file->private_data;
> > +	if (!new || new == group || new->iommu != group->iommu) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* We can't merge-out a group with devices still in use. */
> > +	if (__vfio_group_devs_inuse(new)) {
> > +		ret = -EBUSY;
> > +		goto out;
> > +	}
> > +
> > +	__vfio_iommu_detach_group(group->iommu, new);
> > +	__vfio_group_set_iommu(new, new_iommu);
> > +
> > +out:
> > +	fput(file);
> > +out_noput:
> > +	if (ret)
> > +		kfree(new_iommu);
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Get a new iommu file descriptor.  This will open the iommu, setting
> > + * the current->mm ownership if it's not already set. */
> > +static int vfio_group_get_iommu_fd(struct vfio_group *group)
> > +{
> > +	int ret = 0;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	if (!group->iommu->domain) {
> > +		ret = __vfio_open_iommu(group->iommu);
> > +		if (ret)
> > +			goto out;
> > +	}
> > +
> > +	ret = anon_inode_getfd("[vfio-iommu]", &vfio_iommu_fops,
> > +			       group->iommu, O_RDWR);
> > +	if (ret < 0)
> > +		goto out;
> > +
> > +	group->iommu->refcnt++;
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Get a new device file descriptor.  This will open the iommu, setting
> > + * the current->mm ownership if it's not already set.  It's difficult to
> > + * specify the requirements for matching a user supplied buffer to a
> > + * device, so we use a vfio driver callback to test for a match.  For
> > + * PCI, dev_name(dev) is unique, but other drivers may require including
> > + * a parent device string. */
> > +static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
> > +{
> > +	struct vfio_iommu *iommu = group->iommu;
> > +	struct list_head *gpos;
> > +	int ret = -ENODEV;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	if (!iommu->domain) {
> > +		ret = __vfio_open_iommu(iommu);
> > +		if (ret)
> > +			goto out;
> > +	}
> > +
> > +	list_for_each(gpos, &iommu->group_list) {
> > +		struct list_head *dpos;
> > +
> > +		group = list_entry(gpos, struct vfio_group, iommu_next);
> > +
> > +		list_for_each(dpos, &group->device_list) {
> > +			struct vfio_device *device;
> > +
> > +			device = list_entry(dpos,
> > +					    struct vfio_device, device_next);
> > +
> > +			if (device->ops->match(device->dev, buf)) {
> > +				struct file *file;
> > +
> > +				if (device->ops->get(device->device_data)) {
> > +					ret = -EFAULT;
> > +					goto out;
> > +				}
> > +
> > +				/* We can't use anon_inode_getfd(), like above
> > +				 * because we need to modify the f_mode flags
> > +				 * directly to allow more than just ioctls */
> > +				ret = get_unused_fd();
> > +				if (ret < 0) {
> > +					device->ops->put(device->device_data);
> > +					goto out;
> > +				}
> > +
> > +				file = anon_inode_getfile("[vfio-device]",
> > +							  &vfio_device_fops,
> > +							  device, O_RDWR);
> > +				if (IS_ERR(file)) {
> > +					put_unused_fd(ret);
> > +					ret = PTR_ERR(file);
> > +					device->ops->put(device->device_data);
> > +					goto out;
> > +				}
> > +
> > +				/* Todo: add an anon_inode interface to do
> > +				 * this.  Appears to be missing by lack of
> > +				 * need rather than explicitly prevented.
> > +				 * Now there's need. */
> > +				file->f_mode |= (FMODE_LSEEK |
> > +						 FMODE_PREAD |
> > +						 FMODE_PWRITE);
> > +
> > +				fd_install(ret, file);
> > +
> > +				device->refcnt++;
> > +				goto out;
> > +			}
> > +		}
> > +	}
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +static long vfio_group_unl_ioctl(struct file *filep,
> > +				 unsigned int cmd, unsigned long arg)
> > +{
> > +	struct vfio_group *group = filep->private_data;
> > +
> > +	if (cmd == VFIO_GROUP_GET_FLAGS) {
> > +		u64 flags = 0;
> > +
> > +		mutex_lock(&vfio.lock);
> > +		if (__vfio_iommu_viable(group->iommu))
> > +			flags |= VFIO_GROUP_FLAGS_VIABLE;
> > +		mutex_unlock(&vfio.lock);
> > +
> > +		if (group->iommu->mm)
> > +			flags |= VFIO_GROUP_FLAGS_MM_LOCKED;
> > +
> > +		return put_user(flags, (u64 __user *)arg);
> > +	}
> > +
> > +	/* Below commands are restricted once the mm is set */
> > +	if (group->iommu->mm && group->iommu->mm != current->mm)
> > +		return -EPERM;
> > +	if (cmd == VFIO_GROUP_MERGE || cmd == VFIO_GROUP_UNMERGE) {
> > +		int fd;
> > +
> > +		if (get_user(fd, (int __user *)arg))
> > +			return -EFAULT;
> > +		if (fd < 0)
> > +			return -EINVAL;
> > +
> > +		if (cmd == VFIO_GROUP_MERGE)
> > +			return vfio_group_merge(group, fd);
> > +		else
> > +			return vfio_group_unmerge(group, fd);
> > +	} else if (cmd == VFIO_GROUP_GET_IOMMU_FD) {
> > +		return vfio_group_get_iommu_fd(group);
> > +	} else if (cmd == VFIO_GROUP_GET_DEVICE_FD) {
> > +		char *buf;
> > +		int ret;
> > +
> > +		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
> > +		if (IS_ERR(buf))
> > +			return PTR_ERR(buf);
> > +
> > +		ret = vfio_group_get_device_fd(group, buf);
> > +		kfree(buf);
> > +		return ret;
> > +	}
> > +
> > +	return -ENOSYS;
> > +}
> > +
> > +#ifdef CONFIG_COMPAT
> > +static long vfio_group_compat_ioctl(struct file *filep,
> > +				    unsigned int cmd, unsigned long arg)
> > +{
> > +	arg = (unsigned long)compat_ptr(arg);
> > +	return vfio_group_unl_ioctl(filep, cmd, arg);
> > +}
> > +#endif	/* CONFIG_COMPAT */
> > +
> > +static const struct file_operations vfio_group_fops = {
> > +	.owner		= THIS_MODULE,
> > +	.open		= vfio_group_open,
> > +	.release	= vfio_group_release,
> > +	.unlocked_ioctl	= vfio_group_unl_ioctl,
> > +#ifdef CONFIG_COMPAT
> > +	.compat_ioctl	= vfio_group_compat_ioctl,
> > +#endif
> > +};
> > +
> > +/* iommu fd release hook */
> 
> Given vfio_device_release and
>       vfio_group_release (ie, 1st object, 2nd operation), I was
> going to suggest renaming the fn below to vfio_iommu_release, but
> then I saw the latter name being already used in vfio_iommu.c ...
> a bit confusing but I guess it's ok then.

Right, this one was definitely because of naming collision.

> > +int vfio_release_iommu(struct vfio_iommu *iommu)
> > +{
> > +	return vfio_do_release(&iommu->refcnt, iommu);
> > +}
> > +
> > +/*
> > + * VFIO driver API
> > + */
> > +
> > +/* Add a new device to the vfio framework with associated vfio driver
> > + * callbacks.  This is the entry point for vfio drivers to register devices. */
> > +int vfio_group_add_dev(struct device *dev, const struct vfio_device_ops *ops)
> > +{
> > +	struct list_head *pos;
> > +	struct vfio_group *group = NULL;
> > +	struct vfio_device *device = NULL;
> > +	unsigned int groupid;
> > +	int ret = 0;
> > +	bool new_group = false;
> > +
> > +	if (!ops)
> > +		return -EINVAL;
> > +
> > +	if (iommu_device_group(dev, &groupid))
> > +		return -ENODEV;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	list_for_each(pos, &vfio.group_list) {
> > +		group = list_entry(pos, struct vfio_group, group_next);
> > +		if (group->groupid == groupid)
> > +			break;
> > +		group = NULL;
> > +	}
> > +
> > +	if (!group) {
> > +		int minor;
> > +
> > +		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +
> > +		group = kzalloc(sizeof(*group), GFP_KERNEL);
> > +		if (!group) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +
> > +		group->groupid = groupid;
> > +		INIT_LIST_HEAD(&group->device_list);
> > +
> > +		ret = idr_get_new(&vfio.idr, group, &minor);
> > +		if (ret == 0 && minor > MINORMASK) {
> > +			idr_remove(&vfio.idr, minor);
> > +			kfree(group);
> > +			ret = -ENOSPC;
> > +			goto out;
> > +		}
> > +
> > +		group->devt = MKDEV(MAJOR(vfio.devt), minor);
> > +		device_create(vfio.class, NULL, group->devt,
> > +			      group, "%u", groupid);
> > +
> > +		group->bus = dev->bus;
> > +		list_add(&group->group_next, &vfio.group_list);
> > +		new_group = true;
> > +	} else {
> > +		if (group->bus != dev->bus) {
> > +			printk(KERN_WARNING
> > +			       "Error: IOMMU group ID conflict.  Group ID %u "
> > +				"on both bus %s and %s\n", groupid,
> > +				group->bus->name, dev->bus->name);
> > +			ret = -EFAULT;
> > +			goto out;
> > +		}
> > +
> > +		list_for_each(pos, &group->device_list) {
> > +			device = list_entry(pos,
> > +					    struct vfio_device, device_next);
> > +			if (device->dev == dev)
> > +				break;
> > +			device = NULL;
> > +		}
> > +	}
> > +
> > +	if (!device) {
> > +		if (__vfio_group_devs_inuse(group) ||
> > +		    (group->iommu && group->iommu->refcnt)) {
> > +			printk(KERN_WARNING
> > +			       "Adding device %s to group %u while group is already in use!!\n",
> > +			       dev_name(dev), group->groupid);
> > +			/* XXX How to prevent other drivers from claiming? */
> 
> Here we are adding a device (not yet assigned to a vfio bus) to a group
> that is already in use.
> Given that it would not be acceptable for this device to get assigned
> to a non-vfio driver, why not force such assignment here then?

Exactly, I just don't know the mechanics of how to make that happen and
was hoping for suggestions...

> I am not sure though what the best way to do it would be.
> What about something like this:
> 
> - when the bus vfio-pci processes the BUS_NOTIFY_ADD_DEVICE
>   notification it assigns to the device a PCI ID that will make sure
>   the vfio-pci's probe routine will be invoked (and no other driver can
>   therefore claim the device). That PCI ID would have to be added
>   to the vfio_pci_driver's id_table (it would be the exception to the
>   "only dynamic IDs" rule). Too hackish?

Presumably some other driver also has the ID in its id_table; how do we
make sure we win?
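
One possible shape of the notifier-based detection (sketch only; the
vfio_group_dev_inuse() helper is hypothetical, and a notifier by itself can
flag but not veto the bind):

#include <linux/device.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/string.h>

static int vfio_pci_bus_notify(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	struct device *dev = data;

	/* dev->driver is already set when BUS_NOTIFY_BIND_DRIVER fires */
	if (action == BUS_NOTIFY_BIND_DRIVER &&
	    vfio_group_dev_inuse(dev) &&		/* hypothetical helper */
	    strcmp(dev->driver->name, "vfio-pci"))
		dev_warn(dev, "group in use, expected vfio-pci to claim device\n");

	return NOTIFY_OK;
}

static struct notifier_block vfio_pci_bus_nb = {
	.notifier_call = vfio_pci_bus_notify,
};

/* e.g. from vfio-pci module init: */
/* bus_register_notifier(&pci_bus_type, &vfio_pci_bus_nb); */

That still only detects the problem; how to actually win the bind is the
open question.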

> > +		}
> > +
> > +		device = kzalloc(sizeof(*device), GFP_KERNEL);
> > +		if (!device) {
> > +			/* If we just created this group, tear it down */
> > +			if (new_group) {
> > +				list_del(&group->group_next);
> > +				device_destroy(vfio.class, group->devt);
> > +				idr_remove(&vfio.idr, MINOR(group->devt));
> > +				kfree(group);
> > +			}
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +
> > +		list_add(&device->device_next, &group->device_list);
> > +		device->dev = dev;
> > +		device->ops = ops;
> > +		device->iommu = group->iommu; /* NULL if new */
> 
> Shouldn't you check the return code of __vfio_iommu_attach_dev?

Yep, looks like I did this because the expected use case has a NULL
iommu here, so I need to distinguish that error from an actual
iommu_attach_device() error.

> > +		__vfio_iommu_attach_dev(group->iommu, device);
> > +	}
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_group_add_dev);
> > +
> > +/* Remove a device from the vfio framework */
> 
> This fn below does not return any error code. Ok ...
> However, there are a number of error cases that you test, for example
> - device that does not belong to any group (according to iommu API)
> - device that belongs to a group but that does not appear in the list
>   of devices of the vfio_group structure.
> Are the above two error checks just paranoia or are those errors actually possible?
> If they were possible, shouldn't we generate a warning (most probably
> it would be a bug in the code)?

They're all vfio-bus driver bugs of some sort, so it's just a matter of
how much we want to scream about them.  I'll comment on each below.

> > +void vfio_group_del_dev(struct device *dev)
> > +{
> > +	struct list_head *pos;
> > +	struct vfio_group *group = NULL;
> > +	struct vfio_device *device = NULL;
> > +	unsigned int groupid;
> > +
> > +	if (iommu_device_group(dev, &groupid))
> > +		return;

Here the bus driver is probably just sitting on a notifier list for
their bus_type and a device is getting removed.  Unless we want to
require the bus driver to track everything it's attempted to add and
whether it worked, we can just ignore this.

> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	list_for_each(pos, &vfio.group_list) {
> > +		group = list_entry(pos, struct vfio_group, group_next);
> > +		if (group->groupid == groupid)
> > +			break;
> > +		group = NULL;
> > +	}
> > +
> > +	if (!group)
> > +		goto out;

We don't even have a group for the device, we could BUG_ON here.  The
bus driver failed to tell us about something that was then removed.

> > +
> > +	list_for_each(pos, &group->device_list) {
> > +		device = list_entry(pos, struct vfio_device, device_next);
> > +		if (device->dev == dev)
> > +			break;
> > +		device = NULL;
> > +	}
> > +
> > +	if (!device)
> > +		goto out;

Same here.

> > +
> > +	BUG_ON(device->refcnt);
> > +
> > +	if (device->attached)
> > +		__vfio_iommu_detach_dev(group->iommu, device);
> > +
> > +	list_del(&device->device_next);
> > +	kfree(device);
> > +
> > +	/* If this was the only device in the group, remove the group.
> > +	 * Note that we intentionally unmerge empty groups here if the
> > +	 * group fd isn't opened. */
> > +	if (list_empty(&group->device_list) && group->refcnt == 0) {
> > +		struct vfio_iommu *iommu = group->iommu;
> > +
> > +		if (iommu) {
> > +			__vfio_group_set_iommu(group, NULL);
> > +			__vfio_try_dissolve_iommu(iommu);
> > +		}
> > +
> > +		device_destroy(vfio.class, group->devt);
> > +		idr_remove(&vfio.idr, MINOR(group->devt));
> > +		list_del(&group->group_next);
> > +		kfree(group);
> > +	}
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_group_del_dev);
> > +
> > +/* When a device is bound to a vfio device driver (ex. vfio-pci), this
> > + * entry point is used to mark the device usable (viable).  The vfio
> > + * device driver associates a private device_data struct with the device
> > + * here, which will later be returned for vfio_device_fops callbacks. */
> > +int vfio_bind_dev(struct device *dev, void *device_data)
> > +{
> > +	struct vfio_device *device;
> > +	int ret = -EINVAL;
> > +
> > +	BUG_ON(!device_data);
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	device = __vfio_lookup_dev(dev);
> > +
> > +	BUG_ON(!device);
> > +
> > +	ret = dev_set_drvdata(dev, device);
> > +	if (!ret)
> > +		device->device_data = device_data;
> > +
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_bind_dev);
> > +
> > +/* A device is only removeable if the iommu for the group is not in use. */
> > +static bool vfio_device_removeable(struct vfio_device *device)
> > +{
> > +	bool ret = true;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	if (device->iommu && __vfio_iommu_inuse(device->iommu))
> > +		ret = false;
> > +
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Notify vfio that a device is being unbound from the vfio device driver
> > + * and return the device private device_data pointer.  If the group is
> > + * in use, we need to block or take other measures to make it safe for
> > + * the device to be removed from the iommu. */
> > +void *vfio_unbind_dev(struct device *dev)
> > +{
> > +	struct vfio_device *device = dev_get_drvdata(dev);
> > +	void *device_data;
> > +
> > +	BUG_ON(!device);
> > +
> > +again:
> > +	if (!vfio_device_removeable(device)) {
> > +		/* XXX signal for all devices in group to be removed or
> > +		 * resort to killing the process holding the device fds.
> > +		 * For now just block waiting for releases to wake us. */
> > +		wait_event(vfio.release_q, vfio_device_removeable(device));
> 
> Any new idea/proposal on how to handle this situation?
> The last one I remember was to leave the soft/hard/etc timeout handling in
> userspace and implement it as a sort of policy. Is that one still the most
> likely candidate solution to handle this situation?

I haven't heard any new proposals.  I think we need the hard timeout
handling in the kernel.  We can't leave it to userspace to decide whether they
get to keep the device.  We could have this tunable via an ioctl, but I
don't see how we wouldn't require CAP_SYS_ADMIN (or similar) to tweak
it.  I was intending to re-implement the netlink interface to signal the
removal, but expect to get allergic reactions to that.
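
For the hard timeout, the wait in vfio_unbind_dev() could be bounded along
these lines (sketch; unbind_timeout_ms and vfio_device_force_release() are
hypothetical placeholders for whatever policy/revoke mechanism we settle on):

	if (!wait_event_timeout(vfio.release_q,
				vfio_device_removeable(device),
				msecs_to_jiffies(unbind_timeout_ms))) {
		/* holders didn't let go in time: signal/kill and reclaim */
		vfio_device_force_release(device);
	}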

Thanks for the comments!

Alex
Christian Benvenuti (benve) Nov. 9, 2011, 9:08 p.m. UTC | #5
Comments inline...

> -----Original Message-----

> From: Alex Williamson [mailto:alex.williamson@redhat.com]

> Sent: Wednesday, November 09, 2011 10:03 AM

> To: Christian Benvenuti (benve)

> Cc: chrisw@sous-sol.org; aik@au1.ibm.com; pmac@au1.ibm.com;

> dwg@au1.ibm.com; joerg.roedel@amd.com; agraf@suse.de; Aaron Fabbri

> (aafabbri); B08248@freescale.com; B07421@freescale.com; avi@redhat.com;

> konrad.wilk@oracle.com; kvm@vger.kernel.org; qemu-devel@nongnu.org;

> iommu@lists.linux-foundation.org; linux-pci@vger.kernel.org

> Subject: RE: [RFC PATCH] vfio: VFIO Driver core framework

> 

> On Wed, 2011-11-09 at 02:11 -0600, Christian Benvenuti (benve) wrote:

> > I have not gone through the all patch yet, but here are

> > my first comments/questions about the code in vfio_main.c

> > (and pci/vfio_pci.c).

> 

> Thanks!  Comments inline...

> 

> > > -----Original Message-----

> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]

> > > Sent: Thursday, November 03, 2011 1:12 PM

> > > To: chrisw@sous-sol.org; aik@au1.ibm.com; pmac@au1.ibm.com;

> > > dwg@au1.ibm.com; joerg.roedel@amd.com; agraf@suse.de; Christian

> > > Benvenuti (benve); Aaron Fabbri (aafabbri); B08248@freescale.com;

> > > B07421@freescale.com; avi@redhat.com; konrad.wilk@oracle.com;

> > > kvm@vger.kernel.org; qemu-devel@nongnu.org; iommu@lists.linux-

> > > foundation.org; linux-pci@vger.kernel.org

> > > Subject: [RFC PATCH] vfio: VFIO Driver core framework

> >

> > <snip>

> >

> > > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c

> > > new file mode 100644

> > > index 0000000..6169356

> > > --- /dev/null

> > > +++ b/drivers/vfio/vfio_main.c

> > > @@ -0,0 +1,1151 @@

> > > +/*

> > > + * VFIO framework

> > > + *

> > > + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.

> > > + *     Author: Alex Williamson <alex.williamson@redhat.com>

> > > + *

> > > + * This program is free software; you can redistribute it and/or

> > > modify

> > > + * it under the terms of the GNU General Public License version 2

> as

> > > + * published by the Free Software Foundation.

> > > + *

> > > + * Derived from original vfio:

> > > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.

> > > + * Author: Tom Lyon, pugs@cisco.com

> > > + */

> > > +

> > > +#include <linux/cdev.h>

> > > +#include <linux/compat.h>

> > > +#include <linux/device.h>

> > > +#include <linux/file.h>

> > > +#include <linux/anon_inodes.h>

> > > +#include <linux/fs.h>

> > > +#include <linux/idr.h>

> > > +#include <linux/iommu.h>

> > > +#include <linux/mm.h>

> > > +#include <linux/module.h>

> > > +#include <linux/slab.h>

> > > +#include <linux/string.h>

> > > +#include <linux/uaccess.h>

> > > +#include <linux/vfio.h>

> > > +#include <linux/wait.h>

> > > +

> > > +#include "vfio_private.h"

> > > +

> > > +#define DRIVER_VERSION	"0.2"

> > > +#define DRIVER_AUTHOR	"Alex Williamson

> <alex.williamson@redhat.com>"

> > > +#define DRIVER_DESC	"VFIO - User Level meta-driver"

> > > +

> > > +static int allow_unsafe_intrs;

> > > +module_param(allow_unsafe_intrs, int, 0);

> > > +MODULE_PARM_DESC(allow_unsafe_intrs,

> > > +        "Allow use of IOMMUs which do not support interrupt

> > > remapping");

> > > +

> > > +static struct vfio {

> > > +	dev_t			devt;

> > > +	struct cdev		cdev;

> > > +	struct list_head	group_list;

> > > +	struct mutex		lock;

> > > +	struct kref		kref;

> > > +	struct class		*class;

> > > +	struct idr		idr;

> > > +	wait_queue_head_t	release_q;

> > > +} vfio;

> > > +

> > > +static const struct file_operations vfio_group_fops;

> > > +extern const struct file_operations vfio_iommu_fops;

> > > +

> > > +struct vfio_group {

> > > +	dev_t			devt;

> > > +	unsigned int		groupid;

> >

> > This groupid is returned by the device_group callback you recently

> added

> > with a separate (not yet in tree) IOMMU patch.

> > Is it correct to say that the scope of this ID is the bus the iommu

> > belongs too (but you use it as if it was global)?

> > I believe there is nothing right now to ensure the uniqueness of such

> > ID across bus types (assuming there will be other bus drivers in the

> > future besides vfio-pci).

> > If that's the case, the vfio.group_list global list and the

> __vfio_lookup_dev

> > routine should be changed to account for the bus too?

> > Ops, I just saw the error msg in vfio_group_add_dev about the group

> id conflict.

> > Is that warning related to what I mentioned above?

> 

> Yeah, this is a concern, but I can't think of a system where we would

> manifest a collision.  The IOMMU driver is expected to provide unique

> groupids for all devices below them, but we could imagine a system that

> implements two different bus_types, each with a different IOMMU driver

> and we have no coordination between them.  Perhaps since we have

> iommu_ops per bus, we should also expose the bus in the vfio group

> path,

> ie. /dev/vfio/%s/%u, dev->bus->name, iommu_device_group(dev,..).  This

> means userspace would need to do a readlink of the subsystem entry

> where

> it finds the iommu_group to find the vfio group.  Reasonable?


Most probably we won't see use cases with multiple buses anytime soon, but
this scheme you proposed (with the per-bus subdir) looks good to me. 

> > > +	struct bus_type		*bus;

> > > +	struct vfio_iommu	*iommu;

> > > +	struct list_head	device_list;

> > > +	struct list_head	iommu_next;

> > > +	struct list_head	group_next;

> > > +	int			refcnt;

> > > +};

> > > +

> > > +struct vfio_device {

> > > +	struct device			*dev;

> > > +	const struct vfio_device_ops	*ops;

> > > +	struct vfio_iommu		*iommu;

> >

> > I wonder if you need to have the 'iommu' field here.

> > vfio_device.iommu is always set and reset together with

> > vfio_group.iommu.

> > Given that a vfio_device instance is always linked to a vfio_group

> > instance, do we need this duplication? Is this duplication there

> > because you do not want the double dereference device->group->iommu?

> 

> I think that was my initial goal in duplicating the pointer on the

> device.  I believe I was also at one point passing a vfio_device around

> and needed the pointer.  We seem to be getting along fine w/o that and

> I

> don't see any performance sensitive paths from getting from the device

> to iommu, so I'll see about removing it.


I guess you can add it back later if there is a need for it.
Right now, since you always init/deinit both at the same time, this would simplify
the code and make it less likely to use an out-of-sync pointer.

> > > +	struct vfio_group		*group;

> > > +	struct list_head		device_next;

> > > +	bool				attached;

> > > +	int				refcnt;

> > > +	void				*device_data;

> > > +};

> > > +

> > > +/*

> > > + * Helper functions called under vfio.lock

> > > + */

> > > +

> > > +/* Return true if any devices within a group are opened */

> > > +static bool __vfio_group_devs_inuse(struct vfio_group *group)

> > > +{

> > > +	struct list_head *pos;

> > > +

> > > +	list_for_each(pos, &group->device_list) {

> > > +		struct vfio_device *device;

> > > +

> > > +		device = list_entry(pos, struct vfio_device, device_next);

> > > +		if (device->refcnt)

> > > +			return true;

> > > +	}

> > > +	return false;

> > > +}

> > > +

> > > +/* Return true if any of the groups attached to an iommu are

> opened.

> > > + * We can only tear apart merged groups when nothing is left open.

> */

> > > +static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu)

> > > +{

> > > +	struct list_head *pos;

> > > +

> > > +	list_for_each(pos, &iommu->group_list) {

> > > +		struct vfio_group *group;

> > > +

> > > +		group = list_entry(pos, struct vfio_group, iommu_next);

> > > +		if (group->refcnt)

> > > +			return true;

> > > +	}

> > > +	return false;

> > > +}

> > > +

> > > +/* An iommu is "in use" if it has a file descriptor open or if any

> of

> > > + * the groups assigned to the iommu have devices open. */

> > > +static bool __vfio_iommu_inuse(struct vfio_iommu *iommu)

> > > +{

> > > +	struct list_head *pos;

> > > +

> > > +	if (iommu->refcnt)

> > > +		return true;

> > > +

> > > +	list_for_each(pos, &iommu->group_list) {

> > > +		struct vfio_group *group;

> > > +

> > > +		group = list_entry(pos, struct vfio_group, iommu_next);

> > > +

> > > +		if (__vfio_group_devs_inuse(group))

> > > +			return true;

> > > +	}

> > > +	return false;

> > > +}

> >

> > I looked at how you take care of ref counts ...

> >

> > This is how the tree of vfio_iommu/vfio_group/vfio_device data

> > Structures is organized (I'll use just iommu/group/dev to make

> > the graph smaller):

> >

> >             iommu

> >            /     \

> >           /       \

> >     group   ...     group

> >     /  \           /  \

> >    /    \         /    \

> > dev  ..  dev   dev  ..  dev

> >

> > This is how you get a file descriptor for the three kind of objects:

> >

> > - group : open /dev/vfio/xxx for group xxx

> > - iommu : group ioctl VFIO_GROUP_GET_IOMMU_FD

> > - device: group ioctl VFIO_GROUP_GET_DEVICE_FD

> >

> > Given the above topology, I would assume that:

> >

> > (1) an iommu is 'inuse' if : a) iommu refcnt > 0, or

> >                              b) any of its groups is 'inuse'

> >

> > (2) a  group is 'inuse' if : a) group refcnt > 0, or

> >                              b) any of its devices is 'inuse'

> >

> > (3) a device is 'inuse' if : a) device refcnt > 0

> 

> (2) is a bit debatable.  I've wrestled with this one for a while.  The

> vfio_iommu serves two purposes.  First, it is the object we use for

> managing iommu domains, which includes allocating domains and attaching

> devices to domains.  Groups objects aren't involved here, they just

> manage the set of devices.  The second role is to manage merged groups,

> because whether or not groups can be merged is a function of iommu

> domain compatibility.

> 

> So if we look at "is the iommu in use?" ie. can I destroy the mapping

> context, detach devices and free the domain, the reference count on the

> group is irrelevant.  The user has to have a device or iommu file

> descriptor opened somewhere, across the group or merged group, for that

> context to be maintained.  A reasonable requirement, I think.


OK, then if you close all devices and the iommu, keeping the group open
would not protect the iommu domain mapping. This means that if you (or
a management application) need to close all devices+iommu and reopen
right away again the same devices+iommu, you may get a failure on the
iommu domain creation (supposing the system runs out of resources).
Is this just a very unlikely scenario? 
I guess in this case you would simply have to avoid releasing the iommu
fd, right?

> However, if we ask "is the group in use?" ie. can I not only destroy

> the

> mappings above, but also automatically tear apart merged groups, then I

> think we need to look at the group refcnt.


Correct.

> There's also a symmetry factor, the group is a benign entry point to

> device access.  It's only when device or iommu access is granted that

> the group gains any real power.  Therefore, shouldn't that power also

> be

> removed when those access points are closed?

> 

> > You have coded the 'inuse' logic with these three routines:

> >

> >     __vfio_iommu_inuse, which implements (1) above

> >

> > and

> >     __vfio_iommu_groups_inuse

> 

> Implements (2.a)


Yes, but for all groups at once.

> >     __vfio_group_devs_inuse

> 

> Implements (2.b)


Yes

> > which are used by __vfio_iommu_inuse.

> > Why don't you check the group refcnt in __vfio_iommu_groups_inuse?

> 

> Hopefully explained above, but open for discussion.

> 

> > Would it make sense (and the code more readable) to structure the

> > nested refcnt/inuse check like this?

> > (The numbers (1)(2)(3) refer to the three 'inuse' conditions above)

> >

> >    (1)__vfio_iommu_inuse

> >    |

> >    +-> check iommu refcnt

> >    +-> __vfio_iommu_groups_inuse

> >        |

> >        +->LOOP: (2)__vfio_iommu_group_inuse<--MISSING

> >                 |

> >                 +-> check group refcnt<--MISSING

> >                 +-> __vfio_group_devs_inuse()

> >                     |

> >                     +-> LOOP: (3)__vfio_group_dev_inuse<--MISSING

> >                               |

> >                               +-> check device refcnt

> 

> We currently do:

> 

>    (1)__vfio_iommu_inuse

>     |

>     +-> check iommu refcnt

>     +-> __vfio_group_devs_inuse

>         |

>         +->LOOP: (2.b)__vfio_group_devs_inuse

>                   |

>                   +-> LOOP: (3) check device refcnt

> 

> If that passes, the iommu context can be dissolved and we follow up

> with:

> 

>     __vfio_iommu_groups_inuse

>     |

>     +-> LOOP: (2.a)__vfio_iommu_groups_inuse

>                |

>                +-> check group refcnt

> 

> If that passes, groups can also be umerged.

> 

> Is this right?


Yes, assuming we stick to the "benign" role of groups you
described above.
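
For reference, the per-group helper I had in mind for the missing "(2)"
check would be little more than this (just a sketch, the naming is not
important):

	/* (2): a group is "in use" if its own fd is held open (2.a) or
	 * any of its devices is open (2.b) */
	static bool __vfio_group_inuse(struct vfio_group *group)
	{
		return group->refcnt || __vfio_group_devs_inuse(group);
	}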

> > > +static void __vfio_group_set_iommu(struct vfio_group *group,

> > > +				   struct vfio_iommu *iommu)

> > > +{

> > > +	struct list_head *pos;

> > > +

> > > +	if (group->iommu)

> > > +		list_del(&group->iommu_next);

> > > +	if (iommu)

> > > +		list_add(&group->iommu_next, &iommu->group_list);

> > > +

> > > +	group->iommu = iommu;

> >

> > If you remove the vfio_device.iommu field (as suggested above in a

> previous

> > Comment), the block below would not be needed anymore.

> 

> Yep, I'll try removing that and see how it plays out.

> 

> > > +	list_for_each(pos, &group->device_list) {

> > > +		struct vfio_device *device;

> > > +

> > > +		device = list_entry(pos, struct vfio_device, device_next);

> > > +		device->iommu = iommu;

> > > +	}

> > > +}

> > > +

> > > +static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,

> > > +				    struct vfio_device *device)

> > > +{

> > > +	BUG_ON(!iommu->domain && device->attached);

> > > +

> > > +	if (!iommu->domain || !device->attached)

> > > +		return;

> > > +

> > > +	iommu_detach_device(iommu->domain, device->dev);

> > > +	device->attached = false;

> > > +}

> > > +

> > > +static void __vfio_iommu_detach_group(struct vfio_iommu *iommu,

> > > +				      struct vfio_group *group)

> > > +{

> > > +	struct list_head *pos;

> > > +

> > > +	list_for_each(pos, &group->device_list) {

> > > +		struct vfio_device *device;

> > > +

> > > +		device = list_entry(pos, struct vfio_device, device_next);

> > > +		__vfio_iommu_detach_dev(iommu, device);

> > > +	}

> > > +}

> > > +

> > > +static int __vfio_iommu_attach_dev(struct vfio_iommu *iommu,

> > > +				   struct vfio_device *device)

> > > +{

> > > +	int ret;

> > > +

> > > +	BUG_ON(device->attached);

> > > +

> > > +	if (!iommu || !iommu->domain)

> > > +		return -EINVAL;

> > > +

> > > +	ret = iommu_attach_device(iommu->domain, device->dev);

> > > +	if (!ret)

> > > +		device->attached = true;

> > > +

> > > +	return ret;

> > > +}

> > > +

> > > +static int __vfio_iommu_attach_group(struct vfio_iommu *iommu,

> > > +				     struct vfio_group *group)

> > > +{

> > > +	struct list_head *pos;

> > > +

> > > +	list_for_each(pos, &group->device_list) {

> > > +		struct vfio_device *device;

> > > +		int ret;

> > > +

> > > +		device = list_entry(pos, struct vfio_device, device_next);

> > > +		ret = __vfio_iommu_attach_dev(iommu, device);

> > > +		if (ret) {

> > > +			__vfio_iommu_detach_group(iommu, group);

> > > +			return ret;

> > > +		}

> > > +	}

> > > +	return 0;

> > > +}

> > > +

> > > +/* The iommu is viable, ie. ready to be configured, when all the

> > > devices

> > > + * for all the groups attached to the iommu are bound to their

> vfio

> > > device

> > > + * drivers (ex. vfio-pci).  This sets the device_data private data

> > > pointer. */

> > > +static bool __vfio_iommu_viable(struct vfio_iommu *iommu)

> > > +{

> > > +	struct list_head *gpos, *dpos;

> > > +

> > > +	list_for_each(gpos, &iommu->group_list) {

> > > +		struct vfio_group *group;

> > > +		group = list_entry(gpos, struct vfio_group, iommu_next);

> > > +

> > > +		list_for_each(dpos, &group->device_list) {

> > > +			struct vfio_device *device;

> > > +			device = list_entry(dpos,

> > > +					    struct vfio_device, device_next);

> > > +

> > > +			if (!device->device_data)

> > > +				return false;

> > > +		}

> > > +	}

> > > +	return true;

> > > +}

> > > +

> > > +static void __vfio_close_iommu(struct vfio_iommu *iommu)

> > > +{

> > > +	struct list_head *pos;

> > > +

> > > +	if (!iommu->domain)

> > > +		return;

> > > +

> > > +	list_for_each(pos, &iommu->group_list) {

> > > +		struct vfio_group *group;

> > > +		group = list_entry(pos, struct vfio_group, iommu_next);

> > > +

> > > +		__vfio_iommu_detach_group(iommu, group);

> > > +	}

> > > +

> > > +	vfio_iommu_unmapall(iommu);

> > > +

> > > +	iommu_domain_free(iommu->domain);

> > > +	iommu->domain = NULL;

> > > +	iommu->mm = NULL;

> > > +}

> > > +

> > > +/* Open the IOMMU.  This gates all access to the iommu or device

> file

> > > + * descriptors and sets current->mm as the exclusive user. */

> >

> > Given the fn  vfio_group_open (ie, 1st object, 2nd operation), I

> would have

> > called this one __vfio_iommu_open (instead of __vfio_open_iommu).

> > Is it named __vfio_open_iommu to avoid a conflict with the namespace

> in vfio_iommu.c?

> 

> I would have expected that too, I'll look at renaming these.

> 

> > > +static int __vfio_open_iommu(struct vfio_iommu *iommu)

> > > +{

> > > +	struct list_head *pos;

> > > +	int ret;

> > > +

> > > +	if (!__vfio_iommu_viable(iommu))

> > > +		return -EBUSY;

> > > +

> > > +	if (iommu->domain)

> > > +		return -EINVAL;

> > > +

> > > +	iommu->domain = iommu_domain_alloc(iommu->bus);

> > > +	if (!iommu->domain)

> > > +		return -EFAULT;

> > > +

> > > +	list_for_each(pos, &iommu->group_list) {

> > > +		struct vfio_group *group;

> > > +		group = list_entry(pos, struct vfio_group, iommu_next);

> > > +

> > > +		ret = __vfio_iommu_attach_group(iommu, group);

> > > +		if (ret) {

> > > +			__vfio_close_iommu(iommu);

> > > +			return ret;

> > > +		}

> > > +	}

> > > +

> > > +	if (!allow_unsafe_intrs &&

> > > +	    !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {

> > > +		__vfio_close_iommu(iommu);

> > > +		return -EFAULT;

> > > +	}

> > > +

> > > +	iommu->cache = (iommu_domain_has_cap(iommu->domain,

> > > +					     IOMMU_CAP_CACHE_COHERENCY) != 0);

> > > +	iommu->mm = current->mm;

> > > +

> > > +	return 0;

> > > +}

> > > +

> > > +/* Actively try to tear down the iommu and merged groups.  If

> there

> > > are no

> > > + * open iommu or device fds, we close the iommu.  If we close the

> > > iommu and

> > > + * there are also no open group fds, we can futher dissolve the

> group

> > > to

> > > + * iommu association and free the iommu data structure. */

> > > +static int __vfio_try_dissolve_iommu(struct vfio_iommu *iommu)

> > > +{

> > > +

> > > +	if (__vfio_iommu_inuse(iommu))

> > > +		return -EBUSY;

> > > +

> > > +	__vfio_close_iommu(iommu);

> > > +

> > > +	if (!__vfio_iommu_groups_inuse(iommu)) {

> > > +		struct list_head *pos, *ppos;

> > > +

> > > +		list_for_each_safe(pos, ppos, &iommu->group_list) {

> > > +			struct vfio_group *group;

> > > +

> > > +			group = list_entry(pos, struct vfio_group,

> > > iommu_next);

> > > +			__vfio_group_set_iommu(group, NULL);

> > > +		}

> > > +

> > > +

> > > +		kfree(iommu);

> > > +	}

> > > +

> > > +	return 0;

> > > +}

> > > +

> > > +static struct vfio_device *__vfio_lookup_dev(struct device *dev)

> > > +{

> > > +	struct list_head *gpos;

> > > +	unsigned int groupid;

> > > +

> > > +	if (iommu_device_group(dev, &groupid))

> > > +		return NULL;

> > > +

> > > +	list_for_each(gpos, &vfio.group_list) {

> > > +		struct vfio_group *group;

> > > +		struct list_head *dpos;

> > > +

> > > +		group = list_entry(gpos, struct vfio_group, group_next);

> > > +

> > > +		if (group->groupid != groupid)

> > > +			continue;

> > > +

> > > +		list_for_each(dpos, &group->device_list) {

> > > +			struct vfio_device *device;

> > > +

> > > +			device = list_entry(dpos,

> > > +					    struct vfio_device, device_next);

> > > +

> > > +			if (device->dev == dev)

> > > +				return device;

> > > +		}

> > > +	}

> > > +	return NULL;

> > > +}

> > > +

> > > +/* All release paths simply decrement the refcnt, attempt to

> teardown

> > > + * the iommu and merged groups, and wakeup anything that might be

> > > + * waiting if we successfully dissolve anything. */

> > > +static int vfio_do_release(int *refcnt, struct vfio_iommu *iommu)

> > > +{

> > > +	bool wake;

> > > +

> > > +	mutex_lock(&vfio.lock);

> > > +

> > > +	(*refcnt)--;

> > > +	wake = (__vfio_try_dissolve_iommu(iommu) == 0);

> > > +

> > > +	mutex_unlock(&vfio.lock);

> > > +

> > > +	if (wake)

> > > +		wake_up(&vfio.release_q);

> > > +

> > > +	return 0;

> > > +}

> > > +

> > > +/*

> > > + * Device fops - passthrough to vfio device driver w/ device_data

> > > + */

> > > +static int vfio_device_release(struct inode *inode, struct file

> > > *filep)

> > > +{

> > > +	struct vfio_device *device = filep->private_data;

> > > +

> > > +	vfio_do_release(&device->refcnt, device->iommu);

> > > +

> > > +	device->ops->put(device->device_data);

> > > +

> > > +	return 0;

> > > +}

> > > +

> > > +static long vfio_device_unl_ioctl(struct file *filep,

> > > +				  unsigned int cmd, unsigned long arg)

> > > +{

> > > +	struct vfio_device *device = filep->private_data;

> > > +

> > > +	return device->ops->ioctl(device->device_data, cmd, arg);

> > > +}

> > > +

> > > +static ssize_t vfio_device_read(struct file *filep, char __user

> *buf,

> > > +				size_t count, loff_t *ppos)

> > > +{

> > > +	struct vfio_device *device = filep->private_data;

> > > +

> > > +	return device->ops->read(device->device_data, buf, count, ppos);

> > > +}

> > > +

> > > +static ssize_t vfio_device_write(struct file *filep, const char

> __user

> > > *buf,

> > > +				 size_t count, loff_t *ppos)

> > > +{

> > > +	struct vfio_device *device = filep->private_data;

> > > +

> > > +	return device->ops->write(device->device_data, buf, count, ppos);

> > > +}

> > > +

> > > +static int vfio_device_mmap(struct file *filep, struct

> vm_area_struct

> > > *vma)

> > > +{

> > > +	struct vfio_device *device = filep->private_data;

> > > +

> > > +	return device->ops->mmap(device->device_data, vma);

> > > +}

> > > +

> > > +#ifdef CONFIG_COMPAT

> > > +static long vfio_device_compat_ioctl(struct file *filep,

> > > +				     unsigned int cmd, unsigned long arg)

> > > +{

> > > +	arg = (unsigned long)compat_ptr(arg);

> > > +	return vfio_device_unl_ioctl(filep, cmd, arg);

> > > +}

> > > +#endif	/* CONFIG_COMPAT */

> > > +

> > > +const struct file_operations vfio_device_fops = {

> > > +	.owner		= THIS_MODULE,

> > > +	.release	= vfio_device_release,

> > > +	.read		= vfio_device_read,

> > > +	.write		= vfio_device_write,

> > > +	.unlocked_ioctl	= vfio_device_unl_ioctl,

> > > +#ifdef CONFIG_COMPAT

> > > +	.compat_ioctl	= vfio_device_compat_ioctl,

> > > +#endif

> > > +	.mmap		= vfio_device_mmap,

> > > +};

> > > +

> > > +/*

> > > + * Group fops

> > > + */

> > > +static int vfio_group_open(struct inode *inode, struct file

> *filep)

> > > +{

> > > +	struct vfio_group *group;

> > > +	int ret = 0;

> > > +

> > > +	mutex_lock(&vfio.lock);

> > > +

> > > +	group = idr_find(&vfio.idr, iminor(inode));

> > > +

> > > +	if (!group) {

> > > +		ret = -ENODEV;

> > > +		goto out;

> > > +	}

> > > +

> > > +	filep->private_data = group;

> > > +

> > > +	if (!group->iommu) {

> > > +		struct vfio_iommu *iommu;

> > > +

> > > +		iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);

> > > +		if (!iommu) {

> > > +			ret = -ENOMEM;

> > > +			goto out;

> > > +		}

> > > +		INIT_LIST_HEAD(&iommu->group_list);

> > > +		INIT_LIST_HEAD(&iommu->dm_list);

> > > +		mutex_init(&iommu->dgate);

> > > +		iommu->bus = group->bus;

> > > +		__vfio_group_set_iommu(group, iommu);

> > > +	}

> > > +	group->refcnt++;

> > > +

> > > +out:

> > > +	mutex_unlock(&vfio.lock);

> > > +

> > > +	return ret;

> > > +}

> > > +

> > > +static int vfio_group_release(struct inode *inode, struct file

> *filep)

> > > +{

> > > +	struct vfio_group *group = filep->private_data;

> > > +

> > > +	return vfio_do_release(&group->refcnt, group->iommu);

> > > +}

> > > +

> > > +/* Attempt to merge the group pointed to by fd into group.  The

> merge-

> > > ee

> > > + * group must not have an iommu or any devices open because we

> cannot

> > > + * maintain that context across the merge.  The merge-er group can

> be

> > > + * in use. */

> > > +static int vfio_group_merge(struct vfio_group *group, int fd)

> >

> > The documentation in vfio.txt explains clearly the logic implemented

> by

> > the merge/unmerge group ioctls.

> > However, what you are doing is not merging groups, but rather

> adding/removing

> > groups to/from iommus (and creating flat lists of groups).

> > For example, when you do

> >

> >   merge(A,B)

> >

> > you actually mean to say "merge B to the list of groups assigned to

> the

> > same iommu as group A".

> 

> It's actually a little more than that.  After you've merged B into A,

> you can close the file descriptor for B and access all of the devices

> for the merged group from A.


It is actually more...

Scenario 1:

  create_grp(A)
  create_grp(B)
  ...
  merge_grp(A,B)
  create_grp(C)
  merge_grp(C,B) ... this works, right?

Scenario 2:

  create_grp(A)
  create_grp(B)
  fd_x = get_dev_fd(B,x)
  ...
  merge_grp(A,B)
  create_grp(C)
  merge_grp(A,C)
  fd_x = get_dev_fd(C,x) 

Those two examples seem to suggest to me more of a list abstraction than a merge abstraction.
However, if it fits into the agreed syntax/logic it is ok, as long as we document it
properly.
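
To put the above in concrete terms, this is roughly what I understand the
merge flow to look like from userspace with the current semantics (the
group minor numbers and the device name are made up, error handling is
mostly omitted):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	static int use_merged_group(void)
	{
		int a = open("/dev/vfio/26", O_RDWR);
		int b = open("/dev/vfio/27", O_RDWR);

		/* B must have no open iommu/device fds; A may be in use */
		if (ioctl(a, VFIO_GROUP_MERGE, &b))
			return -1;
		close(b);	/* B's devices stay reachable through A */

		(void) ioctl(a, VFIO_GROUP_GET_IOMMU_FD);
		return ioctl(a, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
	}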

> > For the same reason, you do not really need to provide the group you

> want

> > to unmerge from, which means that instead of

> >

> >   unmerge(A,B)

> >

> > you would just need

> >

> >   unmerge(B)

> 

> Good point, we can avoid the awkward reference via file descriptor for

> the unmerge.

> 

> > I understand the reason why it is not a real merge/unmerge (ie, to

> keep the

> > original groups so that you can unmerge later)

> 

> Right, we still need to have visibility of the groups comprising the

> merged group, but the abstraction provided to the user seems to be

> deeper than you're thinking.

> 

> >  ... however I just wonder if

> > it wouldn't be more natural to implement the

> VFIO_IOMMU_ADD_GROUP/DEL_GROUP

> > iommu ioctls instead? (the relationships between the data structure

> would

> > remain the same)

> > I guess you already discarded this option for some reasons, right?

> What was

> > the reason?

> 

> It's a possibility, I'm not sure it was discussed or really what

> advantage it provides.  It seems like we'd logically lose the ability

> to

> access devices from other groups,


What is the real (immediate) benefit of this capability?

> whether that's good or bad, I don't know.  I think the notion of "merge"

> promotes the idea that the groups

> are peers and an iommu_add/del feels a bit more hierarchical.


I agree. 

> > > +{

> > > +	struct vfio_group *new;

> > > +	struct vfio_iommu *old_iommu;

> > > +	struct file *file;

> > > +	int ret = 0;

> > > +	bool opened = false;

> > > +

> > > +	mutex_lock(&vfio.lock);

> > > +

> > > +	file = fget(fd);

> > > +	if (!file) {

> > > +		ret = -EBADF;

> > > +		goto out_noput;

> > > +	}

> > > +

> > > +	/* Sanity check, is this really our fd? */

> > > +	if (file->f_op != &vfio_group_fops) {

> > > +		ret = -EINVAL;

> > > +		goto out;

> > > +	}

> > > +

> > > +	new = file->private_data;

> > > +

> > > +	if (!new || new == group || !new->iommu ||

> > > +	    new->iommu->domain || new->bus != group->bus) {

> > > +		ret = -EINVAL;

> > > +		goto out;

> > > +	}

> > > +

> > > +	/* We need to attach all the devices to each domain separately

> > > +	 * in order to validate that the capabilities match for both.  */

> > > +	ret = __vfio_open_iommu(new->iommu);

> > > +	if (ret)

> > > +		goto out;

> > > +

> > > +	if (!group->iommu->domain) {

> > > +		ret = __vfio_open_iommu(group->iommu);

> > > +		if (ret)

> > > +			goto out;

> > > +		opened = true;

> > > +	}

> > > +

> > > +	/* If cache coherency doesn't match we'd potentialy need to

> > > +	 * remap existing iommu mappings in the merge-er domain.

> > > +	 * Poor return to bother trying to allow this currently. */

> > > +	if (iommu_domain_has_cap(group->iommu->domain,

> > > +				 IOMMU_CAP_CACHE_COHERENCY) !=

> > > +	    iommu_domain_has_cap(new->iommu->domain,

> > > +				 IOMMU_CAP_CACHE_COHERENCY)) {

> > > +		__vfio_close_iommu(new->iommu);

> > > +		if (opened)

> > > +			__vfio_close_iommu(group->iommu);

> > > +		ret = -EINVAL;

> > > +		goto out;

> > > +	}

> > > +

> > > +	/* Close the iommu for the merge-ee and attach all its devices

> > > +	 * to the merge-er iommu. */

> > > +	__vfio_close_iommu(new->iommu);

> > > +

> > > +	ret = __vfio_iommu_attach_group(group->iommu, new);

> > > +	if (ret)

> > > +		goto out;

> > > +

> > > +	/* set_iommu unlinks new from the iommu, so save a pointer to it

> > > */

> > > +	old_iommu = new->iommu;

> > > +	__vfio_group_set_iommu(new, group->iommu);

> > > +	kfree(old_iommu);

> > > +

> > > +out:

> > > +	fput(file);

> > > +out_noput:

> > > +	mutex_unlock(&vfio.lock);

> > > +	return ret;

> > > +}

> > > +

> > > +/* Unmerge the group pointed to by fd from group. */

> > > +static int vfio_group_unmerge(struct vfio_group *group, int fd)

> > > +{

> > > +	struct vfio_group *new;

> > > +	struct vfio_iommu *new_iommu;

> > > +	struct file *file;

> > > +	int ret = 0;

> > > +

> > > +	/* Since the merge-out group is already opened, it needs to

> > > +	 * have an iommu struct associated with it. */

> > > +	new_iommu = kzalloc(sizeof(*new_iommu), GFP_KERNEL);

> > > +	if (!new_iommu)

> > > +		return -ENOMEM;

> > > +

> > > +	INIT_LIST_HEAD(&new_iommu->group_list);

> > > +	INIT_LIST_HEAD(&new_iommu->dm_list);

> > > +	mutex_init(&new_iommu->dgate);

> > > +	new_iommu->bus = group->bus;

> > > +

> > > +	mutex_lock(&vfio.lock);

> > > +

> > > +	file = fget(fd);

> > > +	if (!file) {

> > > +		ret = -EBADF;

> > > +		goto out_noput;

> > > +	}

> > > +

> > > +	/* Sanity check, is this really our fd? */

> > > +	if (file->f_op != &vfio_group_fops) {

> > > +		ret = -EINVAL;

> > > +		goto out;

> > > +	}

> > > +

> > > +	new = file->private_data;

> > > +	if (!new || new == group || new->iommu != group->iommu) {

> > > +		ret = -EINVAL;

> > > +		goto out;

> > > +	}

> > > +

> > > +	/* We can't merge-out a group with devices still in use. */

> > > +	if (__vfio_group_devs_inuse(new)) {

> > > +		ret = -EBUSY;

> > > +		goto out;

> > > +	}

> > > +

> > > +	__vfio_iommu_detach_group(group->iommu, new);

> > > +	__vfio_group_set_iommu(new, new_iommu);

> > > +

> > > +out:

> > > +	fput(file);

> > > +out_noput:

> > > +	if (ret)

> > > +		kfree(new_iommu);

> > > +	mutex_unlock(&vfio.lock);

> > > +	return ret;

> > > +}

> > > +

> > > +/* Get a new iommu file descriptor.  This will open the iommu,

> setting

> > > + * the current->mm ownership if it's not already set. */

> > > +static int vfio_group_get_iommu_fd(struct vfio_group *group)

> > > +{

> > > +	int ret = 0;

> > > +

> > > +	mutex_lock(&vfio.lock);

> > > +

> > > +	if (!group->iommu->domain) {

> > > +		ret = __vfio_open_iommu(group->iommu);

> > > +		if (ret)

> > > +			goto out;

> > > +	}

> > > +

> > > +	ret = anon_inode_getfd("[vfio-iommu]", &vfio_iommu_fops,

> > > +			       group->iommu, O_RDWR);

> > > +	if (ret < 0)

> > > +		goto out;

> > > +

> > > +	group->iommu->refcnt++;

> > > +out:

> > > +	mutex_unlock(&vfio.lock);

> > > +	return ret;

> > > +}

> > > +

> > > +/* Get a new device file descriptor.  This will open the iommu,

> > > setting

> > > + * the current->mm ownership if it's not already set.  It's

> difficult

> > > to

> > > + * specify the requirements for matching a user supplied buffer to

> a

> > > + * device, so we use a vfio driver callback to test for a match.

> For

> > > + * PCI, dev_name(dev) is unique, but other drivers may require

> > > including

> > > + * a parent device string. */

> > > +static int vfio_group_get_device_fd(struct vfio_group *group, char

> > > *buf)

> > > +{

> > > +	struct vfio_iommu *iommu = group->iommu;

> > > +	struct list_head *gpos;

> > > +	int ret = -ENODEV;

> > > +

> > > +	mutex_lock(&vfio.lock);

> > > +

> > > +	if (!iommu->domain) {

> > > +		ret = __vfio_open_iommu(iommu);

> > > +		if (ret)

> > > +			goto out;

> > > +	}

> > > +

> > > +	list_for_each(gpos, &iommu->group_list) {

> > > +		struct list_head *dpos;

> > > +

> > > +		group = list_entry(gpos, struct vfio_group, iommu_next);

> > > +

> > > +		list_for_each(dpos, &group->device_list) {

> > > +			struct vfio_device *device;

> > > +

> > > +			device = list_entry(dpos,

> > > +					    struct vfio_device, device_next);

> > > +

> > > +			if (device->ops->match(device->dev, buf)) {

> > > +				struct file *file;

> > > +

> > > +				if (device->ops->get(device->device_data)) {

> > > +					ret = -EFAULT;

> > > +					goto out;

> > > +				}

> > > +

> > > +				/* We can't use anon_inode_getfd(), like above

> > > +				 * because we need to modify the f_mode flags

> > > +				 * directly to allow more than just ioctls */

> > > +				ret = get_unused_fd();

> > > +				if (ret < 0) {

> > > +					device->ops->put(device->device_data);

> > > +					goto out;

> > > +				}

> > > +

> > > +				file = anon_inode_getfile("[vfio-device]",

> > > +							  &vfio_device_fops,

> > > +							  device, O_RDWR);

> > > +				if (IS_ERR(file)) {

> > > +					put_unused_fd(ret);

> > > +					ret = PTR_ERR(file);

> > > +					device->ops->put(device->device_data);

> > > +					goto out;

> > > +				}

> > > +

> > > +				/* Todo: add an anon_inode interface to do

> > > +				 * this.  Appears to be missing by lack of

> > > +				 * need rather than explicitly prevented.

> > > +				 * Now there's need. */

> > > +				file->f_mode |= (FMODE_LSEEK |

> > > +						 FMODE_PREAD |

> > > +						 FMODE_PWRITE);

> > > +

> > > +				fd_install(ret, file);

> > > +

> > > +				device->refcnt++;

> > > +				goto out;

> > > +			}

> > > +		}

> > > +	}

> > > +out:

> > > +	mutex_unlock(&vfio.lock);

> > > +	return ret;

> > > +}

> > > +

> > > +static long vfio_group_unl_ioctl(struct file *filep,

> > > +				 unsigned int cmd, unsigned long arg)

> > > +{

> > > +	struct vfio_group *group = filep->private_data;

> > > +

> > > +	if (cmd == VFIO_GROUP_GET_FLAGS) {

> > > +		u64 flags = 0;

> > > +

> > > +		mutex_lock(&vfio.lock);

> > > +		if (__vfio_iommu_viable(group->iommu))

> > > +			flags |= VFIO_GROUP_FLAGS_VIABLE;

> > > +		mutex_unlock(&vfio.lock);

> > > +

> > > +		if (group->iommu->mm)

> > > +			flags |= VFIO_GROUP_FLAGS_MM_LOCKED;

> > > +

> > > +		return put_user(flags, (u64 __user *)arg);

> > > +	}

> > > +

> > > +	/* Below commands are restricted once the mm is set */

> > > +	if (group->iommu->mm && group->iommu->mm != current->mm)

> > > +		return -EPERM;

> > > +	if (cmd == VFIO_GROUP_MERGE || cmd == VFIO_GROUP_UNMERGE) {

> > > +		int fd;

> > > +

> > > +		if (get_user(fd, (int __user *)arg))

> > > +			return -EFAULT;

> > > +		if (fd < 0)

> > > +			return -EINVAL;

> > > +

> > > +		if (cmd == VFIO_GROUP_MERGE)

> > > +			return vfio_group_merge(group, fd);

> > > +		else

> > > +			return vfio_group_unmerge(group, fd);

> > > +	} else if (cmd == VFIO_GROUP_GET_IOMMU_FD) {

> > > +		return vfio_group_get_iommu_fd(group);

> > > +	} else if (cmd == VFIO_GROUP_GET_DEVICE_FD) {

> > > +		char *buf;

> > > +		int ret;

> > > +

> > > +		buf = strndup_user((const char __user *)arg, PAGE_SIZE);

> > > +		if (IS_ERR(buf))

> > > +			return PTR_ERR(buf);

> > > +

> > > +		ret = vfio_group_get_device_fd(group, buf);

> > > +		kfree(buf);

> > > +		return ret;

> > > +	}

> > > +

> > > +	return -ENOSYS;

> > > +}

> > > +

> > > +#ifdef CONFIG_COMPAT

> > > +static long vfio_group_compat_ioctl(struct file *filep,

> > > +				    unsigned int cmd, unsigned long arg)

> > > +{

> > > +	arg = (unsigned long)compat_ptr(arg);

> > > +	return vfio_group_unl_ioctl(filep, cmd, arg);

> > > +}

> > > +#endif	/* CONFIG_COMPAT */

> > > +

> > > +static const struct file_operations vfio_group_fops = {

> > > +	.owner		= THIS_MODULE,

> > > +	.open		= vfio_group_open,

> > > +	.release	= vfio_group_release,

> > > +	.unlocked_ioctl	= vfio_group_unl_ioctl,

> > > +#ifdef CONFIG_COMPAT

> > > +	.compat_ioctl	= vfio_group_compat_ioctl,

> > > +#endif

> > > +};

> > > +

> > > +/* iommu fd release hook */

> >

> > Given vfio_device_release and

> >       vfio_group_release (ie, 1st object, 2nd operation), I was

> > going to suggest renaming the fn below to vfio_iommu_release, but

> > then I saw the latter name being already used in vfio_iommu.c ...

> > a bit confusing but I guess it's ok then.

> 

> Right, this one was definitely because of naming collision.

> 

> > > +int vfio_release_iommu(struct vfio_iommu *iommu)

> > > +{

> > > +	return vfio_do_release(&iommu->refcnt, iommu);

> > > +}

> > > +

> > > +/*

> > > + * VFIO driver API

> > > + */

> > > +

> > > +/* Add a new device to the vfio framework with associated vfio

> driver

> > > + * callbacks.  This is the entry point for vfio drivers to

> register

> > > devices. */

> > > +int vfio_group_add_dev(struct device *dev, const struct

> > > vfio_device_ops *ops)

> > > +{

> > > +	struct list_head *pos;

> > > +	struct vfio_group *group = NULL;

> > > +	struct vfio_device *device = NULL;

> > > +	unsigned int groupid;

> > > +	int ret = 0;

> > > +	bool new_group = false;

> > > +

> > > +	if (!ops)

> > > +		return -EINVAL;

> > > +

> > > +	if (iommu_device_group(dev, &groupid))

> > > +		return -ENODEV;

> > > +

> > > +	mutex_lock(&vfio.lock);

> > > +

> > > +	list_for_each(pos, &vfio.group_list) {

> > > +		group = list_entry(pos, struct vfio_group, group_next);

> > > +		if (group->groupid == groupid)

> > > +			break;

> > > +		group = NULL;

> > > +	}

> > > +

> > > +	if (!group) {

> > > +		int minor;

> > > +

> > > +		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {

> > > +			ret = -ENOMEM;

> > > +			goto out;

> > > +		}

> > > +

> > > +		group = kzalloc(sizeof(*group), GFP_KERNEL);

> > > +		if (!group) {

> > > +			ret = -ENOMEM;

> > > +			goto out;

> > > +		}

> > > +

> > > +		group->groupid = groupid;

> > > +		INIT_LIST_HEAD(&group->device_list);

> > > +

> > > +		ret = idr_get_new(&vfio.idr, group, &minor);

> > > +		if (ret == 0 && minor > MINORMASK) {

> > > +			idr_remove(&vfio.idr, minor);

> > > +			kfree(group);

> > > +			ret = -ENOSPC;

> > > +			goto out;

> > > +		}

> > > +

> > > +		group->devt = MKDEV(MAJOR(vfio.devt), minor);

> > > +		device_create(vfio.class, NULL, group->devt,

> > > +			      group, "%u", groupid);

> > > +

> > > +		group->bus = dev->bus;

> > > +		list_add(&group->group_next, &vfio.group_list);

> > > +		new_group = true;

> > > +	} else {

> > > +		if (group->bus != dev->bus) {

> > > +			printk(KERN_WARNING

> > > +			       "Error: IOMMU group ID conflict.  Group ID %u

> > > "

> > > +				"on both bus %s and %s\n", groupid,

> > > +				group->bus->name, dev->bus->name);

> > > +			ret = -EFAULT;

> > > +			goto out;

> > > +		}

> > > +

> > > +		list_for_each(pos, &group->device_list) {

> > > +			device = list_entry(pos,

> > > +					    struct vfio_device, device_next);

> > > +			if (device->dev == dev)

> > > +				break;

> > > +			device = NULL;

> > > +		}

> > > +	}

> > > +

> > > +	if (!device) {

> > > +		if (__vfio_group_devs_inuse(group) ||

> > > +		    (group->iommu && group->iommu->refcnt)) {

> > > +			printk(KERN_WARNING

> > > +			       "Adding device %s to group %u while group is

> > > already in use!!\n",

> > > +			       dev_name(dev), group->groupid);

> > > +			/* XXX How to prevent other drivers from claiming? */

> >

> > Here we are adding a device (not yet assigned to a vfio bus) to a

> group

> > that is already in use.

> > Given that it would not be acceptable for this device to get assigned

> > to a non vfio driver, why not forcing such assignment here then?

> 

> Exactly, I just don't know the mechanics of how to make that happen and

> was hoping for suggestions...

> 

> > I am not sure though what the best way to do it would be.

> > What about something like this:

> >

> > - when the bus vfio-pci processes the BUS_NOTIFY_ADD_DEVICE

> >   notification it assigns to the device a PCI ID that will make sure

> >   the vfio-pci's probe routine will be invoked (and no other driver

> can

> >   therefore claim the device). That PCI ID would have to be added

> >   to the vfio_pci_driver's id_table (it would be the exception to the

> >   "only dynamic IDs" rule). Too hackish?

> 

> Presumably some other driver also has the ID in it's id_table, how do

> we make sure we win?


By mangling such an ID (when processing the BUS_NOTIFY_ADD_DEVICE notification) to
match against a 'fake' ID registered in the vfio-pci table (it would be like a
sort of driver redirect/divert). The vfio-pci probe routine would restore
the original ID (we do not want to confuse userspace). This is hackish, I agree.

What about this:
- When vfio-pci processes the BUS_NOTIFY_ADD_DEVICE notification it can
  pre-initialize the driver pointer (via an API). We would then need to change
  the match/probe PCI mechanism too: for example, the PCI core will have to check
  and honor such pre-driver-initialization when present (and give it higher
  priority over the match callbacks).
  How to do this? For example, when vfio_group_add_dev is invoked, it checks
  whether the device is getting added to an already existent group where
  the other devices (well, you would need to check just one of the devices in
  the group) are already assigned to vfio-pci, and in such a case it
  pre-initializes the driver to vfio-pci.

NOTE: By "preinit" I mean "save into the device a reference to a driver before
      the 'match' callbacks".

This would be the timeline:

|
+-> new device gets added to (PCI) bus
|
+-> PCI: send BUS_NOTIFIER_ADD_DEVICE notification
|
+-> VFIO:vfio_pci_device_notifier
|        |
|        +-> BUS_NOTIFIER_ADD_DEVICE: vfio_group_add_dev
|            |
|            +->iommu_device_group(dev,&groupid)
|            +->group = <search groupid in vfio.group_list>
|            +->if (group && group_is_vfio(group))
|            |        <preinit device driver to vfio-pci>
|            ...
|
+-> PCI: xxx
|        |
|        +-> if (!device_driver_is_preinit(dev))
|        |       probe=<search driver's probe callback using 'match'>
|        |   else 
|        |       probe=<get it from preint driver config>
|        |       (+fallback to 'match' if preinit driver disappeared?)
|        |   
|        +-> rc = probe(...)
|        |
|        ...
v
...

Of course, what if multiple drivers decide to preinit the device?

One way to make it cleaner would be to:
- have the PCI layer export an API that allows (for example) the bus
  notification callbacks (like vfio_pci_device_notifier) to preinit a driver
- make such an API reject calls on devices that already have a preinit
  driver.
- make VFIO detect the case where vfio_pci_device_notifier can not
  preinit the driver (to vfio-pci) for the new device (because already
  preinited) and raise an error/warning.

Would this look a bit cleaner?
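
Something like this, just to give an idea of the shape
(pci_preinit_driver() and group_is_vfio() are hypothetical names for the
API described above; vfio_pci_ops/vfio_pci_driver would come from the
vfio-pci backend):

	static int vfio_pci_device_notifier(struct notifier_block *nb,
					    unsigned long action, void *data)
	{
		struct device *dev = data;

		if (action == BUS_NOTIFY_ADD_DEVICE) {
			vfio_group_add_dev(dev, &vfio_pci_ops);
			/* hypothetical: if the group already belongs to
			 * vfio-pci, divert the later match/probe to it */
			if (group_is_vfio(dev))
				pci_preinit_driver(to_pci_dev(dev),
						   &vfio_pci_driver);
		}
		return NOTIFY_OK;
	}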

> > > +		}

> > > +

> > > +		device = kzalloc(sizeof(*device), GFP_KERNEL);

> > > +		if (!device) {

> > > +			/* If we just created this group, tear it down */

> > > +			if (new_group) {

> > > +				list_del(&group->group_next);

> > > +				device_destroy(vfio.class, group->devt);

> > > +				idr_remove(&vfio.idr, MINOR(group->devt));

> > > +				kfree(group);

> > > +			}

> > > +			ret = -ENOMEM;

> > > +			goto out;

> > > +		}

> > > +

> > > +		list_add(&device->device_next, &group->device_list);

> > > +		device->dev = dev;

> > > +		device->ops = ops;

> > > +		device->iommu = group->iommu; /* NULL if new */

> >

> > Shouldn't you check the return code of __vfio_iommu_attach_dev?

> 

> Yep, looks like I did this because the expected use case has a NULL

> iommu here, so I need to distiguish that error from an actual

> iommu_attach_device() error.

> 

> > > +		__vfio_iommu_attach_dev(group->iommu, device);

> > > +	}

> > > +out:

> > > +	mutex_unlock(&vfio.lock);

> > > +	return ret;

> > > +}

> > > +EXPORT_SYMBOL_GPL(vfio_group_add_dev);

> > > +

> > > +/* Remove a device from the vfio framework */

> >

> > This fn below does not return any error code. Ok ...

> > However, there are a number of errors case that you test, for example

> > - device that does not belong to any group (according to iommu API)

> > - device that belongs to a group but that does not appear in the list

> >   of devices of the vfio_group structure.

> > Are the above two errors checks just paranoia or are those errors

> actually possible?

> > If they were possible, shouldn't we generate a warning (most probably

> > it would be a bug in the code)?

> 

> They're all vfio-bus driver bugs of some sort, so it's just a matter of

> how much we want to scream about them.  I'll comments on each below.

> 

> > > +void vfio_group_del_dev(struct device *dev)

> > > +{

> > > +	struct list_head *pos;

> > > +	struct vfio_group *group = NULL;

> > > +	struct vfio_device *device = NULL;

> > > +	unsigned int groupid;

> > > +

> > > +	if (iommu_device_group(dev, &groupid))

> > > +		return;

> 

> Here the bus driver is probably just sitting on a notifier list for

> their bus_type and a device is getting removed.  Unless we want to

> require the bus driver to track everything it's attempted to add and

> whether it worked, we can just ignore this.


OK, I see what you mean. If vfio_group_add_dev fails for some reason we
do not keep track of it. Right?
Would it make sense to add one special group to vfio.group_list (or, better,
in a separate field of the vfio structure) whose goal
would be just that: keep track of those devices that failed to be added
to the VFIO framework (can it help for debugging too?)?

> > > +

> > > +	mutex_lock(&vfio.lock);

> > > +

> > > +	list_for_each(pos, &vfio.group_list) {

> > > +		group = list_entry(pos, struct vfio_group, group_next);

> > > +		if (group->groupid == groupid)

> > > +			break;

> > > +		group = NULL;

> > > +	}

> > > +

> > > +	if (!group)

> > > +		goto out;

> 

> We don't even have a group for the device, we could BUG_ON here.  The

> bus driver failed to tell us about something that was then removed.

> 

> > > +

> > > +	list_for_each(pos, &group->device_list) {

> > > +		device = list_entry(pos, struct vfio_device, device_next);

> > > +		if (device->dev == dev)

> > > +			break;

> > > +		device = NULL;

> > > +	}

> > > +

> > > +	if (!device)

> > > +		goto out;

> 

> Same here.

> 

> > > +

> > > +	BUG_ON(device->refcnt);

> > > +

> > > +	if (device->attached)

> > > +		__vfio_iommu_detach_dev(group->iommu, device);

> > > +

> > > +	list_del(&device->device_next);

> > > +	kfree(device);

> > > +

> > > +	/* If this was the only device in the group, remove the group.

> > > +	 * Note that we intentionally unmerge empty groups here if the

> > > +	 * group fd isn't opened. */

> > > +	if (list_empty(&group->device_list) && group->refcnt == 0) {

> > > +		struct vfio_iommu *iommu = group->iommu;

> > > +

> > > +		if (iommu) {

> > > +			__vfio_group_set_iommu(group, NULL);

> > > +			__vfio_try_dissolve_iommu(iommu);

> > > +		}

> > > +

> > > +		device_destroy(vfio.class, group->devt);

> > > +		idr_remove(&vfio.idr, MINOR(group->devt));

> > > +		list_del(&group->group_next);

> > > +		kfree(group);

> > > +	}

> > > +out:

> > > +	mutex_unlock(&vfio.lock);

> > > +}

> > > +EXPORT_SYMBOL_GPL(vfio_group_del_dev);

> > > +

> > > +/* When a device is bound to a vfio device driver (ex. vfio-pci),

> this

> > > + * entry point is used to mark the device usable (viable).  The

> vfio

> > > + * device driver associates a private device_data struct with the

> > > device

> > > + * here, which will later be return for vfio_device_fops

> callbacks. */

> > > +int vfio_bind_dev(struct device *dev, void *device_data)

> > > +{

> > > +	struct vfio_device *device;

> > > +	int ret = -EINVAL;

> > > +

> > > +	BUG_ON(!device_data);

> > > +

> > > +	mutex_lock(&vfio.lock);

> > > +

> > > +	device = __vfio_lookup_dev(dev);

> > > +

> > > +	BUG_ON(!device);

> > > +

> > > +	ret = dev_set_drvdata(dev, device);

> > > +	if (!ret)

> > > +		device->device_data = device_data;

> > > +

> > > +	mutex_unlock(&vfio.lock);

> > > +	return ret;

> > > +}

> > > +EXPORT_SYMBOL_GPL(vfio_bind_dev);

> > > +

> > > +/* A device is only removeable if the iommu for the group is not

> in

> > > use. */

> > > +static bool vfio_device_removeable(struct vfio_device *device)

> > > +{

> > > +	bool ret = true;

> > > +

> > > +	mutex_lock(&vfio.lock);

> > > +

> > > +	if (device->iommu && __vfio_iommu_inuse(device->iommu))

> > > +		ret = false;

> > > +

> > > +	mutex_unlock(&vfio.lock);

> > > +	return ret;

> > > +}

> > > +

> > > +/* Notify vfio that a device is being unbound from the vfio device

> > > driver

> > > + * and return the device private device_data pointer.  If the

> group is

> > > + * in use, we need to block or take other measures to make it safe

> for

> > > + * the device to be removed from the iommu. */

> > > +void *vfio_unbind_dev(struct device *dev)

> > > +{

> > > +	struct vfio_device *device = dev_get_drvdata(dev);

> > > +	void *device_data;

> > > +

> > > +	BUG_ON(!device);

> > > +

> > > +again:

> > > +	if (!vfio_device_removeable(device)) {

> > > +		/* XXX signal for all devices in group to be removed or

> > > +		 * resort to killing the process holding the device fds.

> > > +		 * For now just block waiting for releases to wake us. */

> > > +		wait_event(vfio.release_q, vfio_device_removeable(device));

> >

> > Any new idea/proposal on how to handle this situation?

> > The last one I remember was to leave the soft/hard/etc timeout

> handling in

> > userspace and implement it as a sort of policy. Is that one still the

> most

> > likely candidate solution to handle this situation?

> 

> I haven't heard any new proposals.  I think we need the hard timeout

> handling in the kernel.  We can't leave it to userspace to decide they

> get to keep the device.  We could have this tunable via an ioctl, but I

> don't see how we wouldn't require CAP_SYS_ADMIN (or similar) to tweak

> it.  I was intending to re-implement the netlink interface to signal

> the

> removal, but expect to get allergic reactions to that.


(I personally like the async netlink signaling, but I am OK with an ioctl based
mechanism if it provides the same flexibility)

What would be a reasonable hard timeout?
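
Whatever the value ends up being, I would expect the blocking wait in
vfio_unbind_dev() to become something like this (purely illustrative;
both the 30s value and the escalation step are placeholders):

	if (!wait_event_timeout(vfio.release_q,
				vfio_device_removeable(device),
				msecs_to_jiffies(30 * 1000))) {
		dev_warn(dev, "hard timeout expired, revoking device\n");
		/* escalation (signal or kill the fd holders) goes here */
	}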

/Chris
Alex Williamson Nov. 9, 2011, 11:40 p.m. UTC | #6
On Wed, 2011-11-09 at 15:08 -0600, Christian Benvenuti (benve) wrote:
<snip>
> > > > +
> > > > +struct vfio_group {
> > > > +	dev_t			devt;
> > > > +	unsigned int		groupid;
> > >
> > > This groupid is returned by the device_group callback you recently
> > added
> > > with a separate (not yet in tree) IOMMU patch.
> > > Is it correct to say that the scope of this ID is the bus the iommu
> > > belongs too (but you use it as if it was global)?
> > > I believe there is nothing right now to ensure the uniqueness of such
> > > ID across bus types (assuming there will be other bus drivers in the
> > > future besides vfio-pci).
> > > If that's the case, the vfio.group_list global list and the
> > __vfio_lookup_dev
> > > routine should be changed to account for the bus too?
> > > Ops, I just saw the error msg in vfio_group_add_dev about the group
> > id conflict.
> > > Is that warning related to what I mentioned above?
> > 
> > Yeah, this is a concern, but I can't think of a system where we would
> > manifest a collision.  The IOMMU driver is expected to provide unique
> > groupids for all devices below them, but we could imagine a system that
> > implements two different bus_types, each with a different IOMMU driver
> > and we have no coordination between them.  Perhaps since we have
> > iommu_ops per bus, we should also expose the bus in the vfio group
> > path,
> > ie. /dev/vfio/%s/%u, dev->bus->name, iommu_device_group(dev,..).  This
> > means userspace would need to do a readlink of the subsystem entry
> > where
> > it finds the iommu_group to find the vfio group.  Reasonable?
> 
> Most probably we won't see use cases with multiple buses anytime soon, but
> this scheme you proposed (with the per-bus subdir) looks good to me. 

Ok, I think that's easier than any scheme of trying to organize globally
unique groupids instead of just bus_type unique.  That makes group
objects internally matched by the {groupid, bus} pair.
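
Roughly, userspace would then do something like this to get from a sysfs
device to its group node (I'm assuming the pending iommu patch exposes the
group id via an "iommu_group" sysfs attribute; error handling is omitted):

	#include <fcntl.h>
	#include <unistd.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <limits.h>

	/* sketch: sysfs device dir -> open /dev/vfio/<bus>/<groupid> */
	static int open_vfio_group(const char *sysfs_dev)
	{
		char link[PATH_MAX], path[PATH_MAX], buf[16];
		ssize_t len;
		int fd;

		/* bus name from the subsystem link, ex. ../../../bus/pci */
		snprintf(path, sizeof(path), "%s/subsystem", sysfs_dev);
		len = readlink(path, link, sizeof(link) - 1);
		link[len] = '\0';

		/* groupid from the (assumed) iommu_group attribute */
		snprintf(path, sizeof(path), "%s/iommu_group", sysfs_dev);
		fd = open(path, O_RDONLY);
		len = read(fd, buf, sizeof(buf) - 1);
		buf[len] = '\0';
		close(fd);

		snprintf(path, sizeof(path), "/dev/vfio/%s/%u",
			 strrchr(link, '/') + 1, (unsigned int)atoi(buf));
		return open(path, O_RDWR);
	}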

<snip>
> > >
> > > I looked at how you take care of ref counts ...
> > >
> > > This is how the tree of vfio_iommu/vfio_group/vfio_device data
> > > Structures is organized (I'll use just iommu/group/dev to make
> > > the graph smaller):
> > >
> > >             iommu
> > >            /     \
> > >           /       \
> > >     group   ...     group
> > >     /  \           /  \
> > >    /    \         /    \
> > > dev  ..  dev   dev  ..  dev
> > >
> > > This is how you get a file descriptor for the three kind of objects:
> > >
> > > - group : open /dev/vfio/xxx for group xxx
> > > - iommu : group ioctl VFIO_GROUP_GET_IOMMU_FD
> > > - device: group ioctl VFIO_GROUP_GET_DEVICE_FD
> > >
> > > Given the above topology, I would assume that:
> > >
> > > (1) an iommu is 'inuse' if : a) iommu refcnt > 0, or
> > >                              b) any of its groups is 'inuse'
> > >
> > > (2) a  group is 'inuse' if : a) group refcnt > 0, or
> > >                              b) any of its devices is 'inuse'
> > >
> > > (3) a device is 'inuse' if : a) device refcnt > 0
> > 
> > (2) is a bit debatable.  I've wrestled with this one for a while.  The
> > vfio_iommu serves two purposes.  First, it is the object we use for
> > managing iommu domains, which includes allocating domains and attaching
> > devices to domains.  Groups objects aren't involved here, they just
> > manage the set of devices.  The second role is to manage merged groups,
> > because whether or not groups can be merged is a function of iommu
> > domain compatibility.
> > 
> > So if we look at "is the iommu in use?" ie. can I destroy the mapping
> > context, detach devices and free the domain, the reference count on the
> > group is irrelevant.  The user has to have a device or iommu file
> > descriptor opened somewhere, across the group or merged group, for that
> > context to be maintained.  A reasonable requirement, I think.
> 
> OK, then if you close all devices and the iommu, keeping the group open
> would not protect the iommu domain mapping. This means that if you (or
> a management application) need to close all devices+iommu and reopen
> right away again the same devices+iommu, you may get a failure on the
> iommu domain creation (supposing the system runs out of resources).
> Is this just a very unlikely scenario? 

Can you think of a use case that would require such?  I can't.

> I guess in this case you would simply have to avoid releasing the iommu
> fd, right?

Right.  We could also debate whether we should drop all iommu mappings
when the iommu refcnt goes to zero.  We don't currently do that, but it
might make sense.
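
If we decided to do that, the hook would probably just be the iommu
release path, something like this (sketch only, not in the posted patch):

	int vfio_release_iommu(struct vfio_iommu *iommu)
	{
		mutex_lock(&vfio.lock);
		/* last iommu fd going away: drop the DMA mappings even if
		 * the domain itself stays alive via open device fds */
		if (iommu->refcnt == 1 && iommu->domain)
			vfio_iommu_unmapall(iommu);
		mutex_unlock(&vfio.lock);

		return vfio_do_release(&iommu->refcnt, iommu);
	}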

> 
> > However, if we ask "is the group in use?" ie. can I not only destroy
> > the
> > mappings above, but also automatically tear apart merged groups, then I
> > think we need to look at the group refcnt.
> 
> Correct.
> 
> > There's also a symmetry factor, the group is a benign entry point to
> > device access.  It's only when device or iommu access is granted that
> > the group gains any real power.  Therefore, shouldn't that power also
> > be
> > removed when those access points are closed?
> > 
> > > You have coded the 'inuse' logic with these three routines:
> > >
> > >     __vfio_iommu_inuse, which implements (1) above
> > >
> > > and
> > >     __vfio_iommu_groups_inuse
> > 
> > Implements (2.a)
> 
> Yes, but for all groups at once.

Right

> > >     __vfio_group_devs_inuse
> > 
> > Implements (2.b)
> 
> Yes
> 
> > > which are used by __vfio_iommu_inuse.
> > > Why don't you check the group refcnt in __vfio_iommu_groups_inuse?
> > 
> > Hopefully explained above, but open for discussion.
> > 
> > > Would it make sense (and the code more readable) to structure the
> > > nested refcnt/inuse check like this?
> > > (The numbers (1)(2)(3) refer to the three 'inuse' conditions above)
> > >
> > >    (1)__vfio_iommu_inuse
> > >    |
> > >    +-> check iommu refcnt
> > >    +-> __vfio_iommu_groups_inuse
> > >        |
> > >        +->LOOP: (2)__vfio_iommu_group_inuse<--MISSING
> > >                 |
> > >                 +-> check group refcnt<--MISSING
> > >                 +-> __vfio_group_devs_inuse()
> > >                     |
> > >                     +-> LOOP: (3)__vfio_group_dev_inuse<--MISSING
> > >                               |
> > >                               +-> check device refcnt
> > 
> > We currently do:
> > 
> >    (1)__vfio_iommu_inuse
> >     |
> >     +-> check iommu refcnt
> >     +-> __vfio_group_devs_inuse
> >         |
> >         +->LOOP: (2.b)__vfio_group_devs_inuse
> >                   |
> >                   +-> LOOP: (3) check device refcnt
> > 
> > If that passes, the iommu context can be dissolved and we follow up
> > with:
> > 
> >     __vfio_iommu_groups_inuse
> >     |
> >     +-> LOOP: (2.a)__vfio_iommu_groups_inuse
> >                |
> >                +-> check group refcnt
> > 
> > If that passes, groups can also be umerged.
> > 
> > Is this right?
> 
> Yes, assuming we stick to the "benign" role of groups you
> described above.

Ok, no change then.  Thanks for looking at that so closely.

<snip>
> > > > +static int vfio_group_merge(struct vfio_group *group, int fd)
> > >
> > > The documentation in vfio.txt explains clearly the logic implemented
> > by
> > > the merge/unmerge group ioctls.
> > > However, what you are doing is not merging groups, but rather
> > adding/removing
> > > groups to/from iommus (and creating flat lists of groups).
> > > For example, when you do
> > >
> > >   merge(A,B)
> > >
> > > you actually mean to say "merge B to the list of groups assigned to
> > the
> > > same iommu as group A".
> > 
> > It's actually a little more than that.  After you've merged B into A,
> > you can close the file descriptor for B and access all of the devices
> > for the merged group from A.
> 
> It is actually more...
> 
> Scenario 1:
> 
>   create_grp(A)
>   create_grp(B)
>   ...
>   merge_grp(A,B)
>   create_grp(C)
>   merge_grp(C,B) ... this works, right?

No, but merge_grp(B,C) does.  I currently require that the incoming
group has no open device or iommu file descriptors and is a singular
group.  The device/iommu is a hard requirement since we'll be changing
the iommu context and can't leave an attack window.  The singular group
is an implementation detail.  Given the iommu/device requirement, it's
just as easy for userspace to tear apart the group and pass each
individually.

> Scenario 2:
> 
>   create_grp(A)
>   create_grp(B)
>   fd_x = get_dev_fd(B,x)
>   ...
>   merge_grp(A,B)

NAK, fails no open device test.  Again, merge_grp(B,A) is supported.

>   create_grp(C)
>   merge_grp(A,C)

Yep, this works.

>   fd_x = get_dev_fd(C,x) 

Yep, and if x is the same in both cases, you'll get 2 different file
descriptors backed by the same device.

> Those two examples seem to suggest to me more of a list abstraction than a merge abstraction.
> However, if it fits into the agreed syntax/logic it is ok, as long as we document it
> properly.

Can you suggest documentation changes that would make this more clear?

> > > For the same reason, you do not really need to provide the group you
> > want
> > > to unmerge from, which means that instead of
> > >
> > >   unmerge(A,B)
> > >
> > > you would just need
> > >
> > >   unmerge(B)
> > 
> > Good point, we can avoid the awkward reference via file descriptor for
> > the unmerge.
> > 
> > > I understand the reason why it is not a real merge/unmerge (ie, to
> > keep the
> > > original groups so that you can unmerge later)
> > 
> > Right, we still need to have visibility of the groups comprising the
> > merged group, but the abstraction provided to the user seems to be
> > deeper than you're thinking.
> > 
> > >  ... however I just wonder if
> > > it wouldn't be more natural to implement the
> > VFIO_IOMMU_ADD_GROUP/DEL_GROUP
> > > iommu ioctls instead? (the relationships between the data structure
> > would
> > > remain the same)
> > > I guess you already discarded this option for some reasons, right?
> > What was
> > > the reason?
> > 
> > It's a possibility, I'm not sure it was discussed or really what
> > advantage it provides.  It seems like we'd logically lose the ability
> > to
> > access devices from other groups,
> 
> What is the real (immediate) benefit of this capability?

Mostly convenience, but also promotes the peer idea where merged groups
simply create a "super" group that can access the iommu and all the
devices of the member groups.  On x86 we expect that merging groups will
always succeed and groups will typically have a single device, so a
driver could merge them all together, throw away all the extra group
file descriptors and manage the whole super group via a single group fd.

> > whether that's good or bad, I don't know.  I think the notion of "merge"
> > promotes the idea that the groups
> > are peers and an iommu_add/del feels a bit more hierarchical.
> 
> I agree. 
<snip>
> > > > +	if (!device) {
> > > > +		if (__vfio_group_devs_inuse(group) ||
> > > > +		    (group->iommu && group->iommu->refcnt)) {
> > > > +			printk(KERN_WARNING
> > > > +			       "Adding device %s to group %u while group is
> > > > already in use!!\n",
> > > > +			       dev_name(dev), group->groupid);
> > > > +			/* XXX How to prevent other drivers from claiming? */
> > >
> > > Here we are adding a device (not yet assigned to a vfio bus) to a
> > group
> > > that is already in use.
> > > Given that it would not be acceptable for this device to get assigned
> > > to a non vfio driver, why not forcing such assignment here then?
> > 
> > Exactly, I just don't know the mechanics of how to make that happen and
> > was hoping for suggestions...
> > 
> > > I am not sure though what the best way to do it would be.
> > > What about something like this:
> > >
> > > - when the bus vfio-pci processes the BUS_NOTIFY_ADD_DEVICE
> > >   notification it assigns to the device a PCI ID that will make sure
> > >   the vfio-pci's probe routine will be invoked (and no other driver
> > can
> > >   therefore claim the device). That PCI ID would have to be added
> > >   to the vfio_pci_driver's id_table (it would be the exception to the
> > >   "only dynamic IDs" rule). Too hackish?
> > 
> > Presumably some other driver also has the ID in it's id_table, how do
> > we make sure we win?
> 
> By mangling such an ID (when processing the BUS_NOTIFY_ADD_DEVICE notification) to
> match against a 'fake' ID registered in the vfio-pci table (it would be like a
> sort of driver redirect/divert). The vfio-pci probe routine would restore
> the original ID (we do not want to confuse userspace). This is hackish, I agree.
> 
> What about this:
> - When vfio-pci processes the BUS_NOTIFY_ADD_DEVICE notification it can
>   pre-initialize the driver pointer (via an API). We would then need to change
>   the match/probe PCI mechanism too: for example, the PCI core will have to check
>   and honor such pre-driver-initialization when present (and give it higher
>   priority over the match callbacks).
>   How to do this? For example, when vfio_group_add_dev is invoked, it checks
>   whether the device is getting added to an already existent group where
>   the other devices (well, you would need to check just one of the devices in
>   the group) are already assigned to vfio-pci, and in such a case it
>   pre-initializes the driver to vfio-pci.

It's ok to make a group "non-viable"; we only want to intervene if the
iommu is in use (iommu or device refcnt > 0).

> 
> NOTE: By "preinit" I mean "save into the device a reference to a driver before
>       the 'match' callbacks".
> 
> This would be the timeline:
> 
> |
> +-> new device gets added to (PCI) bus
> |
> +-> PCI: send BUS_NOTIFIER_ADD_DEVICE notification
> |
> +-> VFIO:vfio_pci_device_notifier
> |        |
> |        +-> BUS_NOTIFIER_ADD_DEVICE: vfio_group_add_dev
> |            |
> |            +->iommu_device_group(dev,&groupid)
> |            +->group = <search groupid in vfio.group_list>
> |            +->if (group && group_is_vfio(group))
> |            |        <preinit device driver to vfio-pci>
> |            ...
> |
> +-> PCI: xxx
> |        |
> |        +-> if (!device_driver_is_preinit(dev))
> |        |       probe=<search driver's probe callback using 'match'>
> |        |   else 
> |        |       probe=<get it from preint driver config>
> |        |       (+fallback to 'match' if preinit driver disappeared?)
> |        |   
> |        +-> rc = probe(...)
> |        |
> |        ...
> v
> ...
> 
> Of course, what if multiple drivers decide to preinit the device?

Yep, we'd have to have a policy to BUG_ON if the preinit driver is
already set.

> One way to make it cleaner would be to:
> - have the PCI layer export an API that allows (for example) the bus
>   notification callbacks (like vfio_pci_device_notifier) to preinit a driver
> - make such an API reject calls on devices that already have a preinit
>   driver.
> - make VFIO detect the case where vfio_pci_device_notifier can not
>   preinit the driver (to vfio-pci) for the new device (because already
>   preinited) and raise an error/warning.
> 
> Would this look a bit cleaner?

It looks like there might already be infrastructure that lets us set
dev->driver and call the driver probe() function, so maybe we're only in
trouble if dev->driver is already set when we get the bus add
notification.  I just wasn't sure if that was entirely kosher.  I'll
have to try that and figure out how to test it; fake hotplug maybe.

<snip>
> > > This fn below does not return any error code. Ok ...
> > > However, there are a number of errors case that you test, for example
> > > - device that does not belong to any group (according to iommu API)
> > > - device that belongs to a group but that does not appear in the list
> > >   of devices of the vfio_group structure.
> > > Are the above two error checks just paranoia or are those errors
> > > actually possible?
> > > If they were possible, shouldn't we generate a warning (most probably
> > > it would be a bug in the code)?
> > 
> > They're all vfio-bus driver bugs of some sort, so it's just a matter of
> > how much we want to scream about them.  I'll comment on each below.
> > 
> > > > +void vfio_group_del_dev(struct device *dev)
> > > > +{
> > > > +	struct list_head *pos;
> > > > +	struct vfio_group *group = NULL;
> > > > +	struct vfio_device *device = NULL;
> > > > +	unsigned int groupid;
> > > > +
> > > > +	if (iommu_device_group(dev, &groupid))
> > > > +		return;
> > 
> > Here the bus driver is probably just sitting on a notifier list for
> > their bus_type and a device is getting removed.  Unless we want to
> > require the bus driver to track everything it's attempted to add and
> > whether it worked, we can just ignore this.
> 
> OK, I see what you mean. If vfio_group_add_dev fails for some reason we
> do not keep track of it. Right?

The primary thing I'm thinking of here is not vfio_group_add_dev()
failing for "some reason", but specifically failing because the device
doesn't have a groupid, ie. it's not behind an iommu.  In that case it's
just a random device that can't be used by vfio.

> Would it make sense to add one special group to vfio.group_list (or better,
> in a separate field of the vfio structure) whose goal
> would be just that: keep track of those devices that failed to be added
> to the VFIO framework (can it help for debugging too?)?

For the above case, no, we shouldn't need to track those.  But it does
seem like there's a gap for devices that fail vfio_group_add_dev() for
other reasons.  I don't think we want a special group for them, because
that isolates them from other devices that are potentially in the same
group.  I think instead what we want to do is set a taint flag on the
group.  We can BUG_ON if we can't even allocate a group, then WARN_ON
if we fail elsewhere and mark the group tainted so it's effectively
never viable.
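
Roughly, as a sketch only (the helpers and the 'tainted' field below are
invented for illustration, they're not in the posted patch):

	group = vfio_group_find(groupid);
	if (!group) {
		group = vfio_group_create(groupid);
		BUG_ON(!group);			/* can't even track the group */
	}
	if (WARN_ON(vfio_group_attach_dev(group, dev, ops)))
		group->tainted = true;		/* group can never become viable */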

<snip>
> > > > +	if (!vfio_device_removeable(device)) {
> > > > +		/* XXX signal for all devices in group to be removed or
> > > > +		 * resort to killing the process holding the device fds.
> > > > +		 * For now just block waiting for releases to wake us. */
> > > > +		wait_event(vfio.release_q, vfio_device_removeable(device));
> > >
> > > Any new idea/proposal on how to handle this situation?
> > > The last one I remember was to leave the soft/hard/etc timeout
> > > handling in userspace and implement it as a sort of policy. Is that
> > > one still the most likely candidate solution to handle this situation?
> > 
> > I haven't heard any new proposals.  I think we need the hard timeout
> > handling in the kernel.  We can't leave it to userspace to decide they
> > get to keep the device.  We could have this tunable via an ioctl, but I
> > don't see how we wouldn't require CAP_SYS_ADMIN (or similar) to tweak
> > it.  I was intending to re-implement the netlink interface to signal the
> > removal, but expect to get allergic reactions to that.
> 
> (I personally like the async netlink signaling, but I am OK with an ioctl based
> mechanism if it provides the same flexibility)
> 
> What would be a reasonable hard timeout?

I think we were looking at 10s of seconds in the old vfio code.  Tough
call though.  We could potentially provide a module_param override so an
admin who trusts their users could set a long or infinite timeout.
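E.g. something like (sketch only; the parameter name below is invented):

	static unsigned int remove_timeout = 30; /* seconds, 0 = wait forever */
	module_param(remove_timeout, uint, 0644);
	MODULE_PARM_DESC(remove_timeout,
		"Seconds to wait for userspace to release a removed device");

and then where we currently do the bare wait_event():

	if (remove_timeout)
		wait_event_timeout(vfio.release_q,
				   vfio_device_removeable(device),
				   remove_timeout * HZ);
	else
		wait_event(vfio.release_q, vfio_device_removeable(device));

Thanks,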

Alex
Christian Benvenuti (benve) Nov. 10, 2011, 12:57 a.m. UTC | #7
Here are a few minor comments on vfio_iommu.c ...

> diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c

> new file mode 100644

> index 0000000..029dae3

> --- /dev/null

> +++ b/drivers/vfio/vfio_iommu.c

> @@ -0,0 +1,530 @@

> +/*

> + * VFIO: IOMMU DMA mapping support

> + *

> + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.

> + *     Author: Alex Williamson <alex.williamson@redhat.com>

> + *

> + * This program is free software; you can redistribute it and/or

> modify

> + * it under the terms of the GNU General Public License version 2 as

> + * published by the Free Software Foundation.

> + *

> + * Derived from original vfio:

> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.

> + * Author: Tom Lyon, pugs@cisco.com

> + */

> +

> +#include <linux/compat.h>

> +#include <linux/device.h>

> +#include <linux/fs.h>

> +#include <linux/iommu.h>

> +#include <linux/module.h>

> +#include <linux/mm.h>

> +#include <linux/sched.h>

> +#include <linux/slab.h>

> +#include <linux/uaccess.h>

> +#include <linux/vfio.h>

> +#include <linux/workqueue.h>

> +

> +#include "vfio_private.h"


Doesn't the 'dma_'  prefix belong to the generic DMA code?

> +struct dma_map_page {

> +	struct list_head	list;

> +	dma_addr_t		daddr;

> +	unsigned long		vaddr;

> +	int			npage;

> +	int			rdwr;

> +};

> +

> +/*

> + * This code handles mapping and unmapping of user data buffers

> + * into DMA'ble space using the IOMMU

> + */

> +

> +#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)

> +

> +struct vwork {

> +	struct mm_struct	*mm;

> +	int			npage;

> +	struct work_struct	work;

> +};

> +

> +/* delayed decrement for locked_vm */

> +static void vfio_lock_acct_bg(struct work_struct *work)

> +{

> +	struct vwork *vwork = container_of(work, struct vwork, work);

> +	struct mm_struct *mm;

> +

> +	mm = vwork->mm;

> +	down_write(&mm->mmap_sem);

> +	mm->locked_vm += vwork->npage;

> +	up_write(&mm->mmap_sem);

> +	mmput(mm);		/* unref mm */

> +	kfree(vwork);

> +}

> +

> +static void vfio_lock_acct(int npage)

> +{

> +	struct vwork *vwork;

> +	struct mm_struct *mm;

> +

> +	if (!current->mm) {

> +		/* process exited */

> +		return;

> +	}

> +	if (down_write_trylock(&current->mm->mmap_sem)) {

> +		current->mm->locked_vm += npage;

> +		up_write(&current->mm->mmap_sem);

> +		return;

> +	}

> +	/*

> +	 * Couldn't get mmap_sem lock, so must setup to decrement

                                                      ^^^^^^^^^

Increment?

> +	 * mm->locked_vm later. If locked_vm were atomic, we wouldn't

> +	 * need this silliness

> +	 */

> +	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);

> +	if (!vwork)

> +		return;

> +	mm = get_task_mm(current);	/* take ref mm */

> +	if (!mm) {

> +		kfree(vwork);

> +		return;

> +	}

> +	INIT_WORK(&vwork->work, vfio_lock_acct_bg);

> +	vwork->mm = mm;

> +	vwork->npage = npage;

> +	schedule_work(&vwork->work);

> +}

> +

> +/* Some mappings aren't backed by a struct page, for example an mmap'd

> + * MMIO range for our own or another device.  These use a different

> + * pfn conversion and shouldn't be tracked as locked pages. */

> +static int is_invalid_reserved_pfn(unsigned long pfn)

> +{

> +	if (pfn_valid(pfn)) {

> +		int reserved;

> +		struct page *tail = pfn_to_page(pfn);

> +		struct page *head = compound_trans_head(tail);

> +		reserved = PageReserved(head);

> +		if (head != tail) {

> +			/* "head" is not a dangling pointer

> +			 * (compound_trans_head takes care of that)

> +			 * but the hugepage may have been split

> +			 * from under us (and we may not hold a

> +			 * reference count on the head page so it can

> +			 * be reused before we run PageReferenced), so

> +			 * we've to check PageTail before returning

> +			 * what we just read.

> +			 */

> +			smp_rmb();

> +			if (PageTail(tail))

> +				return reserved;

> +		}

> +		return PageReserved(tail);

> +	}

> +

> +	return true;

> +}

> +

> +static int put_pfn(unsigned long pfn, int rdwr)

> +{

> +	if (!is_invalid_reserved_pfn(pfn)) {

> +		struct page *page = pfn_to_page(pfn);

> +		if (rdwr)

> +			SetPageDirty(page);

> +		put_page(page);

> +		return 1;

> +	}

> +	return 0;

> +}

> +

> +/* Unmap DMA region */

> +/* dgate must be held */

> +static int __vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long

> iova,

> +			    int npage, int rdwr)

> +{

> +	int i, unlocked = 0;

> +

> +	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {

> +		unsigned long pfn;

> +

> +		pfn = iommu_iova_to_phys(iommu->domain, iova) >>

> PAGE_SHIFT;

> +		if (pfn) {

> +			iommu_unmap(iommu->domain, iova, 0);

> +			unlocked += put_pfn(pfn, rdwr);

> +		}

> +	}

> +	return unlocked;

> +}

> +

> +static void vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long

> iova,

> +			   unsigned long npage, int rdwr)

> +{

> +	int unlocked;

> +

> +	unlocked = __vfio_dma_unmap(iommu, iova, npage, rdwr);

> +	vfio_lock_acct(-unlocked);

> +}

> +

> +/* Unmap ALL DMA regions */

> +void vfio_iommu_unmapall(struct vfio_iommu *iommu)

> +{

> +	struct list_head *pos, *pos2;

> +	struct dma_map_page *mlp;

> +

> +	mutex_lock(&iommu->dgate);

> +	list_for_each_safe(pos, pos2, &iommu->dm_list) {

> +		mlp = list_entry(pos, struct dma_map_page, list);

> +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);

> +		list_del(&mlp->list);

> +		kfree(mlp);

> +	}

> +	mutex_unlock(&iommu->dgate);

> +}

> +

> +static int vaddr_get_pfn(unsigned long vaddr, int rdwr, unsigned long

> *pfn)

> +{

> +	struct page *page[1];

> +	struct vm_area_struct *vma;

> +	int ret = -EFAULT;

> +

> +	if (get_user_pages_fast(vaddr, 1, rdwr, page) == 1) {

> +		*pfn = page_to_pfn(page[0]);

> +		return 0;

> +	}

> +

> +	down_read(&current->mm->mmap_sem);

> +

> +	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);

> +

> +	if (vma && vma->vm_flags & VM_PFNMAP) {

> +		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma-

> >vm_pgoff;

> +		if (is_invalid_reserved_pfn(*pfn))

> +			ret = 0;

> +	}

> +

> +	up_read(&current->mm->mmap_sem);

> +

> +	return ret;

> +}

> +

> +/* Map DMA region */

> +/* dgate must be held */

> +static int vfio_dma_map(struct vfio_iommu *iommu, unsigned long iova,

> +			unsigned long vaddr, int npage, int rdwr)

> +{

> +	unsigned long start = iova;

> +	int i, ret, locked = 0, prot = IOMMU_READ;

> +

> +	/* Verify pages are not already mapped */

> +	for (i = 0; i < npage; i++, iova += PAGE_SIZE)

> +		if (iommu_iova_to_phys(iommu->domain, iova))

> +			return -EBUSY;

> +

> +	iova = start;

> +

> +	if (rdwr)

> +		prot |= IOMMU_WRITE;

> +	if (iommu->cache)

> +		prot |= IOMMU_CACHE;

> +

> +	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr +=

> PAGE_SIZE) {

> +		unsigned long pfn = 0;

> +

> +		ret = vaddr_get_pfn(vaddr, rdwr, &pfn);

> +		if (ret) {

> +			__vfio_dma_unmap(iommu, start, i, rdwr);

> +			return ret;

> +		}

> +

> +		/* Only add actual locked pages to accounting */

> +		if (!is_invalid_reserved_pfn(pfn))

> +			locked++;

> +

> +		ret = iommu_map(iommu->domain, iova,

> +				(phys_addr_t)pfn << PAGE_SHIFT, 0, prot);

> +		if (ret) {

> +			/* Back out mappings on error */

> +			put_pfn(pfn, rdwr);

> +			__vfio_dma_unmap(iommu, start, i, rdwr);

> +			return ret;

> +		}

> +	}

> +	vfio_lock_acct(locked);

> +	return 0;

> +}

> +

> +static inline int ranges_overlap(unsigned long start1, size_t size1,

> +				 unsigned long start2, size_t size2)

> +{

> +	return !(start1 + size1 <= start2 || start2 + size2 <= start1);

> +}

> +

> +static struct dma_map_page *vfio_find_dma(struct vfio_iommu *iommu,

> +					  dma_addr_t start, size_t size)

> +{

> +	struct list_head *pos;

> +	struct dma_map_page *mlp;

> +

> +	list_for_each(pos, &iommu->dm_list) {

> +		mlp = list_entry(pos, struct dma_map_page, list);

> +		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),

> +				   start, size))

> +			return mlp;

> +	}

> +	return NULL;

> +}

> +

> +int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t

> start,

> +			    size_t size, struct dma_map_page *mlp)

> +{

> +	struct dma_map_page *split;

> +	int npage_lo, npage_hi;

> +

> +	/* Existing dma region is completely covered, unmap all */


This works. However, given how vfio_dma_map_dm implements the merging
logic, I think it is impossible to have

    (start < mlp->daddr &&
     start + size > mlp->daddr + NPAGE_TO_SIZE(mlp->npage))


> +	if (start <= mlp->daddr &&

> +	    start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {

> +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);

> +		list_del(&mlp->list);

> +		npage_lo = mlp->npage;

> +		kfree(mlp);

> +		return npage_lo;

> +	}

> +

> +	/* Overlap low address of existing range */


Same as above (ie, '<' is impossible)

> +	if (start <= mlp->daddr) {

> +		size_t overlap;

> +

> +		overlap = start + size - mlp->daddr;

> +		npage_lo = overlap >> PAGE_SHIFT;

> +		npage_hi = mlp->npage - npage_lo;

> +

> +		vfio_dma_unmap(iommu, mlp->daddr, npage_lo, mlp->rdwr);

> +		mlp->daddr += overlap;

> +		mlp->vaddr += overlap;

> +		mlp->npage -= npage_lo;

> +		return npage_lo;

> +	}


Same as above (ie, '>' is impossible).

> +	/* Overlap high address of existing range */

> +	if (start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {

> +		size_t overlap;

> +

> +		overlap = mlp->daddr + NPAGE_TO_SIZE(mlp->npage) - start;

> +		npage_hi = overlap >> PAGE_SHIFT;

> +		npage_lo = mlp->npage - npage_hi;

> +

> +		vfio_dma_unmap(iommu, start, npage_hi, mlp->rdwr);

> +		mlp->npage -= npage_hi;

> +		return npage_hi;

> +	}

> +

> +	/* Split existing */

> +	npage_lo = (start - mlp->daddr) >> PAGE_SHIFT;

> +	npage_hi = mlp->npage - (size >> PAGE_SHIFT) - npage_lo;

> +

> +	split = kzalloc(sizeof *split, GFP_KERNEL);

> +	if (!split)

> +		return -ENOMEM;

> +

> +	vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, mlp->rdwr);

> +

> +	mlp->npage = npage_lo;

> +

> +	split->npage = npage_hi;

> +	split->daddr = start + size;

> +	split->vaddr = mlp->vaddr + NPAGE_TO_SIZE(npage_lo) + size;

> +	split->rdwr = mlp->rdwr;

> +	list_add(&split->list, &iommu->dm_list);

> +	return size >> PAGE_SHIFT;

> +}

> +

> +int vfio_dma_unmap_dm(struct vfio_iommu *iommu, struct vfio_dma_map

> *dmp)

> +{

> +	int ret = 0;

> +	size_t npage = dmp->size >> PAGE_SHIFT;

> +	struct list_head *pos, *n;

> +

> +	if (dmp->dmaaddr & ~PAGE_MASK)

> +		return -EINVAL;

> +	if (dmp->size & ~PAGE_MASK)

> +		return -EINVAL;

> +

> +	mutex_lock(&iommu->dgate);

> +

> +	list_for_each_safe(pos, n, &iommu->dm_list) {

> +		struct dma_map_page *mlp;

> +

> +		mlp = list_entry(pos, struct dma_map_page, list);

> +		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),

> +				   dmp->dmaaddr, dmp->size)) {

> +			ret = vfio_remove_dma_overlap(iommu, dmp->dmaaddr,

> +						      dmp->size, mlp);

> +			if (ret > 0)

> +				npage -= NPAGE_TO_SIZE(ret);

> +			if (ret < 0 || npage == 0)

> +				break;

> +		}

> +	}

> +	mutex_unlock(&iommu->dgate);

> +	return ret > 0 ? 0 : ret;

> +}

> +

> +int vfio_dma_map_dm(struct vfio_iommu *iommu, struct vfio_dma_map

> *dmp)

> +{

> +	int npage;

> +	struct dma_map_page *mlp, *mmlp = NULL;

> +	dma_addr_t daddr = dmp->dmaaddr;

> +	unsigned long locked, lock_limit, vaddr = dmp->vaddr;

> +	size_t size = dmp->size;

> +	int ret = 0, rdwr = dmp->flags & VFIO_DMA_MAP_FLAG_WRITE;

> +

> +	if (vaddr & (PAGE_SIZE-1))

> +		return -EINVAL;

> +	if (daddr & (PAGE_SIZE-1))

> +		return -EINVAL;

> +	if (size & (PAGE_SIZE-1))

> +		return -EINVAL;

> +

> +	npage = size >> PAGE_SHIFT;

> +	if (!npage)

> +		return -EINVAL;

> +

> +	if (!iommu)

> +		return -EINVAL;

> +

> +	mutex_lock(&iommu->dgate);

> +

> +	if (vfio_find_dma(iommu, daddr, size)) {

> +		ret = -EBUSY;

> +		goto out_lock;

> +	}

> +

> +	/* account for locked pages */

> +	locked = current->mm->locked_vm + npage;

> +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

> +	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {

> +		printk(KERN_WARNING "%s: RLIMIT_MEMLOCK (%ld) exceeded\n",

> +			__func__, rlimit(RLIMIT_MEMLOCK));

> +		ret = -ENOMEM;

> +		goto out_lock;

> +	}

> +

> +	ret = vfio_dma_map(iommu, daddr, vaddr, npage, rdwr);

> +	if (ret)

> +		goto out_lock;

> +

> +	/* Check if we abut a region below */


Is !daddr possible?

> +	if (daddr) {

> +		mlp = vfio_find_dma(iommu, daddr - 1, 1);

> +		if (mlp && mlp->rdwr == rdwr &&

> +		    mlp->vaddr + NPAGE_TO_SIZE(mlp->npage) == vaddr) {

> +

> +			mlp->npage += npage;

> +			daddr = mlp->daddr;

> +			vaddr = mlp->vaddr;

> +			npage = mlp->npage;

> +			size = NPAGE_TO_SIZE(npage);

> +

> +			mmlp = mlp;

> +		}

> +	}


Is !(daddr + size) possible?

> +	if (daddr + size) {

> +		mlp = vfio_find_dma(iommu, daddr + size, 1);

> +		if (mlp && mlp->rdwr == rdwr && mlp->vaddr == vaddr + size)

> {

> +

> +			mlp->npage += npage;

> +			mlp->daddr = daddr;

> +			mlp->vaddr = vaddr;

> +

> +			/* If merged above and below, remove previously

> +			 * merged entry.  New entry covers it.  */

> +			if (mmlp) {

> +				list_del(&mmlp->list);

> +				kfree(mmlp);

> +			}

> +			mmlp = mlp;

> +		}

> +	}

> +

> +	if (!mmlp) {

> +		mlp = kzalloc(sizeof *mlp, GFP_KERNEL);

> +		if (!mlp) {

> +			ret = -ENOMEM;

> +			vfio_dma_unmap(iommu, daddr, npage, rdwr);

> +			goto out_lock;

> +		}

> +

> +		mlp->npage = npage;

> +		mlp->daddr = daddr;

> +		mlp->vaddr = vaddr;

> +		mlp->rdwr = rdwr;

> +		list_add(&mlp->list, &iommu->dm_list);

> +	}

> +

> +out_lock:

> +	mutex_unlock(&iommu->dgate);

> +	return ret;

> +}

> +

> +static int vfio_iommu_release(struct inode *inode, struct file *filep)

> +{

> +	struct vfio_iommu *iommu = filep->private_data;

> +

> +	vfio_release_iommu(iommu);

> +	return 0;

> +}

> +

> +static long vfio_iommu_unl_ioctl(struct file *filep,

> +				 unsigned int cmd, unsigned long arg)

> +{

> +	struct vfio_iommu *iommu = filep->private_data;

> +	int ret = -ENOSYS;


Any reason for not using "switch" ?
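
Something like this, i.e. the same behaviour just rewritten as a switch:

	switch (cmd) {
	case VFIO_IOMMU_GET_FLAGS: {
		u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;

		ret = put_user(flags, (u64 __user *)arg);
		break;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_dma_map dm;

		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
			return -EFAULT;

		ret = vfio_dma_map_dm(iommu, &dm);

		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
			ret = -EFAULT;
		break;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_dma_map dm;

		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
			return -EFAULT;

		ret = vfio_dma_unmap_dm(iommu, &dm);

		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
			ret = -EFAULT;
		break;
	}
	}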

> +        if (cmd == VFIO_IOMMU_GET_FLAGS) {

> +                u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;

> +

> +                ret = put_user(flags, (u64 __user *)arg);

> +

> +        } else if (cmd == VFIO_IOMMU_MAP_DMA) {

> +		struct vfio_dma_map dm;

> +

> +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))

> +			return -EFAULT;


What does the "_dm" suffix stand for?

> +		ret = vfio_dma_map_dm(iommu, &dm);

> +

> +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof

> dm))

> +			ret = -EFAULT;

> +

> +	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {

> +		struct vfio_dma_map dm;

> +

> +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))

> +			return -EFAULT;

> +

> +		ret = vfio_dma_unmap_dm(iommu, &dm);

> +

> +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof

> dm))

> +			ret = -EFAULT;

> +	}

> +	return ret;

> +}

> +

> +#ifdef CONFIG_COMPAT

> +static long vfio_iommu_compat_ioctl(struct file *filep,

> +				    unsigned int cmd, unsigned long arg)

> +{

> +	arg = (unsigned long)compat_ptr(arg);

> +	return vfio_iommu_unl_ioctl(filep, cmd, arg);

> +}

> +#endif	/* CONFIG_COMPAT */

> +

> +const struct file_operations vfio_iommu_fops = {

> +	.owner		= THIS_MODULE,

> +	.release	= vfio_iommu_release,

> +	.unlocked_ioctl	= vfio_iommu_unl_ioctl,

> +#ifdef CONFIG_COMPAT

> +	.compat_ioctl	= vfio_iommu_compat_ioctl,

> +#endif

> +};


/Chris
Konrad Rzeszutek Wilk Nov. 11, 2011, 5:51 p.m. UTC | #8
On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
> VFIO provides a secure, IOMMU based interface for user space
> drivers, including device assignment to virtual machines.
> This provides the base management of IOMMU groups, devices,
> and IOMMU objects.  See Documentation/vfio.txt included in
> this patch for user and kernel API description.
> 
> Note, this implements the new API discussed at KVM Forum
> 2011, as represented by the drvier version 0.2.  It's hoped
> that this provides a modular enough interface to support PCI
> and non-PCI userspace drivers across various architectures
> and IOMMU implementations.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> Fingers crossed, this is the last RFC for VFIO, but we need
> the iommu group support before this can go upstream
> (http://lkml.indiana.edu/hypermail/linux/kernel/1110.2/02303.html),
> hoping this helps push that along.
> 
> Since the last posting, this version completely modularizes
> the device backends and better defines the APIs between the
> core VFIO code and the device backends.  I expect that we
> might also adopt a modular IOMMU interface as iommu_ops learns
> about different types of hardware.  Also many, many cleanups.
> Check the complete git history for details:
> 
> git://github.com/awilliam/linux-vfio.git vfio-ng
> 
> (matching qemu tree: git://github.com/awilliam/qemu-vfio.git)
> 
> This version, along with the supporting VFIO PCI backend can
> be found here:
> 
> git://github.com/awilliam/linux-vfio.git vfio-next-20111103
> 
> I've held off on implementing a kernel->user signaling
> mechanism for now since the previous netlink version produced
> too many gag reflexes.  It's easy enough to set a bit in the
> group flags too indicate such support in the future, so I
> think we can move ahead without it.
> 
> Appreciate any feedback or suggestions.  Thanks,
> 
> Alex
> 
>  Documentation/ioctl/ioctl-number.txt |    1 
>  Documentation/vfio.txt               |  304 +++++++++
>  MAINTAINERS                          |    8 
>  drivers/Kconfig                      |    2 
>  drivers/Makefile                     |    1 
>  drivers/vfio/Kconfig                 |    8 
>  drivers/vfio/Makefile                |    3 
>  drivers/vfio/vfio_iommu.c            |  530 ++++++++++++++++
>  drivers/vfio/vfio_main.c             | 1151 ++++++++++++++++++++++++++++++++++
>  drivers/vfio/vfio_private.h          |   34 +
>  include/linux/vfio.h                 |  155 +++++
>  11 files changed, 2197 insertions(+), 0 deletions(-)
>  create mode 100644 Documentation/vfio.txt
>  create mode 100644 drivers/vfio/Kconfig
>  create mode 100644 drivers/vfio/Makefile
>  create mode 100644 drivers/vfio/vfio_iommu.c
>  create mode 100644 drivers/vfio/vfio_main.c
>  create mode 100644 drivers/vfio/vfio_private.h
>  create mode 100644 include/linux/vfio.h
> 
> diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
> index 54078ed..59d01e4 100644
> --- a/Documentation/ioctl/ioctl-number.txt
> +++ b/Documentation/ioctl/ioctl-number.txt
> @@ -88,6 +88,7 @@ Code  Seq#(hex)	Include File		Comments
>  		and kernel/power/user.c
>  '8'	all				SNP8023 advanced NIC card
>  					<mailto:mcr@solidum.com>
> +';'	64-76	linux/vfio.h
>  '@'	00-0F	linux/radeonfb.h	conflict!
>  '@'	00-0F	drivers/video/aty/aty128fb.c	conflict!
>  'A'	00-1F	linux/apm_bios.h	conflict!
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> new file mode 100644
> index 0000000..5866896
> --- /dev/null
> +++ b/Documentation/vfio.txt
> @@ -0,0 +1,304 @@
> +VFIO - "Virtual Function I/O"[1]
> +-------------------------------------------------------------------------------
> +Many modern system now provide DMA and interrupt remapping facilities
> +to help ensure I/O devices behave within the boundaries they've been
> +allotted.  This includes x86 hardware with AMD-Vi and Intel VT-d as
> +well as POWER systems with Partitionable Endpoints (PEs) and even
> +embedded powerpc systems (technology name unknown).  The VFIO driver
> +is an IOMMU/device agnostic framework for exposing direct device
> +access to userspace, in a secure, IOMMU protected environment.  In
> +other words, this allows safe, non-privileged, userspace drivers.
> +
> +Why do we want that?  Virtual machines often make use of direct device
> +access ("device assignment") when configured for the highest possible
> +I/O performance.  From a device and host perspective, this simply turns
> +the VM into a userspace driver, with the benefits of significantly
> +reduced latency, higher bandwidth, and direct use of bare-metal device
> +drivers[2].

Are there any constraints on running a 32-bit userspace with
a 64-bit kernel and with 32-bit user space drivers?

> +
> +Some applications, particularly in the high performance computing
> +field, also benefit from low-overhead, direct device access from
> +userspace.  Examples include network adapters (often non-TCP/IP based)
> +and compute accelerators.  Previous to VFIO, these drivers needed to
> +go through the full development cycle to become proper upstream driver,
> +be maintained out of tree, or make use of the UIO framework, which
> +has no notion of IOMMU protection, limited interrupt support, and
> +requires root privileges to access things like PCI configuration space.
> +
> +The VFIO driver framework intends to unify these, replacing both the
> +KVM PCI specific device assignment currently used as well as provide
> +a more secure, more featureful userspace driver environment than UIO.
> +
> +Groups, Devices, IOMMUs, oh my

<chuckles> oh my, eh?

> +-------------------------------------------------------------------------------
> +
> +A fundamental component of VFIO is the notion of IOMMU groups.  IOMMUs
> +can't always distinguish transactions from each individual device in
> +the system.  Sometimes this is because of the IOMMU design, such as with
> +PEs, other times it's caused by the I/O topology, for instance a
> +PCIe-to-PCI bridge masking all devices behind it.  We call the sets of
> +devices created by these restictions IOMMU groups (or just "groups" for
> +this document).
> +
> +The IOMMU cannot distiguish transactions between the individual devices
> +within the group, therefore the group is the basic unit of ownership for
> +a userspace process.  Because of this, groups are also the primary
> +interface to both devices and IOMMU domains in VFIO.
> +
> +The VFIO representation of groups is created as devices are added into
> +the framework by a VFIO bus driver.  The vfio-pci module is an example
> +of a bus driver.  This module registers devices along with a set of bus
> +specific callbacks with the VFIO core.  These callbacks provide the
> +interfaces later used for device access.  As each new group is created,
> +as determined by iommu_device_group(), VFIO creates a /dev/vfio/$GROUP
> +character device.
> +
> +In addition to the device enumeration and callbacks, the VFIO bus driver
> +also provides a traditional device driver and is able to bind to devices
> +on it's bus.  When a device is bound to the bus driver it's available to
> +VFIO.  When all the devices within a group are bound to their bus drivers,
> +the group becomes "viable" and a user with sufficient access to the VFIO
> +group chardev can obtain exclusive access to the set of group devices.
> +
> +As documented in linux/vfio.h, several ioctls are provided on the
> +group chardev:
> +
> +#define VFIO_GROUP_GET_FLAGS            _IOR(';', 100, __u64)
> + #define VFIO_GROUP_FLAGS_VIABLE        (1 << 0)
> + #define VFIO_GROUP_FLAGS_MM_LOCKED     (1 << 1)
> +#define VFIO_GROUP_MERGE                _IOW(';', 101, int)
> +#define VFIO_GROUP_UNMERGE              _IOW(';', 102, int)
> +#define VFIO_GROUP_GET_IOMMU_FD         _IO(';', 103)
> +#define VFIO_GROUP_GET_DEVICE_FD        _IOW(';', 104, char *)
> +
> +The last two ioctls return new file descriptors for accessing
> +individual devices within the group and programming the IOMMU.  Each of
> +these new file descriptors provide their own set of file interfaces.
> +These ioctls will fail if any of the devices within the group are not
> +bound to their VFIO bus driver.  Additionally, when either of these
> +interfaces are used, the group is then bound to the struct_mm of the
> +caller.  The GET_FLAGS ioctl can be used to view the state of the group.
> +
> +When either the GET_IOMMU_FD or GET_DEVICE_FD ioctls are invoked, a
> +new IOMMU domain is created and all of the devices in the group are
> +attached to it.  This is the only way to ensure full IOMMU isolation
> +of the group, but potentially wastes resources and cycles if the user
> +intends to manage multiple groups with the same set of IOMMU mappings.
> +VFIO therefore provides a group MERGE and UNMERGE interface, which
> +allows multiple groups to share an IOMMU domain.  Not all IOMMUs allow
> +arbitrary groups to be merged, so the user should assume merging is
> +opportunistic.  A new group, with no open device or IOMMU file
> +descriptors, can be merged into an existing, in-use, group using the
> +MERGE ioctl.  A merged group can be unmerged using the UNMERGE ioctl
> +once all of the device file descriptors for the group being merged
> +"out" are closed.
> +
> +When groups are merged, the GET_IOMMU_FD and GET_DEVICE_FD ioctls are
> +essentially fungible between group file descriptors (ie. if device A
> +is in group X, and X is merged with Y, a file descriptor for A can be
> +retrieved using GET_DEVICE_FD on Y.  Likewise, GET_IOMMU_FD returns a
> +file descriptor referencing the same internal IOMMU object from either
> +X or Y).  Merged groups can be dissolved either explictly with UNMERGE
> +or automatically when ALL file descriptors for the merged group are
> +closed (all IOMMUs, all devices, all groups).
> +
> +The IOMMU file descriptor provides this set of ioctls:
> +
> +#define VFIO_IOMMU_GET_FLAGS            _IOR(';', 105, __u64)
> + #define VFIO_IOMMU_FLAGS_MAP_ANY       (1 << 0)
> +#define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
> +#define VFIO_IOMMU_UNMAP_DMA            _IOWR(';', 107, struct vfio_dma_map)

Coherency support is not going to be addressed, right? What about sync?
Say you need to sync a CPU address to a device address?

> +
> +The GET_FLAGS ioctl returns basic information about the IOMMU domain.
> +We currently only support IOMMU domains that are able to map any
> +virtual address to any IOVA.  This is indicated by the MAP_ANY flag.
> +
> +The (UN)MAP_DMA commands make use of struct vfio_dma_map for mapping
> +and unmapping IOVAs to process virtual addresses:
> +
> +struct vfio_dma_map {
> +        __u64   len;            /* length of structure */

What is the purpose of the 'len' field? Is it to guard against future
version changes?

> +        __u64   vaddr;          /* process virtual addr */
> +        __u64   dmaaddr;        /* desired and/or returned dma address */
> +        __u64   size;           /* size in bytes */
> +        __u64   flags;
> +#define VFIO_DMA_MAP_FLAG_WRITE         (1 << 0) /* req writeable DMA mem */
> +};
> +
> +Current users of VFIO use relatively static DMA mappings, not requiring
> +high frequency turnover.  As new users are added, it's expected that the

Is there a limit to how many DMA mappings can be created?

> +IOMMU file descriptor will evolve to support new mapping interfaces, this
> +will be reflected in the flags and may present new ioctls and file
> +interfaces.
> +
> +The device GET_FLAGS ioctl is intended to return basic device type and
> +indicate support for optional capabilities.  Flags currently include whether
> +the device is PCI or described by Device Tree, and whether the RESET ioctl
> +is supported:

And reset in terms of the PCIe spec is the FLR?

> +
> +#define VFIO_DEVICE_GET_FLAGS           _IOR(';', 108, __u64)
> + #define VFIO_DEVICE_FLAGS_PCI          (1 << 0)
> + #define VFIO_DEVICE_FLAGS_DT           (1 << 1)
> + #define VFIO_DEVICE_FLAGS_RESET        (1 << 2)
> +
> +The MMIO and IOP resources used by a device are described by regions.

IOP?

> +The GET_NUM_REGIONS ioctl tells us how many regions the device supports:
> +
> +#define VFIO_DEVICE_GET_NUM_REGIONS     _IOR(';', 109, int)

Don't want __u32?
> +
> +Regions are described by a struct vfio_region_info, which is retrieved by
> +using the GET_REGION_INFO ioctl with vfio_region_info.index field set to
> +the desired region (0 based index).  Note that devices may implement zero
> +sized regions (vfio-pci does this to provide a 1:1 BAR to region index
> +mapping).

Huh?

> +
> +struct vfio_region_info {
> +        __u32   len;            /* length of structure */
> +        __u32   index;          /* region number */
> +        __u64   size;           /* size in bytes of region */
> +        __u64   offset;         /* start offset of region */
> +        __u64   flags;
> +#define VFIO_REGION_INFO_FLAG_MMAP              (1 << 0)
> +#define VFIO_REGION_INFO_FLAG_RO                (1 << 1)
> +#define VFIO_REGION_INFO_FLAG_PHYS_VALID        (1 << 2)

What is FLAG_MMAP? Does it mean: 1) it can be mmap'ed, or 2) it is mmap'ed?
FLAG_RO is pretty obvious - presumably this is for firmware regions and such.
And does PHYS_VALID mean the region is disabled for some reason? If so,
would the name FLAG_DISABLED be better?

> +        __u64   phys;           /* physical address of region */
> +};
> +
> +#define VFIO_DEVICE_GET_REGION_INFO     _IOWR(';', 110, struct vfio_region_info)
> +
> +The offset indicates the offset into the device file descriptor which
> +accesses the given range (for read/write/mmap/seek).  Flags indicate the
> +available access types and validity of optional fields.  For instance
> +the phys field may only be valid for certain devices types.
> +
> +Interrupts are described using a similar interface.  GET_NUM_IRQS
> +reports the number or IRQ indexes for the device.
> +
> +#define VFIO_DEVICE_GET_NUM_IRQS        _IOR(';', 111, int)

__u32?

> +
> +struct vfio_irq_info {
> +        __u32   len;            /* length of structure */
> +        __u32   index;          /* IRQ number */
> +        __u32   count;          /* number of individual IRQs */
> +        __u64   flags;
> +#define VFIO_IRQ_INFO_FLAG_LEVEL                (1 << 0)
> +};
> +
> +Again, zero count entries are allowed (vfio-pci uses a static interrupt
> +type to index mapping).

I am not really sure what that means.

> +
> +Information about each index can be retrieved using the GET_IRQ_INFO
> +ioctl, used much like GET_REGION_INFO.
> +
> +#define VFIO_DEVICE_GET_IRQ_INFO        _IOWR(';', 112, struct vfio_irq_info)
> +
> +Individual indexes can describe single or sets of IRQs.  This provides the
> +flexibility to describe PCI INTx, MSI, and MSI-X using a single interface.
> +
> +All VFIO interrupts are signaled to userspace via eventfds.  Integer arrays,
> +as shown below, are used to pass the IRQ info index, the number of eventfds,
> +and each eventfd to be signaled.  Using a count of 0 disables the interrupt.
> +
> +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */

Are eventfds u64 or u32?

Why not just define a structure?
struct vfio_irq_eventfds {
	__u32	index;
	__u32	count;
	__u64	eventfds[0];
};

How do you get an eventfd to feed in here?
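
(I assume the answer is eventfd(2) from userspace, i.e. something like
this -- an untested guess at the int-array convention described above,
using eventfd() from <sys/eventfd.h>:

	int fds[4];

	fds[0] = irq_index;		/* index from GET_IRQ_INFO */
	fds[1] = 2;			/* count: two eventfds follow */
	fds[2] = eventfd(0, 0);
	fds[3] = eventfd(0, 0);

	ioctl(device_fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds);

but spelling that out in the doc would help.)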

> +#define VFIO_DEVICE_SET_IRQ_EVENTFDS    _IOW(';', 113, int)

u32?
> +
> +When a level triggered interrupt is signaled, the interrupt is masked
> +on the host.  This prevents an unresponsive userspace driver from
> +continuing to interrupt the host system.  After servicing the interrupt,
> +UNMASK_IRQ is used to allow the interrupt to retrigger.  Note that level
> +triggered interrupts implicitly have a count of 1 per index.

So they are enabled automatically? Meaning you don't even have to do
SET_IRQ_EVENTFDS b/c the count is set to 1?

> +
> +/* Unmask IRQ index, arg[0] = index */
> +#define VFIO_DEVICE_UNMASK_IRQ          _IOW(';', 114, int)

So this is for MSI as well? So if I have an index = 1, with count = 4,
does unmasking the IRQ enable all of the MSI events at once?

I guess there is not much point in enabling/disabling individual MSI
IRQs...

> +
> +Level triggered interrupts can also be unmasked using an irqfd.  Use

irqfd or eventfd?

> +SET_UNMASK_IRQ_EVENTFD to set the file descriptor for this.

So only level triggered? Hmm, how do I know whether the device is
level or edge? Or is it that edge (MSI) can also be unmasked using the
eventfd?

> +
> +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
> +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD      _IOW(';', 115, int)
> +
> +When supported, as indicated by the device flags, reset the device.
> +
> +#define VFIO_DEVICE_RESET               _IO(';', 116)

Does it disable the 'count'? Err, does it disable the IRQs on the
device after this, so one should call VFIO_DEVICE_SET_IRQ_EVENTFDS
to set new eventfds? Or does it re-use the eventfds and leave the
device enabled after this?


> +
> +Device tree devices also invlude ioctls for further defining the

include

> +device tree properties of the device:
> +
> +struct vfio_dtpath {
> +        __u32   len;            /* length of structure */
> +        __u32   index;

0 based I presume?
> +        __u64   flags;
> +#define VFIO_DTPATH_FLAGS_REGION        (1 << 0)

What is a region in this context? Maybe this would make much more sense
if I knew what Device Tree actually is.

> +#define VFIO_DTPATH_FLAGS_IRQ           (1 << 1)
> +        char    *path;

Ah, now I see why you want 'len' here... But I am still at a loss
as to why you want it in the other structures.

> +};
> +#define VFIO_DEVICE_GET_DTPATH          _IOWR(';', 117, struct vfio_dtpath)
> +
> +struct vfio_dtindex {
> +        __u32   len;            /* length of structure */
> +        __u32   index;
> +        __u32   prop_type;

Is that an enum type? Is this defined somewhere?
> +        __u32   prop_index;

What is the purpose of this field?

> +        __u64   flags;
> +#define VFIO_DTINDEX_FLAGS_REGION       (1 << 0)
> +#define VFIO_DTINDEX_FLAGS_IRQ          (1 << 1)
> +};
> +#define VFIO_DEVICE_GET_DTINDEX         _IOWR(';', 118, struct vfio_dtindex)
> +
> +
> +VFIO bus driver API
> +-------------------------------------------------------------------------------
> +
> +Bus drivers, such as PCI, have three jobs:
> + 1) Add/remove devices from vfio
> + 2) Provide vfio_device_ops for device access
> + 3) Device binding and unbinding

suspend/resume?

> +
> +When initialized, the bus driver should enumerate the devices on it's
> +bus and call vfio_group_add_dev() for each device.  If the bus supports
> +hotplug, notifiers should be enabled to track devices being added and
> +removed.  vfio_group_del_dev() removes a previously added device from
> +vfio.
> +
> +Adding a device registers a vfio_device_ops function pointer structure
> +for the device:

Huh? So this gets created for _every_ 'struct device' that is added to
the VFIO bus? Is this structure exposed? Or is this an internal one?

> +
> +struct vfio_device_ops {
> +	bool			(*match)(struct device *, char *);
> +	int			(*get)(void *);
> +	void			(*put)(void *);
> +	ssize_t			(*read)(void *, char __user *,
> +					size_t, loff_t *);
> +	ssize_t			(*write)(void *, const char __user *,
> +					 size_t, loff_t *);
> +	long			(*ioctl)(void *, unsigned int, unsigned long);
> +	int			(*mmap)(void *, struct vm_area_struct *);
> +};
> +
> +When a device is bound to the bus driver, the bus driver indicates this
> +to vfio using the vfio_bind_dev() interface.  The device_data parameter

Might want to paste the function declaration for it... b/c I am not sure
where the 'device_data' parameter is in the argument list.
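
(Presumably something like:

	extern int vfio_bind_dev(struct device *dev, void *device_data);
	extern void *vfio_unbind_dev(struct device *dev);

-- just my guess from the text; quoting the real prototypes from
include/linux/vfio.h here would help.)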

> +is a pointer to an opaque data structure for use only by the bus driver.
> +The get, put, read, write, ioctl, and mmap vfio_device_ops all pass
> +this data structure back to the bus driver.  When a device is unbound

Oh, so it is on the 'void *'.
> +from the bus driver, the vfio_unbind_dev() interface signals this to
> +vfio.  This function returns the pointer to the device_data structure

That function
> +registered for the device.

I am not really sure what this section's purpose is. Could this be part
of the header file or the code? It does not look to be part of the
ioctl API.

> +
> +As noted previously, a group contains one or more devices, so
> +GROUP_GET_DEVICE_FD needs to identify the specific device being requested.
> +The vfio_device_ops.match callback is used to allow bus drivers to determine
> +the match.  For drivers like vfio-pci, it's a simple match to dev_name(),
> +which is unique in the system due to the PCI bus topology, other bus drivers
> +may need to include parent devices to create a unique match, so this is
> +left as a bus driver interface.
> +
> +-------------------------------------------------------------------------------
> +
> +[1] VFIO was originally an acronym for "Virtual Function I/O" in it's
> +initial implementation by Tom Lyon while as Cisco.  We've since outgrown
> +the acronym, but it's catchy.
> +
> +[2] As always there are trade-offs to virtual machine device
> +assignment that are beyond the scope of VFIO.  It's expected that
> +future IOMMU technologies will reduce some, but maybe not all, of
> +these trade-offs.
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f05f5f6..4bd5aa0 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -7106,6 +7106,14 @@ S:	Maintained
>  F:	Documentation/filesystems/vfat.txt
>  F:	fs/fat/
>  
> +VFIO DRIVER
> +M:	Alex Williamson <alex.williamson@redhat.com>
> +L:	kvm@vger.kernel.org

No vfio mailing list? Or a vfio-mailing list? 
> +S:	Maintained
> +F:	Documentation/vfio.txt
> +F:	drivers/vfio/
> +F:	include/linux/vfio.h
> +
>  VIDEOBUF2 FRAMEWORK
>  M:	Pawel Osciak <pawel@osciak.com>
>  M:	Marek Szyprowski <m.szyprowski@samsung.com>
> diff --git a/drivers/Kconfig b/drivers/Kconfig
> index b5e6f24..e15578b 100644
> --- a/drivers/Kconfig
> +++ b/drivers/Kconfig
> @@ -112,6 +112,8 @@ source "drivers/auxdisplay/Kconfig"
>  
>  source "drivers/uio/Kconfig"
>  
> +source "drivers/vfio/Kconfig"
> +
>  source "drivers/vlynq/Kconfig"
>  
>  source "drivers/virtio/Kconfig"
> diff --git a/drivers/Makefile b/drivers/Makefile
> index 1b31421..5f138b5 100644
> --- a/drivers/Makefile
> +++ b/drivers/Makefile
> @@ -58,6 +58,7 @@ obj-$(CONFIG_ATM)		+= atm/
>  obj-$(CONFIG_FUSION)		+= message/
>  obj-y				+= firewire/
>  obj-$(CONFIG_UIO)		+= uio/
> +obj-$(CONFIG_VFIO)		+= vfio/
>  obj-y				+= cdrom/
>  obj-y				+= auxdisplay/
>  obj-$(CONFIG_PCCARD)		+= pcmcia/
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> new file mode 100644
> index 0000000..9acb1e7
> --- /dev/null
> +++ b/drivers/vfio/Kconfig
> @@ -0,0 +1,8 @@
> +menuconfig VFIO
> +	tristate "VFIO Non-Privileged userspace driver framework"
> +	depends on IOMMU_API
> +	help
> +	  VFIO provides a framework for secure userspace device drivers.
> +	  See Documentation/vfio.txt for more details.
> +
> +	  If you don't know what to do here, say N.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> new file mode 100644
> index 0000000..088faf1
> --- /dev/null
> +++ b/drivers/vfio/Makefile
> @@ -0,0 +1,3 @@
> +vfio-y := vfio_main.o vfio_iommu.o
> +
> +obj-$(CONFIG_VFIO) := vfio.o
> diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
> new file mode 100644
> index 0000000..029dae3
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu.c
> @@ -0,0 +1,530 @@
> +/*
> + * VFIO: IOMMU DMA mapping support
> + *
> + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio:
> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> + * Author: Tom Lyon, pugs@cisco.com
> + */
> +
> +#include <linux/compat.h>
> +#include <linux/device.h>
> +#include <linux/fs.h>
> +#include <linux/iommu.h>
> +#include <linux/module.h>
> +#include <linux/mm.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/vfio.h>
> +#include <linux/workqueue.h>
> +
> +#include "vfio_private.h"
> +
> +struct dma_map_page {
> +	struct list_head	list;
> +	dma_addr_t		daddr;
> +	unsigned long		vaddr;
> +	int			npage;
> +	int			rdwr;

rdwr? Is this a flag thing? Could it be made an enum?
> +};
> +
> +/*
> + * This code handles mapping and unmapping of user data buffers
> + * into DMA'ble space using the IOMMU
> + */
> +
> +#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
> +
> +struct vwork {
> +	struct mm_struct	*mm;
> +	int			npage;
> +	struct work_struct	work;
> +};
> +
> +/* delayed decrement for locked_vm */
> +static void vfio_lock_acct_bg(struct work_struct *work)
> +{
> +	struct vwork *vwork = container_of(work, struct vwork, work);
> +	struct mm_struct *mm;
> +
> +	mm = vwork->mm;
> +	down_write(&mm->mmap_sem);
> +	mm->locked_vm += vwork->npage;
> +	up_write(&mm->mmap_sem);
> +	mmput(mm);		/* unref mm */
> +	kfree(vwork);
> +}
> +
> +static void vfio_lock_acct(int npage)
> +{
> +	struct vwork *vwork;
> +	struct mm_struct *mm;
> +
> +	if (!current->mm) {
> +		/* process exited */
> +		return;
> +	}
> +	if (down_write_trylock(&current->mm->mmap_sem)) {
> +		current->mm->locked_vm += npage;
> +		up_write(&current->mm->mmap_sem);
> +		return;
> +	}
> +	/*
> +	 * Couldn't get mmap_sem lock, so must setup to decrement
> +	 * mm->locked_vm later. If locked_vm were atomic, we wouldn't
> +	 * need this silliness
> +	 */
> +	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> +	if (!vwork)
> +		return;
> +	mm = get_task_mm(current);	/* take ref mm */
> +	if (!mm) {
> +		kfree(vwork);
> +		return;
> +	}
> +	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
> +	vwork->mm = mm;
> +	vwork->npage = npage;
> +	schedule_work(&vwork->work);
> +}
> +
> +/* Some mappings aren't backed by a struct page, for example an mmap'd
> + * MMIO range for our own or another device.  These use a different
> + * pfn conversion and shouldn't be tracked as locked pages. */
> +static int is_invalid_reserved_pfn(unsigned long pfn)

static bool

> +{
> +	if (pfn_valid(pfn)) {
> +		int reserved;
> +		struct page *tail = pfn_to_page(pfn);
> +		struct page *head = compound_trans_head(tail);
> +		reserved = PageReserved(head);

bool reserved = PageReserved(head);


> +		if (head != tail) {
> +			/* "head" is not a dangling pointer
> +			 * (compound_trans_head takes care of that)
> +			 * but the hugepage may have been split
> +			 * from under us (and we may not hold a
> +			 * reference count on the head page so it can
> +			 * be reused before we run PageReferenced), so
> +			 * we've to check PageTail before returning
> +			 * what we just read.
> +			 */
> +			smp_rmb();
> +			if (PageTail(tail))
> +				return reserved;
> +		}
> +		return PageReserved(tail);
> +	}
> +
> +	return true;
> +}
> +
> +static int put_pfn(unsigned long pfn, int rdwr)
> +{
> +	if (!is_invalid_reserved_pfn(pfn)) {
> +		struct page *page = pfn_to_page(pfn);
> +		if (rdwr)
> +			SetPageDirty(page);
> +		put_page(page);
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +/* Unmap DMA region */
> +/* dgate must be held */

dgate?

> +static int __vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
> +			    int npage, int rdwr)
> +{
> +	int i, unlocked = 0;
> +
> +	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> +		unsigned long pfn;
> +
> +		pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
> +		if (pfn) {
> +			iommu_unmap(iommu->domain, iova, 0);

What is the '0' for? Perhaps a comment: /* We only do zero order */

> +			unlocked += put_pfn(pfn, rdwr);
> +		}
> +	}
> +	return unlocked;
> +}
> +
> +static void vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
> +			   unsigned long npage, int rdwr)
> +{
> +	int unlocked;
> +
> +	unlocked = __vfio_dma_unmap(iommu, iova, npage, rdwr);
> +	vfio_lock_acct(-unlocked);
> +}
> +
> +/* Unmap ALL DMA regions */
> +void vfio_iommu_unmapall(struct vfio_iommu *iommu)
> +{
> +	struct list_head *pos, *pos2;

pos2 should probably be just called 'tmp'

> +	struct dma_map_page *mlp;

What does 'mlp' stand for?

mlp -> dma_page ?

> +
> +	mutex_lock(&iommu->dgate);
> +	list_for_each_safe(pos, pos2, &iommu->dm_list) {
> +		mlp = list_entry(pos, struct dma_map_page, list);
> +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);

Uh, so if it did not get put_page() we would still try to delete it?
Couldn't that lead to corruption as the 'mlp' is returned to the pool?

Ah wait, the put_page is on the DMA page, so it is OK to
delete the tracking structure. It would just be a leaked page.
> +		list_del(&mlp->list);
> +		kfree(mlp);
> +	}
> +	mutex_unlock(&iommu->dgate);
> +}
> +
> +static int vaddr_get_pfn(unsigned long vaddr, int rdwr, unsigned long *pfn)
> +{
> +	struct page *page[1];
> +	struct vm_area_struct *vma;
> +	int ret = -EFAULT;
> +
> +	if (get_user_pages_fast(vaddr, 1, rdwr, page) == 1) {
> +		*pfn = page_to_pfn(page[0]);
> +		return 0;
> +	}
> +
> +	down_read(&current->mm->mmap_sem);
> +
> +	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> +
> +	if (vma && vma->vm_flags & VM_PFNMAP) {
> +		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> +		if (is_invalid_reserved_pfn(*pfn))
> +			ret = 0;

Did you mean to break here?

> +	}
> +
> +	up_read(&current->mm->mmap_sem);
> +
> +	return ret;
> +}
> +
> +/* Map DMA region */
> +/* dgate must be held */
> +static int vfio_dma_map(struct vfio_iommu *iommu, unsigned long iova,
> +			unsigned long vaddr, int npage, int rdwr)
> +{
> +	unsigned long start = iova;
> +	int i, ret, locked = 0, prot = IOMMU_READ;
> +
> +	/* Verify pages are not already mapped */

I think a 'that' is missing above.

> +	for (i = 0; i < npage; i++, iova += PAGE_SIZE)
> +		if (iommu_iova_to_phys(iommu->domain, iova))
> +			return -EBUSY;
> +
> +	iova = start;
> +
> +	if (rdwr)
> +		prot |= IOMMU_WRITE;
> +	if (iommu->cache)
> +		prot |= IOMMU_CACHE;
> +
> +	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
> +		unsigned long pfn = 0;
> +
> +		ret = vaddr_get_pfn(vaddr, rdwr, &pfn);
> +		if (ret) {
> +			__vfio_dma_unmap(iommu, start, i, rdwr);
> +			return ret;
> +		}
> +
> +		/* Only add actual locked pages to accounting */
> +		if (!is_invalid_reserved_pfn(pfn))
> +			locked++;
> +
> +		ret = iommu_map(iommu->domain, iova,
> +				(phys_addr_t)pfn << PAGE_SHIFT, 0, prot);

Put a comment by the 0 saying /* order 0 pages only! */

> +		if (ret) {
> +			/* Back out mappings on error */
> +			put_pfn(pfn, rdwr);
> +			__vfio_dma_unmap(iommu, start, i, rdwr);
> +			return ret;
> +		}
> +	}
> +	vfio_lock_acct(locked);
> +	return 0;
> +}
> +
> +static inline int ranges_overlap(unsigned long start1, size_t size1,

Perhaps a bool?

> +				 unsigned long start2, size_t size2)
> +{
> +	return !(start1 + size1 <= start2 || start2 + size2 <= start1);
> +}
> +
> +static struct dma_map_page *vfio_find_dma(struct vfio_iommu *iommu,
> +					  dma_addr_t start, size_t size)
> +{
> +	struct list_head *pos;
> +	struct dma_map_page *mlp;
> +
> +	list_for_each(pos, &iommu->dm_list) {
> +		mlp = list_entry(pos, struct dma_map_page, list);
> +		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> +				   start, size))
> +			return mlp;
> +	}
> +	return NULL;
> +}
> +
> +int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
> +			    size_t size, struct dma_map_page *mlp)
> +{
> +	struct dma_map_page *split;
> +	int npage_lo, npage_hi;
> +
> +	/* Existing dma region is completely covered, unmap all */
> +	if (start <= mlp->daddr &&
> +	    start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> +		list_del(&mlp->list);
> +		npage_lo = mlp->npage;
> +		kfree(mlp);
> +		return npage_lo;
> +	}
> +
> +	/* Overlap low address of existing range */
> +	if (start <= mlp->daddr) {
> +		size_t overlap;
> +
> +		overlap = start + size - mlp->daddr;
> +		npage_lo = overlap >> PAGE_SHIFT;
> +		npage_hi = mlp->npage - npage_lo;
> +
> +		vfio_dma_unmap(iommu, mlp->daddr, npage_lo, mlp->rdwr);
> +		mlp->daddr += overlap;
> +		mlp->vaddr += overlap;
> +		mlp->npage -= npage_lo;
> +		return npage_lo;
> +	}
> +
> +	/* Overlap high address of existing range */
> +	if (start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> +		size_t overlap;
> +
> +		overlap = mlp->daddr + NPAGE_TO_SIZE(mlp->npage) - start;
> +		npage_hi = overlap >> PAGE_SHIFT;
> +		npage_lo = mlp->npage - npage_hi;
> +
> +		vfio_dma_unmap(iommu, start, npage_hi, mlp->rdwr);
> +		mlp->npage -= npage_hi;
> +		return npage_hi;
> +	}
> +
> +	/* Split existing */
> +	npage_lo = (start - mlp->daddr) >> PAGE_SHIFT;
> +	npage_hi = mlp->npage - (size >> PAGE_SHIFT) - npage_lo;
> +
> +	split = kzalloc(sizeof *split, GFP_KERNEL);
> +	if (!split)
> +		return -ENOMEM;
> +
> +	vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, mlp->rdwr);
> +
> +	mlp->npage = npage_lo;
> +
> +	split->npage = npage_hi;
> +	split->daddr = start + size;
> +	split->vaddr = mlp->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
> +	split->rdwr = mlp->rdwr;
> +	list_add(&split->list, &iommu->dm_list);
> +	return size >> PAGE_SHIFT;
> +}
> +
> +int vfio_dma_unmap_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> +{
> +	int ret = 0;
> +	size_t npage = dmp->size >> PAGE_SHIFT;
> +	struct list_head *pos, *n;
> +
> +	if (dmp->dmaaddr & ~PAGE_MASK)
> +		return -EINVAL;
> +	if (dmp->size & ~PAGE_MASK)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->dgate);
> +
> +	list_for_each_safe(pos, n, &iommu->dm_list) {
> +		struct dma_map_page *mlp;
> +
> +		mlp = list_entry(pos, struct dma_map_page, list);
> +		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> +				   dmp->dmaaddr, dmp->size)) {
> +			ret = vfio_remove_dma_overlap(iommu, dmp->dmaaddr,
> +						      dmp->size, mlp);
> +			if (ret > 0)
> +				npage -= NPAGE_TO_SIZE(ret);
> +			if (ret < 0 || npage == 0)
> +				break;
> +		}
> +	}
> +	mutex_unlock(&iommu->dgate);
> +	return ret > 0 ? 0 : ret;
> +}
> +
> +int vfio_dma_map_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> +{
> +	int npage;
> +	struct dma_map_page *mlp, *mmlp = NULL;
> +	dma_addr_t daddr = dmp->dmaaddr;
> +	unsigned long locked, lock_limit, vaddr = dmp->vaddr;
> +	size_t size = dmp->size;
> +	int ret = 0, rdwr = dmp->flags & VFIO_DMA_MAP_FLAG_WRITE;
> +
> +	if (vaddr & (PAGE_SIZE-1))
> +		return -EINVAL;
> +	if (daddr & (PAGE_SIZE-1))
> +		return -EINVAL;
> +	if (size & (PAGE_SIZE-1))
> +		return -EINVAL;
> +
> +	npage = size >> PAGE_SHIFT;
> +	if (!npage)
> +		return -EINVAL;
> +
> +	if (!iommu)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->dgate);
> +
> +	if (vfio_find_dma(iommu, daddr, size)) {
> +		ret = -EBUSY;
> +		goto out_lock;
> +	}
> +
> +	/* account for locked pages */
> +	locked = current->mm->locked_vm + npage;
> +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> +		printk(KERN_WARNING "%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> +			__func__, rlimit(RLIMIT_MEMLOCK));
> +		ret = -ENOMEM;
> +		goto out_lock;
> +	}
> +
> +	ret = vfio_dma_map(iommu, daddr, vaddr, npage, rdwr);
> +	if (ret)
> +		goto out_lock;
> +
> +	/* Check if we abut a region below */
> +	if (daddr) {
> +		mlp = vfio_find_dma(iommu, daddr - 1, 1);
> +		if (mlp && mlp->rdwr == rdwr &&
> +		    mlp->vaddr + NPAGE_TO_SIZE(mlp->npage) == vaddr) {
> +
> +			mlp->npage += npage;
> +			daddr = mlp->daddr;
> +			vaddr = mlp->vaddr;
> +			npage = mlp->npage;
> +			size = NPAGE_TO_SIZE(npage);
> +
> +			mmlp = mlp;
> +		}
> +	}
> +
> +	if (daddr + size) {
> +		mlp = vfio_find_dma(iommu, daddr + size, 1);
> +		if (mlp && mlp->rdwr == rdwr && mlp->vaddr == vaddr + size) {
> +
> +			mlp->npage += npage;
> +			mlp->daddr = daddr;
> +			mlp->vaddr = vaddr;
> +
> +			/* If merged above and below, remove previously
> +			 * merged entry.  New entry covers it.  */
> +			if (mmlp) {
> +				list_del(&mmlp->list);
> +				kfree(mmlp);
> +			}
> +			mmlp = mlp;
> +		}
> +	}
> +
> +	if (!mmlp) {
> +		mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
> +		if (!mlp) {
> +			ret = -ENOMEM;
> +			vfio_dma_unmap(iommu, daddr, npage, rdwr);
> +			goto out_lock;
> +		}
> +
> +		mlp->npage = npage;
> +		mlp->daddr = daddr;
> +		mlp->vaddr = vaddr;
> +		mlp->rdwr = rdwr;
> +		list_add(&mlp->list, &iommu->dm_list);
> +	}
> +
> +out_lock:
> +	mutex_unlock(&iommu->dgate);
> +	return ret;
> +}
> +
> +static int vfio_iommu_release(struct inode *inode, struct file *filep)
> +{
> +	struct vfio_iommu *iommu = filep->private_data;
> +
> +	vfio_release_iommu(iommu);
> +	return 0;
> +}
> +
> +static long vfio_iommu_unl_ioctl(struct file *filep,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct vfio_iommu *iommu = filep->private_data;
> +	int ret = -ENOSYS;
> +
> +        if (cmd == VFIO_IOMMU_GET_FLAGS) {

Something is weird with the tabbing here..

> +                u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;
> +
> +                ret = put_user(flags, (u64 __user *)arg);
> +
> +        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
> +		struct vfio_dma_map dm;
> +
> +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> +			return -EFAULT;
> +
> +		ret = vfio_dma_map_dm(iommu, &dm);
> +
> +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> +			ret = -EFAULT;
> +
> +	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
> +		struct vfio_dma_map dm;
> +
> +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> +			return -EFAULT;
> +
> +		ret = vfio_dma_unmap_dm(iommu, &dm);
> +
> +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> +			ret = -EFAULT;
> +	}
> +	return ret;
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static long vfio_iommu_compat_ioctl(struct file *filep,
> +				    unsigned int cmd, unsigned long arg)
> +{
> +	arg = (unsigned long)compat_ptr(arg);
> +	return vfio_iommu_unl_ioctl(filep, cmd, arg);
> +}
> +#endif	/* CONFIG_COMPAT */
> +
> +const struct file_operations vfio_iommu_fops = {
> +	.owner		= THIS_MODULE,
> +	.release	= vfio_iommu_release,
> +	.unlocked_ioctl	= vfio_iommu_unl_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= vfio_iommu_compat_ioctl,
> +#endif
> +};
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> new file mode 100644
> index 0000000..6169356
> --- /dev/null
> +++ b/drivers/vfio/vfio_main.c
> @@ -0,0 +1,1151 @@
> +/*
> + * VFIO framework
> + *
> + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio:
> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> + * Author: Tom Lyon, pugs@cisco.com
> + */
> +
> +#include <linux/cdev.h>
> +#include <linux/compat.h>
> +#include <linux/device.h>
> +#include <linux/file.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/fs.h>
> +#include <linux/idr.h>
> +#include <linux/iommu.h>
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>
> +#include <linux/uaccess.h>
> +#include <linux/vfio.h>
> +#include <linux/wait.h>
> +
> +#include "vfio_private.h"
> +
> +#define DRIVER_VERSION	"0.2"
> +#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
> +#define DRIVER_DESC	"VFIO - User Level meta-driver"
> +
> +static int allow_unsafe_intrs;

__read_mostly
> +module_param(allow_unsafe_intrs, int, 0);

S_IRUGO ?
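
I.e. something like (untested sketch, just combining the two
suggestions above):

    static int allow_unsafe_intrs __read_mostly;
    module_param(allow_unsafe_intrs, int, S_IRUGO);

so the value is at least visible through sysfs.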

> +MODULE_PARM_DESC(allow_unsafe_intrs,
> +        "Allow use of IOMMUs which do not support interrupt remapping");
> +
> +static struct vfio {
> +	dev_t			devt;
> +	struct cdev		cdev;
> +	struct list_head	group_list;
> +	struct mutex		lock;
> +	struct kref		kref;
> +	struct class		*class;
> +	struct idr		idr;
> +	wait_queue_head_t	release_q;
> +} vfio;

You probably want to move this below the 'vfio_group'
as vfio contains the vfio_group.
> +
> +static const struct file_operations vfio_group_fops;
> +extern const struct file_operations vfio_iommu_fops;
> +
> +struct vfio_group {
> +	dev_t			devt;
> +	unsigned int		groupid;
> +	struct bus_type		*bus;
> +	struct vfio_iommu	*iommu;
> +	struct list_head	device_list;
> +	struct list_head	iommu_next;
> +	struct list_head	group_next;
> +	int			refcnt;
> +};
> +
> +struct vfio_device {
> +	struct device			*dev;
> +	const struct vfio_device_ops	*ops;
> +	struct vfio_iommu		*iommu;
> +	struct vfio_group		*group;
> +	struct list_head		device_next;
> +	bool				attached;
> +	int				refcnt;
> +	void				*device_data;
> +};

And perhaps move this above vfio_group, since vfio_group
contains a list of these structures?


> +
> +/*
> + * Helper functions called under vfio.lock
> + */
> +
> +/* Return true if any devices within a group are opened */
> +static bool __vfio_group_devs_inuse(struct vfio_group *group)
> +{
> +	struct list_head *pos;
> +
> +	list_for_each(pos, &group->device_list) {
> +		struct vfio_device *device;
> +
> +		device = list_entry(pos, struct vfio_device, device_next);
> +		if (device->refcnt)
> +			return true;
> +	}
> +	return false;
> +}
> +
> +/* Return true if any of the groups attached to an iommu are opened.
> + * We can only tear apart merged groups when nothing is left open. */
> +static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu)
> +{
> +	struct list_head *pos;
> +
> +	list_for_each(pos, &iommu->group_list) {
> +		struct vfio_group *group;
> +
> +		group = list_entry(pos, struct vfio_group, iommu_next);
> +		if (group->refcnt)
> +			return true;
> +	}
> +	return false;
> +}
> +
> +/* An iommu is "in use" if it has a file descriptor open or if any of
> + * the groups assigned to the iommu have devices open. */
> +static bool __vfio_iommu_inuse(struct vfio_iommu *iommu)
> +{
> +	struct list_head *pos;
> +
> +	if (iommu->refcnt)
> +		return true;
> +
> +	list_for_each(pos, &iommu->group_list) {
> +		struct vfio_group *group;
> +
> +		group = list_entry(pos, struct vfio_group, iommu_next);
> +
> +		if (__vfio_group_devs_inuse(group))
> +			return true;
> +	}
> +	return false;
> +}
> +
> +static void __vfio_group_set_iommu(struct vfio_group *group,
> +				   struct vfio_iommu *iommu)
> +{
> +	struct list_head *pos;
> +
> +	if (group->iommu)
> +		list_del(&group->iommu_next);
> +	if (iommu)
> +		list_add(&group->iommu_next, &iommu->group_list);
> +
> +	group->iommu = iommu;
> +
> +	list_for_each(pos, &group->device_list) {
> +		struct vfio_device *device;
> +
> +		device = list_entry(pos, struct vfio_device, device_next);
> +		device->iommu = iommu;
> +	}
> +}
> +
> +static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,
> +				    struct vfio_device *device)
> +{
> +	BUG_ON(!iommu->domain && device->attached);

Whoa. Heavy hammer there.

Perhaps WARN_ON as you do check it later on.
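
E.g. (just a sketch):

    /* noisy but survivable if the bookkeeping ever gets out of sync */
    if (WARN_ON(!iommu->domain && device->attached))
        return;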

> +
> +	if (!iommu->domain || !device->attached)
> +		return;
> +
> +	iommu_detach_device(iommu->domain, device->dev);
> +	device->attached = false;
> +}
> +
> +static void __vfio_iommu_detach_group(struct vfio_iommu *iommu,
> +				      struct vfio_group *group)
> +{
> +	struct list_head *pos;
> +
> +	list_for_each(pos, &group->device_list) {
> +		struct vfio_device *device;
> +
> +		device = list_entry(pos, struct vfio_device, device_next);
> +		__vfio_iommu_detach_dev(iommu, device);
> +	}
> +}
> +
> +static int __vfio_iommu_attach_dev(struct vfio_iommu *iommu,
> +				   struct vfio_device *device)
> +{
> +	int ret;
> +
> +	BUG_ON(device->attached);

How about:

WARN(device->attached, "The engineer who wrote the user-space device driver is trying to register
the device again! Tell him/her to stop please.\n");

> +
> +	if (!iommu || !iommu->domain)
> +		return -EINVAL;
> +
> +	ret = iommu_attach_device(iommu->domain, device->dev);
> +	if (!ret)
> +		device->attached = true;
> +
> +	return ret;
> +}
> +
> +static int __vfio_iommu_attach_group(struct vfio_iommu *iommu,
> +				     struct vfio_group *group)
> +{
> +	struct list_head *pos;
> +
> +	list_for_each(pos, &group->device_list) {
> +		struct vfio_device *device;
> +		int ret;
> +
> +		device = list_entry(pos, struct vfio_device, device_next);
> +		ret = __vfio_iommu_attach_dev(iommu, device);
> +		if (ret) {
> +			__vfio_iommu_detach_group(iommu, group);
> +			return ret;
> +		}
> +	}
> +	return 0;
> +}
> +
> +/* The iommu is viable, ie. ready to be configured, when all the devices
> + * for all the groups attached to the iommu are bound to their vfio device
> + * drivers (ex. vfio-pci).  This sets the device_data private data pointer. */
> +static bool __vfio_iommu_viable(struct vfio_iommu *iommu)
> +{
> +	struct list_head *gpos, *dpos;
> +
> +	list_for_each(gpos, &iommu->group_list) {
> +		struct vfio_group *group;
> +		group = list_entry(gpos, struct vfio_group, iommu_next);
> +
> +		list_for_each(dpos, &group->device_list) {
> +			struct vfio_device *device;
> +			device = list_entry(dpos,
> +					    struct vfio_device, device_next);
> +
> +			if (!device->device_data)
> +				return false;
> +		}
> +	}
> +	return true;
> +}
> +
> +static void __vfio_close_iommu(struct vfio_iommu *iommu)
> +{
> +	struct list_head *pos;
> +
> +	if (!iommu->domain)
> +		return;
> +
> +	list_for_each(pos, &iommu->group_list) {
> +		struct vfio_group *group;
> +		group = list_entry(pos, struct vfio_group, iommu_next);
> +
> +		__vfio_iommu_detach_group(iommu, group);
> +	}
> +
> +	vfio_iommu_unmapall(iommu);
> +
> +	iommu_domain_free(iommu->domain);
> +	iommu->domain = NULL;
> +	iommu->mm = NULL;
> +}
> +
> +/* Open the IOMMU.  This gates all access to the iommu or device file
> + * descriptors and sets current->mm as the exclusive user. */
> +static int __vfio_open_iommu(struct vfio_iommu *iommu)
> +{
> +	struct list_head *pos;
> +	int ret;
> +
> +	if (!__vfio_iommu_viable(iommu))
> +		return -EBUSY;
> +
> +	if (iommu->domain)
> +		return -EINVAL;
> +
> +	iommu->domain = iommu_domain_alloc(iommu->bus);
> +	if (!iommu->domain)
> +		return -EFAULT;

ENOMEM?

> +
> +	list_for_each(pos, &iommu->group_list) {
> +		struct vfio_group *group;
> +		group = list_entry(pos, struct vfio_group, iommu_next);
> +
> +		ret = __vfio_iommu_attach_group(iommu, group);
> +		if (ret) {
> +			__vfio_close_iommu(iommu);
> +			return ret;
> +		}
> +	}
> +
> +	if (!allow_unsafe_intrs &&
> +	    !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
> +		__vfio_close_iommu(iommu);
> +		return -EFAULT;
> +	}
> +
> +	iommu->cache = (iommu_domain_has_cap(iommu->domain,
> +					     IOMMU_CAP_CACHE_COHERENCY) != 0);
> +	iommu->mm = current->mm;
> +
> +	return 0;
> +}
> +
> +/* Actively try to tear down the iommu and merged groups.  If there are no
> + * open iommu or device fds, we close the iommu.  If we close the iommu and
> + * there are also no open group fds, we can futher dissolve the group to
> + * iommu association and free the iommu data structure. */
> +static int __vfio_try_dissolve_iommu(struct vfio_iommu *iommu)
> +{
> +
> +	if (__vfio_iommu_inuse(iommu))
> +		return -EBUSY;
> +
> +	__vfio_close_iommu(iommu);
> +
> +	if (!__vfio_iommu_groups_inuse(iommu)) {
> +		struct list_head *pos, *ppos;
> +
> +		list_for_each_safe(pos, ppos, &iommu->group_list) {
> +			struct vfio_group *group;
> +
> +			group = list_entry(pos, struct vfio_group, iommu_next);
> +			__vfio_group_set_iommu(group, NULL);
> +		}
> +
> +
> +		kfree(iommu);
> +	}
> +
> +	return 0;
> +}
> +
> +static struct vfio_device *__vfio_lookup_dev(struct device *dev)
> +{
> +	struct list_head *gpos;
> +	unsigned int groupid;
> +
> +	if (iommu_device_group(dev, &groupid))

Hmm, where is this defined? v3.2-rc1 does not seem to have it?

> +		return NULL;
> +
> +	list_for_each(gpos, &vfio.group_list) {
> +		struct vfio_group *group;
> +		struct list_head *dpos;
> +
> +		group = list_entry(gpos, struct vfio_group, group_next);
> +
> +		if (group->groupid != groupid)
> +			continue;
> +
> +		list_for_each(dpos, &group->device_list) {
> +			struct vfio_device *device;
> +
> +			device = list_entry(dpos,
> +					    struct vfio_device, device_next);
> +
> +			if (device->dev == dev)
> +				return device;
> +		}
> +	}
> +	return NULL;
> +}
> +
> +/* All release paths simply decrement the refcnt, attempt to teardown
> + * the iommu and merged groups, and wakeup anything that might be
> + * waiting if we successfully dissolve anything. */
> +static int vfio_do_release(int *refcnt, struct vfio_iommu *iommu)
> +{
> +	bool wake;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	(*refcnt)--;
> +	wake = (__vfio_try_dissolve_iommu(iommu) == 0);
> +
> +	mutex_unlock(&vfio.lock);
> +
> +	if (wake)
> +		wake_up(&vfio.release_q);
> +
> +	return 0;
> +}
> +
> +/*
> + * Device fops - passthrough to vfio device driver w/ device_data
> + */
> +static int vfio_device_release(struct inode *inode, struct file *filep)
> +{
> +	struct vfio_device *device = filep->private_data;
> +
> +	vfio_do_release(&device->refcnt, device->iommu);
> +
> +	device->ops->put(device->device_data);
> +
> +	return 0;
> +}
> +
> +static long vfio_device_unl_ioctl(struct file *filep,
> +				  unsigned int cmd, unsigned long arg)
> +{
> +	struct vfio_device *device = filep->private_data;
> +
> +	return device->ops->ioctl(device->device_data, cmd, arg);
> +}
> +
> +static ssize_t vfio_device_read(struct file *filep, char __user *buf,
> +				size_t count, loff_t *ppos)
> +{
> +	struct vfio_device *device = filep->private_data;
> +
> +	return device->ops->read(device->device_data, buf, count, ppos);
> +}
> +
> +static ssize_t vfio_device_write(struct file *filep, const char __user *buf,
> +				 size_t count, loff_t *ppos)
> +{
> +	struct vfio_device *device = filep->private_data;
> +
> +	return device->ops->write(device->device_data, buf, count, ppos);
> +}
> +
> +static int vfio_device_mmap(struct file *filep, struct vm_area_struct *vma)
> +{
> +	struct vfio_device *device = filep->private_data;
> +
> +	return device->ops->mmap(device->device_data, vma);
> +}
> +	
> +#ifdef CONFIG_COMPAT
> +static long vfio_device_compat_ioctl(struct file *filep,
> +				     unsigned int cmd, unsigned long arg)
> +{
> +	arg = (unsigned long)compat_ptr(arg);
> +	return vfio_device_unl_ioctl(filep, cmd, arg);
> +}
> +#endif	/* CONFIG_COMPAT */
> +
> +const struct file_operations vfio_device_fops = {
> +	.owner		= THIS_MODULE,
> +	.release	= vfio_device_release,
> +	.read		= vfio_device_read,
> +	.write		= vfio_device_write,
> +	.unlocked_ioctl	= vfio_device_unl_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= vfio_device_compat_ioctl,
> +#endif
> +	.mmap		= vfio_device_mmap,
> +};
> +
> +/*
> + * Group fops
> + */
> +static int vfio_group_open(struct inode *inode, struct file *filep)
> +{
> +	struct vfio_group *group;
> +	int ret = 0;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	group = idr_find(&vfio.idr, iminor(inode));
> +
> +	if (!group) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
> +
> +	filep->private_data = group;
> +
> +	if (!group->iommu) {
> +		struct vfio_iommu *iommu;
> +
> +		iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> +		if (!iommu) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +		INIT_LIST_HEAD(&iommu->group_list);
> +		INIT_LIST_HEAD(&iommu->dm_list);
> +		mutex_init(&iommu->dgate);
> +		iommu->bus = group->bus;
> +		__vfio_group_set_iommu(group, iommu);
> +	}
> +	group->refcnt++;
> +
> +out:
> +	mutex_unlock(&vfio.lock);
> +
> +	return ret;
> +}
> +
> +static int vfio_group_release(struct inode *inode, struct file *filep)
> +{
> +	struct vfio_group *group = filep->private_data;
> +
> +	return vfio_do_release(&group->refcnt, group->iommu);
> +}
> +
> +/* Attempt to merge the group pointed to by fd into group.  The merge-ee
> + * group must not have an iommu or any devices open because we cannot
> + * maintain that context across the merge.  The merge-er group can be
> + * in use. */
> +static int vfio_group_merge(struct vfio_group *group, int fd)
> +{
> +	struct vfio_group *new;
> +	struct vfio_iommu *old_iommu;
> +	struct file *file;
> +	int ret = 0;
> +	bool opened = false;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	file = fget(fd);
> +	if (!file) {
> +		ret = -EBADF;
> +		goto out_noput;
> +	}
> +
> +	/* Sanity check, is this really our fd? */
> +	if (file->f_op != &vfio_group_fops) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	new = file->private_data;
> +
> +	if (!new || new == group || !new->iommu ||
> +	    new->iommu->domain || new->bus != group->bus) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	/* We need to attach all the devices to each domain separately
> +	 * in order to validate that the capabilities match for both.  */
> +	ret = __vfio_open_iommu(new->iommu);
> +	if (ret)
> +		goto out;
> +
> +	if (!group->iommu->domain) {
> +		ret = __vfio_open_iommu(group->iommu);
> +		if (ret)
> +			goto out;
> +		opened = true;
> +	}
> +
> +	/* If cache coherency doesn't match we'd potentialy need to
> +	 * remap existing iommu mappings in the merge-er domain.
> +	 * Poor return to bother trying to allow this currently. */
> +	if (iommu_domain_has_cap(group->iommu->domain,
> +				 IOMMU_CAP_CACHE_COHERENCY) !=
> +	    iommu_domain_has_cap(new->iommu->domain,
> +				 IOMMU_CAP_CACHE_COHERENCY)) {
> +		__vfio_close_iommu(new->iommu);
> +		if (opened)
> +			__vfio_close_iommu(group->iommu);
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	/* Close the iommu for the merge-ee and attach all its devices
> +	 * to the merge-er iommu. */
> +	__vfio_close_iommu(new->iommu);
> +
> +	ret = __vfio_iommu_attach_group(group->iommu, new);
> +	if (ret)
> +		goto out;
> +
> +	/* set_iommu unlinks new from the iommu, so save a pointer to it */
> +	old_iommu = new->iommu;
> +	__vfio_group_set_iommu(new, group->iommu);
> +	kfree(old_iommu);
> +
> +out:
> +	fput(file);
> +out_noput:
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +
> +/* Unmerge the group pointed to by fd from group. */
> +static int vfio_group_unmerge(struct vfio_group *group, int fd)
> +{
> +	struct vfio_group *new;
> +	struct vfio_iommu *new_iommu;
> +	struct file *file;
> +	int ret = 0;
> +
> +	/* Since the merge-out group is already opened, it needs to
> +	 * have an iommu struct associated with it. */
> +	new_iommu = kzalloc(sizeof(*new_iommu), GFP_KERNEL);
> +	if (!new_iommu)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&new_iommu->group_list);
> +	INIT_LIST_HEAD(&new_iommu->dm_list);
> +	mutex_init(&new_iommu->dgate);
> +	new_iommu->bus = group->bus;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	file = fget(fd);
> +	if (!file) {
> +		ret = -EBADF;
> +		goto out_noput;
> +	}
> +
> +	/* Sanity check, is this really our fd? */
> +	if (file->f_op != &vfio_group_fops) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	new = file->private_data;
> +	if (!new || new == group || new->iommu != group->iommu) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	/* We can't merge-out a group with devices still in use. */
> +	if (__vfio_group_devs_inuse(new)) {
> +		ret = -EBUSY;
> +		goto out;
> +	}
> +
> +	__vfio_iommu_detach_group(group->iommu, new);
> +	__vfio_group_set_iommu(new, new_iommu);
> +
> +out:
> +	fput(file);
> +out_noput:
> +	if (ret)
> +		kfree(new_iommu);
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +
> +/* Get a new iommu file descriptor.  This will open the iommu, setting
> + * the current->mm ownership if it's not already set. */
> +static int vfio_group_get_iommu_fd(struct vfio_group *group)
> +{
> +	int ret = 0;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	if (!group->iommu->domain) {
> +		ret = __vfio_open_iommu(group->iommu);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	ret = anon_inode_getfd("[vfio-iommu]", &vfio_iommu_fops,
> +			       group->iommu, O_RDWR);
> +	if (ret < 0)
> +		goto out;
> +
> +	group->iommu->refcnt++;
> +out:
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +
> +/* Get a new device file descriptor.  This will open the iommu, setting
> + * the current->mm ownership if it's not already set.  It's difficult to
> + * specify the requirements for matching a user supplied buffer to a
> + * device, so we use a vfio driver callback to test for a match.  For
> + * PCI, dev_name(dev) is unique, but other drivers may require including
> + * a parent device string. */
> +static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
> +{
> +	struct vfio_iommu *iommu = group->iommu;
> +	struct list_head *gpos;
> +	int ret = -ENODEV;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	if (!iommu->domain) {
> +		ret = __vfio_open_iommu(iommu);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	list_for_each(gpos, &iommu->group_list) {
> +		struct list_head *dpos;
> +
> +		group = list_entry(gpos, struct vfio_group, iommu_next);
> +
> +		list_for_each(dpos, &group->device_list) {
> +			struct vfio_device *device;
> +
> +			device = list_entry(dpos,
> +					    struct vfio_device, device_next);
> +
> +			if (device->ops->match(device->dev, buf)) {
> +				struct file *file;
> +
> +				if (device->ops->get(device->device_data)) {
> +					ret = -EFAULT;
> +					goto out;
> +				}
> +
> +				/* We can't use anon_inode_getfd(), like above
> +				 * because we need to modify the f_mode flags
> +				 * directly to allow more than just ioctls */
> +				ret = get_unused_fd();
> +				if (ret < 0) {
> +					device->ops->put(device->device_data);
> +					goto out;
> +				}
> +
> +				file = anon_inode_getfile("[vfio-device]",
> +							  &vfio_device_fops,
> +							  device, O_RDWR);
> +				if (IS_ERR(file)) {
> +					put_unused_fd(ret);
> +					ret = PTR_ERR(file);
> +					device->ops->put(device->device_data);
> +					goto out;
> +				}
> +
> +				/* Todo: add an anon_inode interface to do
> +				 * this.  Appears to be missing by lack of
> +				 * need rather than explicitly prevented.
> +				 * Now there's need. */
> +				file->f_mode |= (FMODE_LSEEK |
> +						 FMODE_PREAD |
> +						 FMODE_PWRITE);
> +
> +				fd_install(ret, file);
> +
> +				device->refcnt++;
> +				goto out;
> +			}
> +		}
> +	}
> +out:
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +
> +static long vfio_group_unl_ioctl(struct file *filep,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct vfio_group *group = filep->private_data;
> +
> +	if (cmd == VFIO_GROUP_GET_FLAGS) {
> +		u64 flags = 0;
> +
> +		mutex_lock(&vfio.lock);
> +		if (__vfio_iommu_viable(group->iommu))
> +			flags |= VFIO_GROUP_FLAGS_VIABLE;
> +		mutex_unlock(&vfio.lock);
> +
> +		if (group->iommu->mm)
> +			flags |= VFIO_GROUP_FLAGS_MM_LOCKED;
> +
> +		return put_user(flags, (u64 __user *)arg);
> +	}
> +		
> +	/* Below commands are restricted once the mm is set */
> +	if (group->iommu->mm && group->iommu->mm != current->mm)
> +		return -EPERM;
> +
> +	if (cmd == VFIO_GROUP_MERGE || cmd == VFIO_GROUP_UNMERGE) {
> +		int fd;
> +		
> +		if (get_user(fd, (int __user *)arg))
> +			return -EFAULT;
> +		if (fd < 0)
> +			return -EINVAL;
> +
> +		if (cmd == VFIO_GROUP_MERGE)
> +			return vfio_group_merge(group, fd);
> +		else
> +			return vfio_group_unmerge(group, fd);
> +	} else if (cmd == VFIO_GROUP_GET_IOMMU_FD) {
> +		return vfio_group_get_iommu_fd(group);
> +	} else if (cmd == VFIO_GROUP_GET_DEVICE_FD) {
> +		char *buf;
> +		int ret;
> +
> +		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
> +		if (IS_ERR(buf))
> +			return PTR_ERR(buf);
> +
> +		ret = vfio_group_get_device_fd(group, buf);
> +		kfree(buf);
> +		return ret;
> +	}
> +
> +	return -ENOSYS;
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static long vfio_group_compat_ioctl(struct file *filep,
> +				    unsigned int cmd, unsigned long arg)
> +{
> +	arg = (unsigned long)compat_ptr(arg);
> +	return vfio_group_unl_ioctl(filep, cmd, arg);
> +}
> +#endif	/* CONFIG_COMPAT */
> +
> +static const struct file_operations vfio_group_fops = {
> +	.owner		= THIS_MODULE,
> +	.open		= vfio_group_open,
> +	.release	= vfio_group_release,
> +	.unlocked_ioctl	= vfio_group_unl_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= vfio_group_compat_ioctl,
> +#endif
> +};
> +
> +/* iommu fd release hook */
> +int vfio_release_iommu(struct vfio_iommu *iommu)
> +{
> +	return vfio_do_release(&iommu->refcnt, iommu);
> +}
> +
> +/*
> + * VFIO driver API
> + */
> +
> +/* Add a new device to the vfio framework with associated vfio driver
> + * callbacks.  This is the entry point for vfio drivers to register devices. */
> +int vfio_group_add_dev(struct device *dev, const struct vfio_device_ops *ops)
> +{
> +	struct list_head *pos;
> +	struct vfio_group *group = NULL;
> +	struct vfio_device *device = NULL;
> +	unsigned int groupid;
> +	int ret = 0;
> +	bool new_group = false;
> +
> +	if (!ops)
> +		return -EINVAL;
> +
> +	if (iommu_device_group(dev, &groupid))
> +		return -ENODEV;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	list_for_each(pos, &vfio.group_list) {
> +		group = list_entry(pos, struct vfio_group, group_next);
> +		if (group->groupid == groupid)
> +			break;
> +		group = NULL;
> +	}
> +
> +	if (!group) {
> +		int minor;
> +
> +		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		group = kzalloc(sizeof(*group), GFP_KERNEL);
> +		if (!group) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		group->groupid = groupid;
> +		INIT_LIST_HEAD(&group->device_list);
> +
> +		ret = idr_get_new(&vfio.idr, group, &minor);
> +		if (ret == 0 && minor > MINORMASK) {
> +			idr_remove(&vfio.idr, minor);
> +			kfree(group);
> +			ret = -ENOSPC;
> +			goto out;
> +		}
> +
> +		group->devt = MKDEV(MAJOR(vfio.devt), minor);
> +		device_create(vfio.class, NULL, group->devt,
> +			      group, "%u", groupid);
> +
> +		group->bus = dev->bus;


Oh, so that is how the IOMMU iommu_ops get copied! You might
want to mention that - I was not sure where the 'handoff' was
done to insert a device so that it can do iommu_ops properly.

Ok, so the time when a device is detected whether it can do
IOMMU is when we try to open it - as that is when iommu_domain_alloc
is called which can return NULL if the iommu_ops is not set.

So what about devices that don't have an iommu_ops? Say they
are using SWIOTLB? (like the AMD-Vi sometimes does if the
device is not on its list).

Can we use iommu_present?
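
I.e. something along these lines (untested, assuming iommu_present()
is available in the tree this is based on):

    /* in vfio_group_add_dev(), before any group setup */
    if (!iommu_present(dev->bus))
        return -ENODEV;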

> +		list_add(&group->group_next, &vfio.group_list);
> +		new_group = true;
> +	} else {
> +		if (group->bus != dev->bus) {
> +			printk(KERN_WARNING
> +			       "Error: IOMMU group ID conflict.  Group ID %u "
> +				"on both bus %s and %s\n", groupid,
> +				group->bus->name, dev->bus->name);
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +
> +		list_for_each(pos, &group->device_list) {
> +			device = list_entry(pos,
> +					    struct vfio_device, device_next);
> +			if (device->dev == dev)
> +				break;
> +			device = NULL;
> +		}
> +	}
> +
> +	if (!device) {
> +		if (__vfio_group_devs_inuse(group) ||
> +		    (group->iommu && group->iommu->refcnt)) {
> +			printk(KERN_WARNING
> +			       "Adding device %s to group %u while group is already in use!!\n",
> +			       dev_name(dev), group->groupid);
> +			/* XXX How to prevent other drivers from claiming? */
> +		}
> +
> +		device = kzalloc(sizeof(*device), GFP_KERNEL);
> +		if (!device) {
> +			/* If we just created this group, tear it down */
> +			if (new_group) {
> +				list_del(&group->group_next);
> +				device_destroy(vfio.class, group->devt);
> +				idr_remove(&vfio.idr, MINOR(group->devt));
> +				kfree(group);
> +			}
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		list_add(&device->device_next, &group->device_list);
> +		device->dev = dev;
> +		device->ops = ops;
> +		device->iommu = group->iommu; /* NULL if new */
> +		__vfio_iommu_attach_dev(group->iommu, device);
> +	}
> +out:
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(vfio_group_add_dev);
> +
> +/* Remove a device from the vfio framework */
> +void vfio_group_del_dev(struct device *dev)
> +{
> +	struct list_head *pos;
> +	struct vfio_group *group = NULL;
> +	struct vfio_device *device = NULL;
> +	unsigned int groupid;
> +
> +	if (iommu_device_group(dev, &groupid))
> +		return;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	list_for_each(pos, &vfio.group_list) {
> +		group = list_entry(pos, struct vfio_group, group_next);
> +		if (group->groupid == groupid)
> +			break;
> +		group = NULL;
> +	}
> +
> +	if (!group)
> +		goto out;
> +
> +	list_for_each(pos, &group->device_list) {
> +		device = list_entry(pos, struct vfio_device, device_next);
> +		if (device->dev == dev)
> +			break;
> +		device = NULL;
> +	}
> +
> +	if (!device)
> +		goto out;
> +
> +	BUG_ON(device->refcnt);
> +
> +	if (device->attached)
> +		__vfio_iommu_detach_dev(group->iommu, device);
> +
> +	list_del(&device->device_next);
> +	kfree(device);
> +
> +	/* If this was the only device in the group, remove the group.
> +	 * Note that we intentionally unmerge empty groups here if the
> +	 * group fd isn't opened. */
> +	if (list_empty(&group->device_list) && group->refcnt == 0) {
> +		struct vfio_iommu *iommu = group->iommu;
> +
> +		if (iommu) {
> +			__vfio_group_set_iommu(group, NULL);
> +			__vfio_try_dissolve_iommu(iommu);
> +		}
> +
> +		device_destroy(vfio.class, group->devt);
> +		idr_remove(&vfio.idr, MINOR(group->devt));
> +		list_del(&group->group_next);
> +		kfree(group);
> +	}
> +out:
> +	mutex_unlock(&vfio.lock);
> +}
> +EXPORT_SYMBOL_GPL(vfio_group_del_dev);
> +
> +/* When a device is bound to a vfio device driver (ex. vfio-pci), this
> + * entry point is used to mark the device usable (viable).  The vfio
> + * device driver associates a private device_data struct with the device
> + * here, which will later be return for vfio_device_fops callbacks. */
> +int vfio_bind_dev(struct device *dev, void *device_data)
> +{
> +	struct vfio_device *device;
> +	int ret = -EINVAL;
> +
> +	BUG_ON(!device_data);
> +
> +	mutex_lock(&vfio.lock);
> +
> +	device = __vfio_lookup_dev(dev);
> +
> +	BUG_ON(!device);
> +
> +	ret = dev_set_drvdata(dev, device);
> +	if (!ret)
> +		device->device_data = device_data;
> +
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(vfio_bind_dev);
> +
> +/* A device is only removeable if the iommu for the group is not in use. */
> +static bool vfio_device_removeable(struct vfio_device *device)
> +{
> +	bool ret = true;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	if (device->iommu && __vfio_iommu_inuse(device->iommu))
> +		ret = false;
> +
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +
> +/* Notify vfio that a device is being unbound from the vfio device driver
> + * and return the device private device_data pointer.  If the group is
> + * in use, we need to block or take other measures to make it safe for
> + * the device to be removed from the iommu. */
> +void *vfio_unbind_dev(struct device *dev)
> +{
> +	struct vfio_device *device = dev_get_drvdata(dev);
> +	void *device_data;
> +
> +	BUG_ON(!device);
> +
> +again:
> +	if (!vfio_device_removeable(device)) {
> +		/* XXX signal for all devices in group to be removed or
> +		 * resort to killing the process holding the device fds.
> +		 * For now just block waiting for releases to wake us. */
> +		wait_event(vfio.release_q, vfio_device_removeable(device));
> +	}
> +
> +	mutex_lock(&vfio.lock);
> +
> +	/* Need to re-check that the device is still removeable under lock. */
> +	if (device->iommu && __vfio_iommu_inuse(device->iommu)) {
> +		mutex_unlock(&vfio.lock);
> +		goto again;
> +	}
> +
> +	device_data = device->device_data;
> +
> +	device->device_data = NULL;
> +	dev_set_drvdata(dev, NULL);
> +
> +	mutex_unlock(&vfio.lock);
> +	return device_data;
> +}
> +EXPORT_SYMBOL_GPL(vfio_unbind_dev);
> +
> +/*
> + * Module/class support
> + */
> +static void vfio_class_release(struct kref *kref)
> +{
> +	class_destroy(vfio.class);
> +	vfio.class = NULL;
> +}
> +
> +static char *vfio_devnode(struct device *dev, mode_t *mode)
> +{
> +	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
> +}
> +
> +static int __init vfio_init(void)
> +{
> +	int ret;
> +
> +	idr_init(&vfio.idr);
> +	mutex_init(&vfio.lock);
> +	INIT_LIST_HEAD(&vfio.group_list);
> +	init_waitqueue_head(&vfio.release_q);
> +
> +	kref_init(&vfio.kref);
> +	vfio.class = class_create(THIS_MODULE, "vfio");
> +	if (IS_ERR(vfio.class)) {
> +		ret = PTR_ERR(vfio.class);
> +		goto err_class;
> +	}
> +
> +	vfio.class->devnode = vfio_devnode;
> +
> +	/* FIXME - how many minors to allocate... all of them! */
> +	ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
> +	if (ret)
> +		goto err_chrdev;
> +
> +	cdev_init(&vfio.cdev, &vfio_group_fops);
> +	ret = cdev_add(&vfio.cdev, vfio.devt, MINORMASK);
> +	if (ret)
> +		goto err_cdev;
> +
> +	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
> +
> +	return 0;
> +
> +err_cdev:
> +	unregister_chrdev_region(vfio.devt, MINORMASK);
> +err_chrdev:
> +	kref_put(&vfio.kref, vfio_class_release);
> +err_class:
> +	return ret;
> +}
> +
> +static void __exit vfio_cleanup(void)
> +{
> +	struct list_head *gpos, *gppos;
> +
> +	list_for_each_safe(gpos, gppos, &vfio.group_list) {
> +		struct vfio_group *group;
> +		struct list_head *dpos, *dppos;
> +
> +		group = list_entry(gpos, struct vfio_group, group_next);
> +
> +		list_for_each_safe(dpos, dppos, &group->device_list) {
> +			struct vfio_device *device;
> +
> +			device = list_entry(dpos,
> +					    struct vfio_device, device_next);
> +			vfio_group_del_dev(device->dev);
> +		}
> +	}
> +
> +	idr_destroy(&vfio.idr);
> +	cdev_del(&vfio.cdev);
> +	unregister_chrdev_region(vfio.devt, MINORMASK);
> +	kref_put(&vfio.kref, vfio_class_release);
> +}
> +
> +module_init(vfio_init);
> +module_exit(vfio_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/drivers/vfio/vfio_private.h b/drivers/vfio/vfio_private.h
> new file mode 100644
> index 0000000..350ad67
> --- /dev/null
> +++ b/drivers/vfio/vfio_private.h
> @@ -0,0 +1,34 @@
> +/*
> + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio:
> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> + * Author: Tom Lyon, pugs@cisco.com
> + */
> +
> +#include <linux/list.h>
> +#include <linux/mutex.h>
> +
> +#ifndef VFIO_PRIVATE_H
> +#define VFIO_PRIVATE_H
> +
> +struct vfio_iommu {
> +	struct iommu_domain		*domain;
> +	struct bus_type			*bus;
> +	struct mutex			dgate;
> +	struct list_head		dm_list;
> +	struct mm_struct		*mm;
> +	struct list_head		group_list;
> +	int				refcnt;
> +	bool				cache;
> +};
> +
> +extern int vfio_release_iommu(struct vfio_iommu *iommu);
> +extern void vfio_iommu_unmapall(struct vfio_iommu *iommu);
> +
> +#endif /* VFIO_PRIVATE_H */
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> new file mode 100644
> index 0000000..4269b08
> --- /dev/null
> +++ b/include/linux/vfio.h
> @@ -0,0 +1,155 @@
> +/*
> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> + * Author: Tom Lyon, pugs@cisco.com
> + *
> + * This program is free software; you may redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; version 2 of the License.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + *
> + * Portions derived from drivers/uio/uio.c:
> + * Copyright(C) 2005, Benedikt Spranger <b.spranger@linutronix.de>
> + * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
> + * Copyright(C) 2006, Hans J. Koch <hjk@linutronix.de>
> + * Copyright(C) 2006, Greg Kroah-Hartman <greg@kroah.com>
> + *
> + * Portions derived from drivers/uio/uio_pci_generic.c:
> + * Copyright (C) 2009 Red Hat, Inc.
> + * Author: Michael S. Tsirkin <mst@redhat.com>
> + */
> +#include <linux/types.h>
> +
> +#ifndef VFIO_H
> +#define VFIO_H
> +
> +#ifdef __KERNEL__
> +
> +struct vfio_device_ops {
> +	bool			(*match)(struct device *, char *);
> +	int			(*get)(void *);
> +	void			(*put)(void *);
> +	ssize_t			(*read)(void *, char __user *,
> +					size_t, loff_t *);
> +	ssize_t			(*write)(void *, const char __user *,
> +					 size_t, loff_t *);
> +	long			(*ioctl)(void *, unsigned int, unsigned long);
> +	int			(*mmap)(void *, struct vm_area_struct *);
> +};
> +
> +extern int vfio_group_add_dev(struct device *device,
> +			      const struct vfio_device_ops *ops);
> +extern void vfio_group_del_dev(struct device *device);
> +extern int vfio_bind_dev(struct device *device, void *device_data);
> +extern void *vfio_unbind_dev(struct device *device);
> +
> +#endif /* __KERNEL__ */
> +
> +/*
> + * VFIO driver - allow mapping and use of certain devices
> + * in unprivileged user processes. (If IOMMU is present)
> + * Especially useful for Virtual Function parts of SR-IOV devices
> + */
> +
> +
> +/* Kernel & User level defines for ioctls */
> +
> +#define VFIO_GROUP_GET_FLAGS		_IOR(';', 100, __u64)

> + #define VFIO_GROUP_FLAGS_VIABLE	(1 << 0)
> + #define VFIO_GROUP_FLAGS_MM_LOCKED	(1 << 1)
> +#define VFIO_GROUP_MERGE		_IOW(';', 101, int)
> +#define VFIO_GROUP_UNMERGE		_IOW(';', 102, int)
> +#define VFIO_GROUP_GET_IOMMU_FD		_IO(';', 103)
> +#define VFIO_GROUP_GET_DEVICE_FD	_IOW(';', 104, char *)
> +
> +/*
> + * Structure for DMA mapping of user buffers
> + * vaddr, dmaaddr, and size must all be page aligned
> + */
> +struct vfio_dma_map {
> +	__u64	len;		/* length of structure */
> +	__u64	vaddr;		/* process virtual addr */
> +	__u64	dmaaddr;	/* desired and/or returned dma address */
> +	__u64	size;		/* size in bytes */
> +	__u64	flags;
> +#define	VFIO_DMA_MAP_FLAG_WRITE		(1 << 0) /* req writeable DMA mem */
> +};
> +
> +#define	VFIO_IOMMU_GET_FLAGS		_IOR(';', 105, __u64)
> + /* Does the IOMMU support mapping any IOVA to any virtual address? */
> + #define VFIO_IOMMU_FLAGS_MAP_ANY	(1 << 0)
> +#define	VFIO_IOMMU_MAP_DMA		_IOWR(';', 106, struct vfio_dma_map)
> +#define	VFIO_IOMMU_UNMAP_DMA		_IOWR(';', 107, struct vfio_dma_map)
> +
> +#define VFIO_DEVICE_GET_FLAGS		_IOR(';', 108, __u64)
> + #define VFIO_DEVICE_FLAGS_PCI		(1 << 0)
> + #define VFIO_DEVICE_FLAGS_DT		(1 << 1)
> + #define VFIO_DEVICE_FLAGS_RESET	(1 << 2)
> +#define VFIO_DEVICE_GET_NUM_REGIONS	_IOR(';', 109, int)
> +
> +struct vfio_region_info {
> +	__u32	len;		/* length of structure */
> +	__u32	index;		/* region number */
> +	__u64	size;		/* size in bytes of region */
> +	__u64	offset;		/* start offset of region */
> +	__u64	flags;
> +#define VFIO_REGION_INFO_FLAG_MMAP		(1 << 0)
> +#define VFIO_REGION_INFO_FLAG_RO		(1 << 1)
> +#define VFIO_REGION_INFO_FLAG_PHYS_VALID	(1 << 2)
> +	__u64	phys;		/* physical address of region */
> +};
> +
> +#define VFIO_DEVICE_GET_REGION_INFO	_IOWR(';', 110, struct vfio_region_info)
> +
> +#define VFIO_DEVICE_GET_NUM_IRQS	_IOR(';', 111, int)
> +
> +struct vfio_irq_info {
> +	__u32	len;		/* length of structure */
> +	__u32	index;		/* IRQ number */
> +	__u32	count;		/* number of individual IRQs */
> +	__u32	flags;
> +#define VFIO_IRQ_INFO_FLAG_LEVEL		(1 << 0)
> +};
> +
> +#define VFIO_DEVICE_GET_IRQ_INFO	_IOWR(';', 112, struct vfio_irq_info)
> +
> +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
> +#define VFIO_DEVICE_SET_IRQ_EVENTFDS	_IOW(';', 113, int)
> +
> +/* Unmask IRQ index, arg[0] = index */
> +#define VFIO_DEVICE_UNMASK_IRQ		_IOW(';', 114, int)
> +
> +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
> +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD	_IOW(';', 115, int)
> +
> +#define VFIO_DEVICE_RESET		_IO(';', 116)
> +
> +struct vfio_dtpath {
> +	__u32	len;		/* length of structure */
> +	__u32	index;
> +	__u64	flags;
> +#define VFIO_DTPATH_FLAGS_REGION	(1 << 0)
> +#define VFIO_DTPATH_FLAGS_IRQ		(1 << 1)
> +	char	*path;
> +};
> +#define VFIO_DEVICE_GET_DTPATH		_IOWR(';', 117, struct vfio_dtpath)
> +
> +struct vfio_dtindex {
> +	__u32	len;		/* length of structure */
> +	__u32	index;
> +	__u32	prop_type;
> +	__u32	prop_index;
> +	__u64	flags;
> +#define VFIO_DTINDEX_FLAGS_REGION	(1 << 0)
> +#define VFIO_DTINDEX_FLAGS_IRQ		(1 << 1)
> +};
> +#define VFIO_DEVICE_GET_DTINDEX		_IOWR(';', 118, struct vfio_dtindex)
> +
> +#endif /* VFIO_H */


So where is the vfio-pci? Is that a separate posting?
Alex Williamson Nov. 11, 2011, 6:04 p.m. UTC | #9
On Wed, 2011-11-09 at 18:57 -0600, Christian Benvenuti (benve) wrote:
> Here are few minor comments on vfio_iommu.c ...

Sorry, I've been poking sticks at trying to figure out a clean way to
solve the force vfio driver attach problem.

> > diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
> > new file mode 100644
> > index 0000000..029dae3
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_iommu.c
<snip>
> > +
> > +#include "vfio_private.h"
> 
> Doesn't the 'dma_'  prefix belong to the generic DMA code?

Sure, we could make these more vfio-centric.

> > +struct dma_map_page {
> > +	struct list_head	list;
> > +	dma_addr_t		daddr;
> > +	unsigned long		vaddr;
> > +	int			npage;
> > +	int			rdwr;
> > +};
> > +
> > +/*
> > + * This code handles mapping and unmapping of user data buffers
> > + * into DMA'ble space using the IOMMU
> > + */
> > +
> > +#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
> > +
> > +struct vwork {
> > +	struct mm_struct	*mm;
> > +	int			npage;
> > +	struct work_struct	work;
> > +};
> > +
> > +/* delayed decrement for locked_vm */
> > +static void vfio_lock_acct_bg(struct work_struct *work)
> > +{
> > +	struct vwork *vwork = container_of(work, struct vwork, work);
> > +	struct mm_struct *mm;
> > +
> > +	mm = vwork->mm;
> > +	down_write(&mm->mmap_sem);
> > +	mm->locked_vm += vwork->npage;
> > +	up_write(&mm->mmap_sem);
> > +	mmput(mm);		/* unref mm */
> > +	kfree(vwork);
> > +}
> > +
> > +static void vfio_lock_acct(int npage)
> > +{
> > +	struct vwork *vwork;
> > +	struct mm_struct *mm;
> > +
> > +	if (!current->mm) {
> > +		/* process exited */
> > +		return;
> > +	}
> > +	if (down_write_trylock(&current->mm->mmap_sem)) {
> > +		current->mm->locked_vm += npage;
> > +		up_write(&current->mm->mmap_sem);
> > +		return;
> > +	}
> > +	/*
> > +	 * Couldn't get mmap_sem lock, so must setup to decrement
>                                                       ^^^^^^^^^
> 
> Increment?

Yep

<snip>
> > +int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t
> > start,
> > +			    size_t size, struct dma_map_page *mlp)
> > +{
> > +	struct dma_map_page *split;
> > +	int npage_lo, npage_hi;
> > +
> > +	/* Existing dma region is completely covered, unmap all */
> 
> This works. However, given how vfio_dma_map_dm implements the merging
> logic, I think it is impossible to have
> 
>     (start < mlp->daddr &&
>      start + size > mlp->daddr + NPAGE_TO_SIZE(mlp->npage))

It's quite possible.  This allows userspace to create a sparse mapping,
then blow it all away with a single unmap from 0 to ~0.
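
E.g. nothing stops a user from doing roughly this (hypothetical
sequence, iommu_fd being the iommu file descriptor and buf a
page-aligned buffer):

    struct vfio_dma_map dm = { .len = sizeof(dm),
                               .flags = VFIO_DMA_MAP_FLAG_WRITE };

    /* two disjoint mappings -> sparse */
    dm.vaddr = (unsigned long)buf;        dm.dmaaddr = 0x100000; dm.size = 4096;
    ioctl(iommu_fd, VFIO_IOMMU_MAP_DMA, &dm);
    dm.vaddr = (unsigned long)buf + 4096; dm.dmaaddr = 0x300000;
    ioctl(iommu_fd, VFIO_IOMMU_MAP_DMA, &dm);

    /* one big unmap covers both, so "completely covered" does happen */
    dm.dmaaddr = 0; dm.size = 0x400000;
    ioctl(iommu_fd, VFIO_IOMMU_UNMAP_DMA, &dm);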

> > +	if (start <= mlp->daddr &&
> > +	    start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> > +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> > +		list_del(&mlp->list);
> > +		npage_lo = mlp->npage;
> > +		kfree(mlp);
> > +		return npage_lo;
> > +	}
> > +
> > +	/* Overlap low address of existing range */
> 
> Same as above (ie, '<' is impossible)

existing:   |<--- A --->|      |<--- B --->|
unmap:                |<--- C --->|

Maybe not good practice from userspace, but we shouldn't count on
userspace to be well behaved.

> > +	if (start <= mlp->daddr) {
> > +		size_t overlap;
> > +
> > +		overlap = start + size - mlp->daddr;
> > +		npage_lo = overlap >> PAGE_SHIFT;
> > +		npage_hi = mlp->npage - npage_lo;
> > +
> > +		vfio_dma_unmap(iommu, mlp->daddr, npage_lo, mlp->rdwr);
> > +		mlp->daddr += overlap;
> > +		mlp->vaddr += overlap;
> > +		mlp->npage -= npage_lo;
> > +		return npage_lo;
> > +	}
> 
> Same as above (ie, '>' is impossible).

Same example as above.

> > +	/* Overlap high address of existing range */
> > +	if (start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> > +		size_t overlap;
> > +
> > +		overlap = mlp->daddr + NPAGE_TO_SIZE(mlp->npage) - start;
> > +		npage_hi = overlap >> PAGE_SHIFT;
> > +		npage_lo = mlp->npage - npage_hi;
> > +
> > +		vfio_dma_unmap(iommu, start, npage_hi, mlp->rdwr);
> > +		mlp->npage -= npage_hi;
> > +		return npage_hi;
> > +	}
<snip>
> > +int vfio_dma_map_dm(struct vfio_iommu *iommu, struct vfio_dma_map
> > *dmp)
> > +{
> > +	int npage;
> > +	struct dma_map_page *mlp, *mmlp = NULL;
> > +	dma_addr_t daddr = dmp->dmaaddr;
> > +	unsigned long locked, lock_limit, vaddr = dmp->vaddr;
> > +	size_t size = dmp->size;
> > +	int ret = 0, rdwr = dmp->flags & VFIO_DMA_MAP_FLAG_WRITE;
> > +
> > +	if (vaddr & (PAGE_SIZE-1))
> > +		return -EINVAL;
> > +	if (daddr & (PAGE_SIZE-1))
> > +		return -EINVAL;
> > +	if (size & (PAGE_SIZE-1))
> > +		return -EINVAL;
> > +
> > +	npage = size >> PAGE_SHIFT;
> > +	if (!npage)
> > +		return -EINVAL;
> > +
> > +	if (!iommu)
> > +		return -EINVAL;
> > +
> > +	mutex_lock(&iommu->dgate);
> > +
> > +	if (vfio_find_dma(iommu, daddr, size)) {
> > +		ret = -EBUSY;
> > +		goto out_lock;
> > +	}
> > +
> > +	/* account for locked pages */
> > +	locked = current->mm->locked_vm + npage;
> > +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > +	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> > +		printk(KERN_WARNING "%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> > +			__func__, rlimit(RLIMIT_MEMLOCK));
> > +		ret = -ENOMEM;
> > +		goto out_lock;
> > +	}
> > +
> > +	ret = vfio_dma_map(iommu, daddr, vaddr, npage, rdwr);
> > +	if (ret)
> > +		goto out_lock;
> > +
> > +	/* Check if we abut a region below */
> 
> Is !daddr possible?

Sure, an IOVA of 0x0.  There's no region below if we start at zero.

> > +	if (daddr) {
> > +		mlp = vfio_find_dma(iommu, daddr - 1, 1);
> > +		if (mlp && mlp->rdwr == rdwr &&
> > +		    mlp->vaddr + NPAGE_TO_SIZE(mlp->npage) == vaddr) {
> > +
> > +			mlp->npage += npage;
> > +			daddr = mlp->daddr;
> > +			vaddr = mlp->vaddr;
> > +			npage = mlp->npage;
> > +			size = NPAGE_TO_SIZE(npage);
> > +
> > +			mmlp = mlp;
> > +		}
> > +	}
> 
> Is !(daddr + size) possible?

Same, there's no region above if this region goes to the top of the
address space, ie. 0xffffffff_fffff000 + 0x1000

Hmm, wonder if I'm missing a check for wrapping.
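
(Probably just needs something like

    if (daddr + size < daddr)
        return -EINVAL;

next to the other sanity checks; it's not in this version.)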

> > +	if (daddr + size) {
> > +		mlp = vfio_find_dma(iommu, daddr + size, 1);
> > +		if (mlp && mlp->rdwr == rdwr && mlp->vaddr == vaddr + size)
> > {
> > +
> > +			mlp->npage += npage;
> > +			mlp->daddr = daddr;
> > +			mlp->vaddr = vaddr;
> > +
> > +			/* If merged above and below, remove previously
> > +			 * merged entry.  New entry covers it.  */
> > +			if (mmlp) {
> > +				list_del(&mmlp->list);
> > +				kfree(mmlp);
> > +			}
> > +			mmlp = mlp;
> > +		}
> > +	}
> > +
> > +	if (!mmlp) {
> > +		mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
> > +		if (!mlp) {
> > +			ret = -ENOMEM;
> > +			vfio_dma_unmap(iommu, daddr, npage, rdwr);
> > +			goto out_lock;
> > +		}
> > +
> > +		mlp->npage = npage;
> > +		mlp->daddr = daddr;
> > +		mlp->vaddr = vaddr;
> > +		mlp->rdwr = rdwr;
> > +		list_add(&mlp->list, &iommu->dm_list);
> > +	}
> > +
> > +out_lock:
> > +	mutex_unlock(&iommu->dgate);
> > +	return ret;
> > +}
> > +
> > +static int vfio_iommu_release(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_iommu *iommu = filep->private_data;
> > +
> > +	vfio_release_iommu(iommu);
> > +	return 0;
> > +}
> > +
> > +static long vfio_iommu_unl_ioctl(struct file *filep,
> > +				 unsigned int cmd, unsigned long arg)
> > +{
> > +	struct vfio_iommu *iommu = filep->private_data;
> > +	int ret = -ENOSYS;
> 
> Any reason for not using "switch" ?

It got ugly in vfio_main, so I decided to be consistent w/ it in the
driver and use if/else here too.  I don't like the aesthetics of extra
{}s to declare variables within a switch, nor do I like declaring all
the variables for each case for the whole function.  Personal quirk.

> > +        if (cmd == VFIO_IOMMU_GET_FLAGS) {
> > +                u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;
> > +
> > +                ret = put_user(flags, (u64 __user *)arg);
> > +
> > +        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
> > +		struct vfio_dma_map dm;
> > +
> > +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> > +			return -EFAULT;
> 
> What does the "_dm" suffix stand for?

Inherited from Tom, but I figure _dma_map_dm = action(dma map),
object(dm), which is a vfio_Dma_Map.

Thanks,

Alex
Alex Williamson Nov. 11, 2011, 10:10 p.m. UTC | #10
Thanks Konrad!  Comments inline.

On Fri, 2011-11-11 at 12:51 -0500, Konrad Rzeszutek Wilk wrote:
> On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
> > VFIO provides a secure, IOMMU based interface for user space
> > drivers, including device assignment to virtual machines.
> > This provides the base management of IOMMU groups, devices,
> > and IOMMU objects.  See Documentation/vfio.txt included in
> > this patch for user and kernel API description.
> > 
> > Note, this implements the new API discussed at KVM Forum
> > 2011, as represented by the drvier version 0.2.  It's hoped
> > that this provides a modular enough interface to support PCI
> > and non-PCI userspace drivers across various architectures
> > and IOMMU implementations.
> > 
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> > 
> > Fingers crossed, this is the last RFC for VFIO, but we need
> > the iommu group support before this can go upstream
> > (http://lkml.indiana.edu/hypermail/linux/kernel/1110.2/02303.html),
> > hoping this helps push that along.
> > 
> > Since the last posting, this version completely modularizes
> > the device backends and better defines the APIs between the
> > core VFIO code and the device backends.  I expect that we
> > might also adopt a modular IOMMU interface as iommu_ops learns
> > about different types of hardware.  Also many, many cleanups.
> > Check the complete git history for details:
> > 
> > git://github.com/awilliam/linux-vfio.git vfio-ng
> > 
> > (matching qemu tree: git://github.com/awilliam/qemu-vfio.git)
> > 
> > This version, along with the supporting VFIO PCI backend can
> > be found here:
> > 
> > git://github.com/awilliam/linux-vfio.git vfio-next-20111103
> > 
> > I've held off on implementing a kernel->user signaling
> > mechanism for now since the previous netlink version produced
> > too many gag reflexes.  It's easy enough to set a bit in the
> > group flags too indicate such support in the future, so I
> > think we can move ahead without it.
> > 
> > Appreciate any feedback or suggestions.  Thanks,
> > 
> > Alex
> > 
> >  Documentation/ioctl/ioctl-number.txt |    1 
> >  Documentation/vfio.txt               |  304 +++++++++
> >  MAINTAINERS                          |    8 
> >  drivers/Kconfig                      |    2 
> >  drivers/Makefile                     |    1 
> >  drivers/vfio/Kconfig                 |    8 
> >  drivers/vfio/Makefile                |    3 
> >  drivers/vfio/vfio_iommu.c            |  530 ++++++++++++++++
> >  drivers/vfio/vfio_main.c             | 1151 ++++++++++++++++++++++++++++++++++
> >  drivers/vfio/vfio_private.h          |   34 +
> >  include/linux/vfio.h                 |  155 +++++
> >  11 files changed, 2197 insertions(+), 0 deletions(-)
> >  create mode 100644 Documentation/vfio.txt
> >  create mode 100644 drivers/vfio/Kconfig
> >  create mode 100644 drivers/vfio/Makefile
> >  create mode 100644 drivers/vfio/vfio_iommu.c
> >  create mode 100644 drivers/vfio/vfio_main.c
> >  create mode 100644 drivers/vfio/vfio_private.h
> >  create mode 100644 include/linux/vfio.h
> > 
> > diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
> > index 54078ed..59d01e4 100644
> > --- a/Documentation/ioctl/ioctl-number.txt
> > +++ b/Documentation/ioctl/ioctl-number.txt
> > @@ -88,6 +88,7 @@ Code  Seq#(hex)	Include File		Comments
> >  		and kernel/power/user.c
> >  '8'	all				SNP8023 advanced NIC card
> >  					<mailto:mcr@solidum.com>
> > +';'	64-76	linux/vfio.h
> >  '@'	00-0F	linux/radeonfb.h	conflict!
> >  '@'	00-0F	drivers/video/aty/aty128fb.c	conflict!
> >  'A'	00-1F	linux/apm_bios.h	conflict!
> > diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> > new file mode 100644
> > index 0000000..5866896
> > --- /dev/null
> > +++ b/Documentation/vfio.txt
> > @@ -0,0 +1,304 @@
> > +VFIO - "Virtual Function I/O"[1]
> > +-------------------------------------------------------------------------------
> > +Many modern system now provide DMA and interrupt remapping facilities
> > +to help ensure I/O devices behave within the boundaries they've been
> > +allotted.  This includes x86 hardware with AMD-Vi and Intel VT-d as
> > +well as POWER systems with Partitionable Endpoints (PEs) and even
> > +embedded powerpc systems (technology name unknown).  The VFIO driver
> > +is an IOMMU/device agnostic framework for exposing direct device
> > +access to userspace, in a secure, IOMMU protected environment.  In
> > +other words, this allows safe, non-privileged, userspace drivers.
> > +
> > +Why do we want that?  Virtual machines often make use of direct device
> > +access ("device assignment") when configured for the highest possible
> > +I/O performance.  From a device and host perspective, this simply turns
> > +the VM into a userspace driver, with the benefits of significantly
> > +reduced latency, higher bandwidth, and direct use of bare-metal device
> > +drivers[2].
> 
> Are there any constraints on running a 32-bit userspace with
> a 64-bit kernel and with 32-bit user space drivers?

Shouldn't be.  I'll need to do some testing on that, but it was working
on the previous generation of vfio.

> > +
> > +Some applications, particularly in the high performance computing
> > +field, also benefit from low-overhead, direct device access from
> > +userspace.  Examples include network adapters (often non-TCP/IP based)
> > +and compute accelerators.  Previous to VFIO, these drivers needed to
> > +go through the full development cycle to become proper upstream driver,
> > +be maintained out of tree, or make use of the UIO framework, which
> > +has no notion of IOMMU protection, limited interrupt support, and
> > +requires root privileges to access things like PCI configuration space.
> > +
> > +The VFIO driver framework intends to unify these, replacing both the
> > +KVM PCI specific device assignment currently used as well as provide
> > +a more secure, more featureful userspace driver environment than UIO.
> > +
> > +Groups, Devices, IOMMUs, oh my
> 
> <chuckles> oh my, eh?

Anything for a corny chuckle :)

> > +-------------------------------------------------------------------------------
> > +
> > +A fundamental component of VFIO is the notion of IOMMU groups.  IOMMUs
> > +can't always distinguish transactions from each individual device in
> > +the system.  Sometimes this is because of the IOMMU design, such as with
> > +PEs, other times it's caused by the I/O topology, for instance a
> > +PCIe-to-PCI bridge masking all devices behind it.  We call the sets of
> > +devices created by these restictions IOMMU groups (or just "groups" for
> > +this document).
> > +
> > +The IOMMU cannot distiguish transactions between the individual devices
> > +within the group, therefore the group is the basic unit of ownership for
> > +a userspace process.  Because of this, groups are also the primary
> > +interface to both devices and IOMMU domains in VFIO.
> > +
> > +The VFIO representation of groups is created as devices are added into
> > +the framework by a VFIO bus driver.  The vfio-pci module is an example
> > +of a bus driver.  This module registers devices along with a set of bus
> > +specific callbacks with the VFIO core.  These callbacks provide the
> > +interfaces later used for device access.  As each new group is created,
> > +as determined by iommu_device_group(), VFIO creates a /dev/vfio/$GROUP
> > +character device.
> > +
> > +In addition to the device enumeration and callbacks, the VFIO bus driver
> > +also provides a traditional device driver and is able to bind to devices
> > +on it's bus.  When a device is bound to the bus driver it's available to
> > +VFIO.  When all the devices within a group are bound to their bus drivers,
> > +the group becomes "viable" and a user with sufficient access to the VFIO
> > +group chardev can obtain exclusive access to the set of group devices.
> > +
> > +As documented in linux/vfio.h, several ioctls are provided on the
> > +group chardev:
> > +
> > +#define VFIO_GROUP_GET_FLAGS            _IOR(';', 100, __u64)
> > + #define VFIO_GROUP_FLAGS_VIABLE        (1 << 0)
> > + #define VFIO_GROUP_FLAGS_MM_LOCKED     (1 << 1)
> > +#define VFIO_GROUP_MERGE                _IOW(';', 101, int)
> > +#define VFIO_GROUP_UNMERGE              _IOW(';', 102, int)
> > +#define VFIO_GROUP_GET_IOMMU_FD         _IO(';', 103)
> > +#define VFIO_GROUP_GET_DEVICE_FD        _IOW(';', 104, char *)
> > +
> > +The last two ioctls return new file descriptors for accessing
> > +individual devices within the group and programming the IOMMU.  Each of
> > +these new file descriptors provide their own set of file interfaces.
> > +These ioctls will fail if any of the devices within the group are not
> > +bound to their VFIO bus driver.  Additionally, when either of these
> > +interfaces are used, the group is then bound to the struct_mm of the
> > +caller.  The GET_FLAGS ioctl can be used to view the state of the group.
> > +
> > +When either the GET_IOMMU_FD or GET_DEVICE_FD ioctls are invoked, a
> > +new IOMMU domain is created and all of the devices in the group are
> > +attached to it.  This is the only way to ensure full IOMMU isolation
> > +of the group, but potentially wastes resources and cycles if the user
> > +intends to manage multiple groups with the same set of IOMMU mappings.
> > +VFIO therefore provides a group MERGE and UNMERGE interface, which
> > +allows multiple groups to share an IOMMU domain.  Not all IOMMUs allow
> > +arbitrary groups to be merged, so the user should assume merging is
> > +opportunistic.  A new group, with no open device or IOMMU file
> > +descriptors, can be merged into an existing, in-use, group using the
> > +MERGE ioctl.  A merged group can be unmerged using the UNMERGE ioctl
> > +once all of the device file descriptors for the group being merged
> > +"out" are closed.
> > +
> > +When groups are merged, the GET_IOMMU_FD and GET_DEVICE_FD ioctls are
> > +essentially fungible between group file descriptors (ie. if device A
> > +is in group X, and X is merged with Y, a file descriptor for A can be
> > +retrieved using GET_DEVICE_FD on Y.  Likewise, GET_IOMMU_FD returns a
> > +file descriptor referencing the same internal IOMMU object from either
> > +X or Y).  Merged groups can be dissolved either explicitly with UNMERGE
> > +or automatically when ALL file descriptors for the merged group are
> > +closed (all IOMMUs, all devices, all groups).
> > +
> > +The IOMMU file descriptor provides this set of ioctls:
> > +
> > +#define VFIO_IOMMU_GET_FLAGS            _IOR(';', 105, __u64)
> > + #define VFIO_IOMMU_FLAGS_MAP_ANY       (1 << 0)
> > +#define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
> > +#define VFIO_IOMMU_UNMAP_DMA            _IOWR(';', 107, struct vfio_dma_map)
> 
> Coherency support is not going to be addressed right? What about sync?
> Say you need to sync CPU to Device address?

Do we need to expose that to userspace or should the underlying
iommu_ops take care of it?

> > +
> > +The GET_FLAGS ioctl returns basic information about the IOMMU domain.
> > +We currently only support IOMMU domains that are able to map any
> > +virtual address to any IOVA.  This is indicated by the MAP_ANY flag.
> > +
> > +The (UN)MAP_DMA commands make use of struct vfio_dma_map for mapping
> > +and unmapping IOVAs to process virtual addresses:
> > +
> > +struct vfio_dma_map {
> > +        __u64   len;            /* length of structure */
> 
> What is the purpose of the 'len' field? Is it to guard against future
> version changes?

Yes, David Gibson suggested we include flags & len for all data
structures to help future proof them.

> > +        __u64   vaddr;          /* process virtual addr */
> > +        __u64   dmaaddr;        /* desired and/or returned dma address */
> > +        __u64   size;           /* size in bytes */
> > +        __u64   flags;
> > +#define VFIO_DMA_MAP_FLAG_WRITE         (1 << 0) /* req writeable DMA mem */
> > +};
> > +
> > +Current users of VFIO use relatively static DMA mappings, not requiring
> > +high frequency turnover.  As new users are added, it's expected that the
> 
> Is there a limit to how many DMA mappings can be created?

Not that I'm aware of for the current AMD-Vi/VT-d implementations.  I
suppose iommu_ops would return -ENOSPC if it hit a limit.  I added the
VFIO_IOMMU_FLAGS_MAP_ANY flag above to try to identify that kind of
restriction.
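
For what it's worth, here's roughly what I expect the userspace side of
MAP_DMA to look like (untested sketch; iommu_fd comes from
GROUP_GET_IOMMU_FD, buf is page aligned and a page multiple in size):

	struct vfio_dma_map dm = {
		.len     = sizeof(dm),          /* future proofing, as above */
		.vaddr   = (__u64)(uintptr_t)buf,
		.dmaaddr = iova,                /* desired dma address */
		.size    = size,
		.flags   = VFIO_DMA_MAP_FLAG_WRITE,
	};

	if (ioctl(iommu_fd, VFIO_IOMMU_MAP_DMA, &dm))
		perror("VFIO_IOMMU_MAP_DMA");   /* -EBUSY on overlap, etc. */

UNMAP_DMA takes the same struct and, per the code further down, can
unmap any subrange of what was previously mapped.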

> > +IOMMU file descriptor will evolve to support new mapping interfaces, this
> > +will be reflected in the flags and may present new ioctls and file
> > +interfaces.
> > +
> > +The device GET_FLAGS ioctl is intended to return basic device type and
> > +indicate support for optional capabilities.  Flags currently include whether
> > +the device is PCI or described by Device Tree, and whether the RESET ioctl
> > +is supported:
> 
> And reset in terms of PCIe spec is the FLR?

Yes, just a pass through to pci_reset_function() for the pci vfio bus
driver.

> > +
> > +#define VFIO_DEVICE_GET_FLAGS           _IOR(';', 108, __u64)
> > + #define VFIO_DEVICE_FLAGS_PCI          (1 << 0)
> > + #define VFIO_DEVICE_FLAGS_DT           (1 << 1)
> > + #define VFIO_DEVICE_FLAGS_RESET        (1 << 2)
> > +
> > +The MMIO and IOP resources used by a device are described by regions.
> 
> IOP?

I/O port, I'll spell it out.

> > +The GET_NUM_REGIONS ioctl tells us how many regions the device supports:
> > +
> > +#define VFIO_DEVICE_GET_NUM_REGIONS     _IOR(';', 109, int)
> 
> Don't want __u32?

It could be; I'm not sure it buys us anything and it might even restrict us.
We likely don't need 2^32 regions (famous last words?), and keeping it
signed means we could later define <0 to mean something.

> > +
> > +Regions are described by a struct vfio_region_info, which is retrieved by
> > +using the GET_REGION_INFO ioctl with vfio_region_info.index field set to
> > +the desired region (0 based index).  Note that devices may implement zero
> > +sized regions (vfio-pci does this to provide a 1:1 BAR to region index
> > +mapping).
> 
> Huh?

PCI has the following static mapping:

enum {
        VFIO_PCI_BAR0_REGION_INDEX,
        VFIO_PCI_BAR1_REGION_INDEX,
        VFIO_PCI_BAR2_REGION_INDEX,
        VFIO_PCI_BAR3_REGION_INDEX,
        VFIO_PCI_BAR4_REGION_INDEX,
        VFIO_PCI_BAR5_REGION_INDEX,
        VFIO_PCI_ROM_REGION_INDEX,
        VFIO_PCI_CONFIG_REGION_INDEX,
        VFIO_PCI_NUM_REGIONS
};

So 8 regions are always reported regardless of whether the device
implements all the BARs and the ROM.  Then we have a fixed bar:index
mapping so we don't have to create a region_info field to describe the
bar number for the index.
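
To make that concrete, I'd expect userspace enumeration to look roughly
like this (untested sketch; I'm reading GET_NUM_REGIONS as writing
through the int pointer, per its _IOR definition):

	int i, num_regions = 0;

	ioctl(device_fd, VFIO_DEVICE_GET_NUM_REGIONS, &num_regions);

	for (i = 0; i < num_regions; i++) {	/* always 8 for vfio-pci */
		struct vfio_region_info info = {
			.len	= sizeof(info),
			.index	= i,
		};

		ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info);

		if (!info.size)		/* unimplemented BAR/ROM */
			continue;

		/* read/write/mmap the device fd at info.offset... */
	}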

> > +
> > +struct vfio_region_info {
> > +        __u32   len;            /* length of structure */
> > +        __u32   index;          /* region number */
> > +        __u64   size;           /* size in bytes of region */
> > +        __u64   offset;         /* start offset of region */
> > +        __u64   flags;
> > +#define VFIO_REGION_INFO_FLAG_MMAP              (1 << 0)
> > +#define VFIO_REGION_INFO_FLAG_RO                (1 << 1)
> > +#define VFIO_REGION_INFO_FLAG_PHYS_VALID        (1 << 2)
> 
> What is FLAG_MMAP? Does it mean: 1) it can be mmaped, or 2) it is mmaped?

Supports mmap

> FLAG_RO is pretty obvious - presumarily this is for firmware regions and such.
> And PHYS_VALID is if the region is disabled for some reasons? If so
> would the name FLAG_DISABLED be better?

No, POWER guys have some need to report the host physical address of the
region, so the flag indicates whether the below field is present and
valid.  I'll clarify these in the docs.

> 
> > +        __u64   phys;           /* physical address of region */
> > +};
> > +
> > +#define VFIO_DEVICE_GET_REGION_INFO     _IOWR(';', 110, struct vfio_region_info)
> > +
> > +The offset indicates the offset into the device file descriptor which
> > +accesses the given range (for read/write/mmap/seek).  Flags indicate the
> > +available access types and validity of optional fields.  For instance
> > +the phys field may only be valid for certain device types.
> > +
> > +Interrupts are described using a similar interface.  GET_NUM_IRQS
> > +reports the number of IRQ indexes for the device.
> > +
> > +#define VFIO_DEVICE_GET_NUM_IRQS        _IOR(';', 111, int)
> 
> _u32?

Same as above, but I don't have a strong preference.

> > +
> > +struct vfio_irq_info {
> > +        __u32   len;            /* length of structure */
> > +        __u32   index;          /* IRQ number */
> > +        __u32   count;          /* number of individual IRQs */
> > +        __u64   flags;
> > +#define VFIO_IRQ_INFO_FLAG_LEVEL                (1 << 0)
> > +};
> > +
> > +Again, zero count entries are allowed (vfio-pci uses a static interrupt
> > +type to index mapping).
> 
> I am not really sure what that means.

This is so PCI can expose:

enum {
        VFIO_PCI_INTX_IRQ_INDEX,
        VFIO_PCI_MSI_IRQ_INDEX,
        VFIO_PCI_MSIX_IRQ_INDEX,
        VFIO_PCI_NUM_IRQS
};

So like regions it always exposes 3 IRQ indexes where count=0 if the
device doesn't actually support that type of interrupt.  I just want to
spell out that bus drivers have this kind of flexibility.
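
So, for example, asking whether a device has MSI-X and how many vectors
it supports would look something like this (sketch only):

	struct vfio_irq_info info = {
		.len	= sizeof(info),
		.index	= VFIO_PCI_MSIX_IRQ_INDEX,
	};

	ioctl(device_fd, VFIO_DEVICE_GET_IRQ_INFO, &info);
	/* info.count == 0 means the device has no MSI-X */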

> > +
> > +Information about each index can be retrieved using the GET_IRQ_INFO
> > +ioctl, used much like GET_REGION_INFO.
> > +
> > +#define VFIO_DEVICE_GET_IRQ_INFO        _IOWR(';', 112, struct vfio_irq_info)
> > +
> > +Individual indexes can describe single or sets of IRQs.  This provides the
> > +flexibility to describe PCI INTx, MSI, and MSI-X using a single interface.
> > +
> > +All VFIO interrupts are signaled to userspace via eventfds.  Integer arrays,
> > +as shown below, are used to pass the IRQ info index, the number of eventfds,
> > +and each eventfd to be signaled.  Using a count of 0 disables the interrupt.
> > +
> > +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
> 
> Are eventfds u64 or u32?

int, they're just file descriptors

> Why not just define a structure?
> struct vfio_irq_eventfds {
> 	__u32	index;
> 	__u32	count;
> 	__u64	eventfds[0]
> };

We could do that if preferred.  Hmm, are we then going to need
size/flags?

> How do you get an eventfd to feed in here?

eventfd(2), in qemu event_notifier_init() -> event_notifier_get_fd()
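
ie. something along these lines in userspace (untested sketch, assumes
<sys/eventfd.h>):

	/* arg layout per the comment above: [0] = index, [1] = count, [2..] = fds */
	int i, fds[2 + 4] = { VFIO_PCI_MSI_IRQ_INDEX, 4 };

	for (i = 0; i < 4; i++)
		fds[2 + i] = eventfd(0, 0);	/* one eventfd per vector */

	ioctl(device_fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds);

Passing the same index later with a count of 0 disables the interrupt
again.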

> > +#define VFIO_DEVICE_SET_IRQ_EVENTFDS    _IOW(';', 113, int)
> 
> u32?

Not here, it's an fd, so should be an int.

> > +
> > +When a level triggered interrupt is signaled, the interrupt is masked
> > +on the host.  This prevents an unresponsive userspace driver from
> > +continuing to interrupt the host system.  After servicing the interrupt,
> > +UNMASK_IRQ is used to allow the interrupt to retrigger.  Note that level
> > +triggered interrupts implicitly have a count of 1 per index.
> 
> So they are enabled automatically? Meaning you don't even have to do
> SET_IRQ_EVENTFDS b/c the count is set to 1?

I suppose that should be "no more than 1 per index" (ie. PCI would
report a count of 0 for VFIO_PCI_INTX_IRQ_INDEX if the device doesn't
support INTx).  I think you might be confusing VFIO_DEVICE_GET_IRQ_INFO
which tells how many are available with VFIO_DEVICE_SET_IRQ_EVENTFDS
which does the enabling/disabling.  All interrupts are disabled by
default because userspace needs to give us a way to signal them via
eventfds.  It will be device dependent whether multiple index can be
enabled simultaneously.  Hmm, is that another flag on the irq_info
struct or do we expect drivers to implicitly have that kind of
knowledge?

> > +
> > +/* Unmask IRQ index, arg[0] = index */
> > +#define VFIO_DEVICE_UNMASK_IRQ          _IOW(';', 114, int)
> 
> So this is for MSI as well? So if I've an index = 1, with count = 4,
> and doing unmask IRQ will chip enable all the MSI events at once?

No, this is only for re-enabling level triggered interrupts as discussed
above.  Edge triggered interrupts like MSI don't need an unmask... we
may want to do something to accelerate the MSI-X table access for
masking specific interrupts, but I figured that would need to be PCI
aware since those are PCI features, and would therefore be some future
extension of the PCI bus driver and exposed via VFIO_DEVICE_GET_FLAGS.

> I guess there is not much point in enabling/disabling selective MSI
> IRQs..

Some older OSes are said to make extensive use of masking for MSI, so we
probably want this at some point.  I'm assuming future PCI extension for
now.

> > +
> > +Level triggered interrupts can also be unmasked using an irqfd.  Use
> 
> irqfd or eventfd?

irqfd is an eventfd in reverse.  eventfd = kernel signals userspace via
an fd, irqfd = userspace signals kernel via an fd.

> > +SET_UNMASK_IRQ_EVENTFD to set the file descriptor for this.
> 
> So only level triggered? Hmm, how do I know whether the device is
> level or edge? Or is it that edge (MSI) can also be unmasked using the
> eventfd?

Yes, only for level.  Isn't a device going to know what type of
interrupt it uses?  MSI masking is PCI specific, not handled by this.
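
The flow I have in mind (sketch): the level interrupt arrives via the
eventfd registered above, userspace services the device, then either
calls UNMASK_IRQ or, if it registered an unmask eventfd, simply signals
that instead:

	/* arg[0] = index, arg[1] = eventfd, per the comment below */
	int unmask[2] = { VFIO_PCI_INTX_IRQ_INDEX, eventfd(0, 0) };

	ioctl(device_fd, VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD, unmask);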

> > +
> > +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
> > +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD      _IOW(';', 115, int)
> > +
> > +When supported, as indicated by the device flags, reset the device.
> > +
> > +#define VFIO_DEVICE_RESET               _IO(';', 116)
> 
> Does it disable the 'count'? Err, does it disable the IRQ on the
> device after this and one should call VFIO_DEVICE_SET_IRQ_EVENTFDS
> to set new eventfds? Or does it re-use the eventfds and the device
> is enabled after this?

It doesn't affect the interrupt programming.  Should it?

> > +
> > +Device tree devices also invlude ioctls for further defining the
> 
> include
> 
> > +device tree properties of the device:
> > +
> > +struct vfio_dtpath {
> > +        __u32   len;            /* length of structure */
> > +        __u32   index;
> 
> 0 based I presume?

Everything else is, I would assume so.

> > +        __u64   flags;
> > +#define VFIO_DTPATH_FLAGS_REGION        (1 << 0)
> 
> What is region in this context?? Or would this make much more sense
> if I knew what Device Tree actually is.

Powerpc guys, any comments?  This was their suggestion.  These are
effectively the first device specific extension, available when
VFIO_DEVICE_FLAGS_DT is set.

> > +#define VFIO_DTPATH_FLAGS_IRQ           (1 << 1)
> > +        char    *path;
> 
> Ah, now I see why you want 'len' here... But I am still at a loss
> why you want that with the other structures.

Attempt to future proof and validate input.

> > +};
> > +#define VFIO_DEVICE_GET_DTPATH          _IOWR(';', 117, struct vfio_dtpath)
> > +
> > +struct vfio_dtindex {
> > +        __u32   len;            /* length of structure */
> > +        __u32   index;
> > +        __u32   prop_type;
> 
> Is that an enum type? Is this defined somewhere?
> > +        __u32   prop_index;
> 
> What is the purpose of this field?

Need input from powerpc folks here

> > +        __u64   flags;
> > +#define VFIO_DTINDEX_FLAGS_REGION       (1 << 0)
> > +#define VFIO_DTINDEX_FLAGS_IRQ          (1 << 1)
> > +};
> > +#define VFIO_DEVICE_GET_DTINDEX         _IOWR(';', 118, struct vfio_dtindex)
> > +
> > +
> > +VFIO bus driver API
> > +-------------------------------------------------------------------------------
> > +
> > +Bus drivers, such as PCI, have three jobs:
> > + 1) Add/remove devices from vfio
> > + 2) Provide vfio_device_ops for device access
> > + 3) Device binding and unbinding
> 
> suspend/resume?

In the previous version of vfio, the vfio core signaled suspend/resume
to userspace via netlink, effectively putting userspace on the pm
notifier chain.  I was intending to do the same here.

> > +
> > +When initialized, the bus driver should enumerate the devices on its
> > +bus and call vfio_group_add_dev() for each device.  If the bus supports
> > +hotplug, notifiers should be enabled to track devices being added and
> > +removed.  vfio_group_del_dev() removes a previously added device from
> > +vfio.
> > +
> > +Adding a device registers a vfio_device_ops function pointer structure
> > +for the device:
> 
> Huh? So this gets created for _every_ 'struct device' that is added to
> the VFIO bus? Is this structure exposed? Or is this an internal one?

Every device added creates a struct vfio_device and if necessary a
struct vfio_group.  These are internal, just for managing groups and
devices.

> > +
> > +struct vfio_device_ops {
> > +	bool			(*match)(struct device *, char *);
> > +	int			(*get)(void *);
> > +	void			(*put)(void *);
> > +	ssize_t			(*read)(void *, char __user *,
> > +					size_t, loff_t *);
> > +	ssize_t			(*write)(void *, const char __user *,
> > +					 size_t, loff_t *);
> > +	long			(*ioctl)(void *, unsigned int, unsigned long);
> > +	int			(*mmap)(void *, struct vm_area_struct *);
> > +};
> > +
> > +When a device is bound to the bus driver, the bus driver indicates this
> > +to vfio using the vfio_bind_dev() interface.  The device_data parameter
> 
> Might want to paste the function declaration for it... b/c I am not sure
> where the 'device_data' parameter is on the argument list.

Ok

> > +is a pointer to an opaque data structure for use only by the bus driver.
> > +The get, put, read, write, ioctl, and mmap vfio_device_ops all pass
> > +this data structure back to the bus driver.  When a device is unbound
> 
> Oh, so it is on the 'void *'.

Right

> > +from the bus driver, the vfio_unbind_dev() interface signals this to
> > +vfio.  This function returns the pointer to the device_data structure
> 
> That function
> > +registered for the device.
> 
> I am not really sure what this section's purpose is. Could this be part
> of the header file or the code? It does not look to be part of the
> ioctl API?

We've passed into the "VFIO bus driver API" section of the document, to
explain the interaction between vfio-core and vfio bus drivers.
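
To illustrate the split, a skeletal bus driver ends up looking roughly
like this (hand-waving the exact prototypes; foo_ops is a struct
vfio_device_ops as above and foo_device_data is the driver's opaque
per-device structure):

	/* match callback: vfio-pci just compares against dev_name() */
	static bool foo_match(struct device *dev, char *buf)
	{
		return !strcmp(dev_name(dev), buf);
	}

	/* 1) on enumeration, register the device and its callbacks */
	vfio_group_add_dev(dev, &foo_ops);

	/* 3) when our probe() binds the device, hand vfio the device_data */
	vfio_bind_dev(dev, foo_device_data);
	...
	/* and the reverse on remove/unbind */
	foo_device_data = vfio_unbind_dev(dev);
	vfio_group_del_dev(dev);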

> > +
> > +As noted previously, a group contains one or more devices, so
> > +GROUP_GET_DEVICE_FD needs to identify the specific device being requested.
> > +The vfio_device_ops.match callback is used to allow bus drivers to determine
> > +the match.  For drivers like vfio-pci, it's a simple match to dev_name(),
> > +which is unique in the system due to the PCI bus topology; other bus drivers
> > +may need to include parent devices to create a unique match, so this is
> > +left as a bus driver interface.
> > +
> > +-------------------------------------------------------------------------------
> > +
> > +[1] VFIO was originally an acronym for "Virtual Function I/O" in its
> > +initial implementation by Tom Lyon while at Cisco.  We've since outgrown
> > +the acronym, but it's catchy.
> > +
> > +[2] As always there are trade-offs to virtual machine device
> > +assignment that are beyond the scope of VFIO.  It's expected that
> > +future IOMMU technologies will reduce some, but maybe not all, of
> > +these trade-offs.
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index f05f5f6..4bd5aa0 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -7106,6 +7106,14 @@ S:	Maintained
> >  F:	Documentation/filesystems/vfat.txt
> >  F:	fs/fat/
> >  
> > +VFIO DRIVER
> > +M:	Alex Williamson <alex.williamson@redhat.com>
> > +L:	kvm@vger.kernel.org
> 
> No vfio mailing list? Or a vfio-mailing list? 

IIRC, Avi had agreed that we could use kvm for now.  I don't know that
vfio will warrant its own list.  If it picks up, sure, we can move it.

> > +S:	Maintained
> > +F:	Documentation/vfio.txt
> > +F:	drivers/vfio/
> > +F:	include/linux/vfio.h
> > +
> >  VIDEOBUF2 FRAMEWORK
> >  M:	Pawel Osciak <pawel@osciak.com>
> >  M:	Marek Szyprowski <m.szyprowski@samsung.com>
> > diff --git a/drivers/Kconfig b/drivers/Kconfig
> > index b5e6f24..e15578b 100644
> > --- a/drivers/Kconfig
> > +++ b/drivers/Kconfig
> > @@ -112,6 +112,8 @@ source "drivers/auxdisplay/Kconfig"
> >  
> >  source "drivers/uio/Kconfig"
> >  
> > +source "drivers/vfio/Kconfig"
> > +
> >  source "drivers/vlynq/Kconfig"
> >  
> >  source "drivers/virtio/Kconfig"
> > diff --git a/drivers/Makefile b/drivers/Makefile
> > index 1b31421..5f138b5 100644
> > --- a/drivers/Makefile
> > +++ b/drivers/Makefile
> > @@ -58,6 +58,7 @@ obj-$(CONFIG_ATM)		+= atm/
> >  obj-$(CONFIG_FUSION)		+= message/
> >  obj-y				+= firewire/
> >  obj-$(CONFIG_UIO)		+= uio/
> > +obj-$(CONFIG_VFIO)		+= vfio/
> >  obj-y				+= cdrom/
> >  obj-y				+= auxdisplay/
> >  obj-$(CONFIG_PCCARD)		+= pcmcia/
> > diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> > new file mode 100644
> > index 0000000..9acb1e7
> > --- /dev/null
> > +++ b/drivers/vfio/Kconfig
> > @@ -0,0 +1,8 @@
> > +menuconfig VFIO
> > +	tristate "VFIO Non-Privileged userspace driver framework"
> > +	depends on IOMMU_API
> > +	help
> > +	  VFIO provides a framework for secure userspace device drivers.
> > +	  See Documentation/vfio.txt for more details.
> > +
> > +	  If you don't know what to do here, say N.
> > diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> > new file mode 100644
> > index 0000000..088faf1
> > --- /dev/null
> > +++ b/drivers/vfio/Makefile
> > @@ -0,0 +1,3 @@
> > +vfio-y := vfio_main.o vfio_iommu.o
> > +
> > +obj-$(CONFIG_VFIO) := vfio.o
> > diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
> > new file mode 100644
> > index 0000000..029dae3
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_iommu.c
> > @@ -0,0 +1,530 @@
> > +/*
> > + * VFIO: IOMMU DMA mapping support
> > + *
> > + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> > + *     Author: Alex Williamson <alex.williamson@redhat.com>
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + *
> > + * Derived from original vfio:
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs@cisco.com
> > + */
> > +
> > +#include <linux/compat.h>
> > +#include <linux/device.h>
> > +#include <linux/fs.h>
> > +#include <linux/iommu.h>
> > +#include <linux/module.h>
> > +#include <linux/mm.h>
> > +#include <linux/sched.h>
> > +#include <linux/slab.h>
> > +#include <linux/uaccess.h>
> > +#include <linux/vfio.h>
> > +#include <linux/workqueue.h>
> > +
> > +#include "vfio_private.h"
> > +
> > +struct dma_map_page {
> > +	struct list_head	list;
> > +	dma_addr_t		daddr;
> > +	unsigned long		vaddr;
> > +	int			npage;
> > +	int			rdwr;
> 
> rdwr? Is this a flag thing? Could it be made in an enum?

Or maybe better would just be a bool.

> > +};
> > +
> > +/*
> > + * This code handles mapping and unmapping of user data buffers
> > + * into DMA'ble space using the IOMMU
> > + */
> > +
> > +#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
> > +
> > +struct vwork {
> > +	struct mm_struct	*mm;
> > +	int			npage;
> > +	struct work_struct	work;
> > +};
> > +
> > +/* delayed decrement for locked_vm */
> > +static void vfio_lock_acct_bg(struct work_struct *work)
> > +{
> > +	struct vwork *vwork = container_of(work, struct vwork, work);
> > +	struct mm_struct *mm;
> > +
> > +	mm = vwork->mm;
> > +	down_write(&mm->mmap_sem);
> > +	mm->locked_vm += vwork->npage;
> > +	up_write(&mm->mmap_sem);
> > +	mmput(mm);		/* unref mm */
> > +	kfree(vwork);
> > +}
> > +
> > +static void vfio_lock_acct(int npage)
> > +{
> > +	struct vwork *vwork;
> > +	struct mm_struct *mm;
> > +
> > +	if (!current->mm) {
> > +		/* process exited */
> > +		return;
> > +	}
> > +	if (down_write_trylock(&current->mm->mmap_sem)) {
> > +		current->mm->locked_vm += npage;
> > +		up_write(&current->mm->mmap_sem);
> > +		return;
> > +	}
> > +	/*
> > +	 * Couldn't get mmap_sem lock, so must setup to decrement
> > +	 * mm->locked_vm later. If locked_vm were atomic, we wouldn't
> > +	 * need this silliness
> > +	 */
> > +	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> > +	if (!vwork)
> > +		return;
> > +	mm = get_task_mm(current);	/* take ref mm */
> > +	if (!mm) {
> > +		kfree(vwork);
> > +		return;
> > +	}
> > +	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
> > +	vwork->mm = mm;
> > +	vwork->npage = npage;
> > +	schedule_work(&vwork->work);
> > +}
> > +
> > +/* Some mappings aren't backed by a struct page, for example an mmap'd
> > + * MMIO range for our own or another device.  These use a different
> > + * pfn conversion and shouldn't be tracked as locked pages. */
> > +static int is_invalid_reserved_pfn(unsigned long pfn)
> 
> static bool
> 
> > +{
> > +	if (pfn_valid(pfn)) {
> > +		int reserved;
> > +		struct page *tail = pfn_to_page(pfn);
> > +		struct page *head = compound_trans_head(tail);
> > +		reserved = PageReserved(head);
> 
> bool reserved = PageReserved(head);

Agree on both

> > +		if (head != tail) {
> > +			/* "head" is not a dangling pointer
> > +			 * (compound_trans_head takes care of that)
> > +			 * but the hugepage may have been split
> > +			 * from under us (and we may not hold a
> > +			 * reference count on the head page so it can
> > +			 * be reused before we run PageReferenced), so
> > +			 * we've to check PageTail before returning
> > +			 * what we just read.
> > +			 */
> > +			smp_rmb();
> > +			if (PageTail(tail))
> > +				return reserved;
> > +		}
> > +		return PageReserved(tail);
> > +	}
> > +
> > +	return true;
> > +}
> > +
> > +static int put_pfn(unsigned long pfn, int rdwr)
> > +{
> > +	if (!is_invalid_reserved_pfn(pfn)) {
> > +		struct page *page = pfn_to_page(pfn);
> > +		if (rdwr)
> > +			SetPageDirty(page);
> > +		put_page(page);
> > +		return 1;
> > +	}
> > +	return 0;
> > +}
> > +
> > +/* Unmap DMA region */
> > +/* dgate must be held */
> 
> dgate?

DMA gate, the mutex for iommu operations.  This is a carry-over from old
vfio.  As there's only one mutex on the struct vfio_iommu, I can just
rename that to "lock".

> > +static int __vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
> > +			    int npage, int rdwr)
> > +{
> > +	int i, unlocked = 0;
> > +
> > +	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> > +		unsigned long pfn;
> > +
> > +		pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
> > +		if (pfn) {
> > +			iommu_unmap(iommu->domain, iova, 0);
> 
> What is the '0' for? Perhaps a comment: /* We only do zero order */

yep.  We'll need to improve this at some point to take advantage of
large iommu pages, but it shouldn't affect the API.  I'll add comment.

> > +			unlocked += put_pfn(pfn, rdwr);
> > +		}
> > +	}
> > +	return unlocked;
> > +}
> > +
> > +static void vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
> > +			   unsigned long npage, int rdwr)
> > +{
> > +	int unlocked;
> > +
> > +	unlocked = __vfio_dma_unmap(iommu, iova, npage, rdwr);
> > +	vfio_lock_acct(-unlocked);
> > +}
> > +
> > +/* Unmap ALL DMA regions */
> > +void vfio_iommu_unmapall(struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *pos, *pos2;
> 
> pos2 should probably be just called 'tmp'

ok

> > +	struct dma_map_page *mlp;
> 
> What does 'mlp' stand for?
> 
> mlp -> dma_page ?

Carry over from original code, I can guess, but not sure what Tom was
originally thinking.  I think everyone has asked so far, so I'll make a
pass at coming up with names that I can explain.

> > +
> > +	mutex_lock(&iommu->dgate);
> > +	list_for_each_safe(pos, pos2, &iommu->dm_list) {
> > +		mlp = list_entry(pos, struct dma_map_page, list);
> > +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> 
> Uh, so if it did not get put_page() we would try to still delete it?
> Couldn't that lead to corruption as the 'mlp' is returned to the pool?
> 
> Ah wait, the put_page is on the DMA page, so it is OK to
> delete the tracking structure. It will be just a leaked page.

Assume you're referencing this chunk:

vfio_dma_unmap
  __vfio_dma_unmap
    ...
        pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
        if (pfn) {
                iommu_unmap(iommu->domain, iova, 0);
                unlocked += put_pfn(pfn, rdwr);
        }

So we skip things that aren't mapped in the iommu, but anything not
mapped should have already been put (failed vfio_dma_map).  If it is
mapped, we put it if we originally got it via get_user_pages_fast.
unlocked would only not get incremented here if it was an mmap'd page
(such as the mmap of an mmio space of another vfio device), via the code
in vaddr_get_pfn (stolen from KVM).

> > +		list_del(&mlp->list);
> > +		kfree(mlp);
> > +	}
> > +	mutex_unlock(&iommu->dgate);
> > +}
> > +
> > +static int vaddr_get_pfn(unsigned long vaddr, int rdwr, unsigned long *pfn)
> > +{
> > +	struct page *page[1];
> > +	struct vm_area_struct *vma;
> > +	int ret = -EFAULT;
> > +
> > +	if (get_user_pages_fast(vaddr, 1, rdwr, page) == 1) {
> > +		*pfn = page_to_pfn(page[0]);
> > +		return 0;
> > +	}
> > +
> > +	down_read(&current->mm->mmap_sem);
> > +
> > +	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> > +
> > +	if (vma && vma->vm_flags & VM_PFNMAP) {
> > +		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> > +		if (is_invalid_reserved_pfn(*pfn))
> > +			ret = 0;
> 
> Did you mean to break here?

We're in an if block, not a loop.

> > +	}
> > +
> > +	up_read(&current->mm->mmap_sem);
> > +
> > +	return ret;
> > +}
> > +
> > +/* Map DMA region */
> > +/* dgate must be held */
> > +static int vfio_dma_map(struct vfio_iommu *iommu, unsigned long iova,
> > +			unsigned long vaddr, int npage, int rdwr)
> > +{
> > +	unsigned long start = iova;
> > +	int i, ret, locked = 0, prot = IOMMU_READ;
> > +
> > +	/* Verify pages are not already mapped */
> 
> I think a 'that' is missing above.

Ok.

> > +	for (i = 0; i < npage; i++, iova += PAGE_SIZE)
> > +		if (iommu_iova_to_phys(iommu->domain, iova))
> > +			return -EBUSY;
> > +
> > +	iova = start;
> > +
> > +	if (rdwr)
> > +		prot |= IOMMU_WRITE;
> > +	if (iommu->cache)
> > +		prot |= IOMMU_CACHE;
> > +
> > +	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
> > +		unsigned long pfn = 0;
> > +
> > +		ret = vaddr_get_pfn(vaddr, rdwr, &pfn);
> > +		if (ret) {
> > +			__vfio_dma_unmap(iommu, start, i, rdwr);
> > +			return ret;
> > +		}
> > +
> > +		/* Only add actual locked pages to accounting */
> > +		if (!is_invalid_reserved_pfn(pfn))
> > +			locked++;
> > +
> > +		ret = iommu_map(iommu->domain, iova,
> > +				(phys_addr_t)pfn << PAGE_SHIFT, 0, prot);
> 
> Put a comment by the 0 saying /* order 0 pages only! */

Yep

> > +		if (ret) {
> > +			/* Back out mappings on error */
> > +			put_pfn(pfn, rdwr);
> > +			__vfio_dma_unmap(iommu, start, i, rdwr);
> > +			return ret;
> > +		}
> > +	}
> > +	vfio_lock_acct(locked);
> > +	return 0;
> > +}
> > +
> > +static inline int ranges_overlap(unsigned long start1, size_t size1,
> 
> Perhaps a bool?

Sure

> > +				 unsigned long start2, size_t size2)
> > +{
> > +	return !(start1 + size1 <= start2 || start2 + size2 <= start1);
> > +}
> > +
> > +static struct dma_map_page *vfio_find_dma(struct vfio_iommu *iommu,
> > +					  dma_addr_t start, size_t size)
> > +{
> > +	struct list_head *pos;
> > +	struct dma_map_page *mlp;
> > +
> > +	list_for_each(pos, &iommu->dm_list) {
> > +		mlp = list_entry(pos, struct dma_map_page, list);
> > +		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> > +				   start, size))
> > +			return mlp;
> > +	}
> > +	return NULL;
> > +}
> > +
> > +int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
> > +			    size_t size, struct dma_map_page *mlp)
> > +{
> > +	struct dma_map_page *split;
> > +	int npage_lo, npage_hi;
> > +
> > +	/* Existing dma region is completely covered, unmap all */
> > +	if (start <= mlp->daddr &&
> > +	    start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> > +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> > +		list_del(&mlp->list);
> > +		npage_lo = mlp->npage;
> > +		kfree(mlp);
> > +		return npage_lo;
> > +	}
> > +
> > +	/* Overlap low address of existing range */
> > +	if (start <= mlp->daddr) {
> > +		size_t overlap;
> > +
> > +		overlap = start + size - mlp->daddr;
> > +		npage_lo = overlap >> PAGE_SHIFT;
> > +		npage_hi = mlp->npage - npage_lo;
> > +
> > +		vfio_dma_unmap(iommu, mlp->daddr, npage_lo, mlp->rdwr);
> > +		mlp->daddr += overlap;
> > +		mlp->vaddr += overlap;
> > +		mlp->npage -= npage_lo;
> > +		return npage_lo;
> > +	}
> > +
> > +	/* Overlap high address of existing range */
> > +	if (start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> > +		size_t overlap;
> > +
> > +		overlap = mlp->daddr + NPAGE_TO_SIZE(mlp->npage) - start;
> > +		npage_hi = overlap >> PAGE_SHIFT;
> > +		npage_lo = mlp->npage - npage_hi;
> > +
> > +		vfio_dma_unmap(iommu, start, npage_hi, mlp->rdwr);
> > +		mlp->npage -= npage_hi;
> > +		return npage_hi;
> > +	}
> > +
> > +	/* Split existing */
> > +	npage_lo = (start - mlp->daddr) >> PAGE_SHIFT;
> > +	npage_hi = mlp->npage - (size >> PAGE_SHIFT) - npage_lo;
> > +
> > +	split = kzalloc(sizeof *split, GFP_KERNEL);
> > +	if (!split)
> > +		return -ENOMEM;
> > +
> > +	vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, mlp->rdwr);
> > +
> > +	mlp->npage = npage_lo;
> > +
> > +	split->npage = npage_hi;
> > +	split->daddr = start + size;
> > +	split->vaddr = mlp->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
> > +	split->rdwr = mlp->rdwr;
> > +	list_add(&split->list, &iommu->dm_list);
> > +	return size >> PAGE_SHIFT;
> > +}
> > +
> > +int vfio_dma_unmap_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> > +{
> > +	int ret = 0;
> > +	size_t npage = dmp->size >> PAGE_SHIFT;
> > +	struct list_head *pos, *n;
> > +
> > +	if (dmp->dmaaddr & ~PAGE_MASK)
> > +		return -EINVAL;
> > +	if (dmp->size & ~PAGE_MASK)
> > +		return -EINVAL;
> > +
> > +	mutex_lock(&iommu->dgate);
> > +
> > +	list_for_each_safe(pos, n, &iommu->dm_list) {
> > +		struct dma_map_page *mlp;
> > +
> > +		mlp = list_entry(pos, struct dma_map_page, list);
> > +		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> > +				   dmp->dmaaddr, dmp->size)) {
> > +			ret = vfio_remove_dma_overlap(iommu, dmp->dmaaddr,
> > +						      dmp->size, mlp);
> > +			if (ret > 0)
> > +				npage -= NPAGE_TO_SIZE(ret);
> > +			if (ret < 0 || npage == 0)
> > +				break;
> > +		}
> > +	}
> > +	mutex_unlock(&iommu->dgate);
> > +	return ret > 0 ? 0 : ret;
> > +}
> > +
> > +int vfio_dma_map_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> > +{
> > +	int npage;
> > +	struct dma_map_page *mlp, *mmlp = NULL;
> > +	dma_addr_t daddr = dmp->dmaaddr;
> > +	unsigned long locked, lock_limit, vaddr = dmp->vaddr;
> > +	size_t size = dmp->size;
> > +	int ret = 0, rdwr = dmp->flags & VFIO_DMA_MAP_FLAG_WRITE;
> > +
> > +	if (vaddr & (PAGE_SIZE-1))
> > +		return -EINVAL;
> > +	if (daddr & (PAGE_SIZE-1))
> > +		return -EINVAL;
> > +	if (size & (PAGE_SIZE-1))
> > +		return -EINVAL;
> > +
> > +	npage = size >> PAGE_SHIFT;
> > +	if (!npage)
> > +		return -EINVAL;
> > +
> > +	if (!iommu)
> > +		return -EINVAL;
> > +
> > +	mutex_lock(&iommu->dgate);
> > +
> > +	if (vfio_find_dma(iommu, daddr, size)) {
> > +		ret = -EBUSY;
> > +		goto out_lock;
> > +	}
> > +
> > +	/* account for locked pages */
> > +	locked = current->mm->locked_vm + npage;
> > +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > +	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> > +		printk(KERN_WARNING "%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> > +			__func__, rlimit(RLIMIT_MEMLOCK));
> > +		ret = -ENOMEM;
> > +		goto out_lock;
> > +	}
> > +
> > +	ret = vfio_dma_map(iommu, daddr, vaddr, npage, rdwr);
> > +	if (ret)
> > +		goto out_lock;
> > +
> > +	/* Check if we abut a region below */
> > +	if (daddr) {
> > +		mlp = vfio_find_dma(iommu, daddr - 1, 1);
> > +		if (mlp && mlp->rdwr == rdwr &&
> > +		    mlp->vaddr + NPAGE_TO_SIZE(mlp->npage) == vaddr) {
> > +
> > +			mlp->npage += npage;
> > +			daddr = mlp->daddr;
> > +			vaddr = mlp->vaddr;
> > +			npage = mlp->npage;
> > +			size = NPAGE_TO_SIZE(npage);
> > +
> > +			mmlp = mlp;
> > +		}
> > +	}
> > +
> > +	if (daddr + size) {
> > +		mlp = vfio_find_dma(iommu, daddr + size, 1);
> > +		if (mlp && mlp->rdwr == rdwr && mlp->vaddr == vaddr + size) {
> > +
> > +			mlp->npage += npage;
> > +			mlp->daddr = daddr;
> > +			mlp->vaddr = vaddr;
> > +
> > +			/* If merged above and below, remove previously
> > +			 * merged entry.  New entry covers it.  */
> > +			if (mmlp) {
> > +				list_del(&mmlp->list);
> > +				kfree(mmlp);
> > +			}
> > +			mmlp = mlp;
> > +		}
> > +	}
> > +
> > +	if (!mmlp) {
> > +		mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
> > +		if (!mlp) {
> > +			ret = -ENOMEM;
> > +			vfio_dma_unmap(iommu, daddr, npage, rdwr);
> > +			goto out_lock;
> > +		}
> > +
> > +		mlp->npage = npage;
> > +		mlp->daddr = daddr;
> > +		mlp->vaddr = vaddr;
> > +		mlp->rdwr = rdwr;
> > +		list_add(&mlp->list, &iommu->dm_list);
> > +	}
> > +
> > +out_lock:
> > +	mutex_unlock(&iommu->dgate);
> > +	return ret;
> > +}
> > +
> > +static int vfio_iommu_release(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_iommu *iommu = filep->private_data;
> > +
> > +	vfio_release_iommu(iommu);
> > +	return 0;
> > +}
> > +
> > +static long vfio_iommu_unl_ioctl(struct file *filep,
> > +				 unsigned int cmd, unsigned long arg)
> > +{
> > +	struct vfio_iommu *iommu = filep->private_data;
> > +	int ret = -ENOSYS;
> > +
> > +        if (cmd == VFIO_IOMMU_GET_FLAGS) {
> 
> Something is weird with the tabbing here..

Indeed, the joys of switching between kernel and qemu ;)  fixed

> > +                u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;
> > +
> > +                ret = put_user(flags, (u64 __user *)arg);
> > +
> > +        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
> > +		struct vfio_dma_map dm;
> > +
> > +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> > +			return -EFAULT;
> > +
> > +		ret = vfio_dma_map_dm(iommu, &dm);
> > +
> > +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> > +			ret = -EFAULT;
> > +
> > +	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
> > +		struct vfio_dma_map dm;
> > +
> > +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> > +			return -EFAULT;
> > +
> > +		ret = vfio_dma_unmap_dm(iommu, &dm);
> > +
> > +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> > +			ret = -EFAULT;
> > +	}
> > +	return ret;
> > +}
> > +
> > +#ifdef CONFIG_COMPAT
> > +static long vfio_iommu_compat_ioctl(struct file *filep,
> > +				    unsigned int cmd, unsigned long arg)
> > +{
> > +	arg = (unsigned long)compat_ptr(arg);
> > +	return vfio_iommu_unl_ioctl(filep, cmd, arg);
> > +}
> > +#endif	/* CONFIG_COMPAT */
> > +
> > +const struct file_operations vfio_iommu_fops = {
> > +	.owner		= THIS_MODULE,
> > +	.release	= vfio_iommu_release,
> > +	.unlocked_ioctl	= vfio_iommu_unl_ioctl,
> > +#ifdef CONFIG_COMPAT
> > +	.compat_ioctl	= vfio_iommu_compat_ioctl,
> > +#endif
> > +};
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > new file mode 100644
> > index 0000000..6169356
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -0,0 +1,1151 @@
> > +/*
> > + * VFIO framework
> > + *
> > + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> > + *     Author: Alex Williamson <alex.williamson@redhat.com>
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + *
> > + * Derived from original vfio:
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs@cisco.com
> > + */
> > +
> > +#include <linux/cdev.h>
> > +#include <linux/compat.h>
> > +#include <linux/device.h>
> > +#include <linux/file.h>
> > +#include <linux/anon_inodes.h>
> > +#include <linux/fs.h>
> > +#include <linux/idr.h>
> > +#include <linux/iommu.h>
> > +#include <linux/mm.h>
> > +#include <linux/module.h>
> > +#include <linux/slab.h>
> > +#include <linux/string.h>
> > +#include <linux/uaccess.h>
> > +#include <linux/vfio.h>
> > +#include <linux/wait.h>
> > +
> > +#include "vfio_private.h"
> > +
> > +#define DRIVER_VERSION	"0.2"
> > +#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
> > +#define DRIVER_DESC	"VFIO - User Level meta-driver"
> > +
> > +static int allow_unsafe_intrs;
> 
> __read_mostly

Ok

> > +module_param(allow_unsafe_intrs, int, 0);
> 
> S_IRUGO ?

I actually intended that to be S_IRUGO | S_IWUSR just like the kvm
parameter so it can be toggled at runtime.
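
ie. what I meant was:

	module_param(allow_unsafe_intrs, int, S_IRUGO | S_IWUSR);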

> > +MODULE_PARM_DESC(allow_unsafe_intrs,
> > +        "Allow use of IOMMUs which do not support interrupt remapping");
> > +
> > +static struct vfio {
> > +	dev_t			devt;
> > +	struct cdev		cdev;
> > +	struct list_head	group_list;
> > +	struct mutex		lock;
> > +	struct kref		kref;
> > +	struct class		*class;
> > +	struct idr		idr;
> > +	wait_queue_head_t	release_q;
> > +} vfio;
> 
> You probably want to move this below the 'vfio_group'
> as vfio contains the vfio_group.

Only via the group_list.  Are you suggesting for readability or to avoid
forward declarations (which we don't need between these two with current
ordering)?

> > +
> > +static const struct file_operations vfio_group_fops;
> > +extern const struct file_operations vfio_iommu_fops;
> > +
> > +struct vfio_group {
> > +	dev_t			devt;
> > +	unsigned int		groupid;
> > +	struct bus_type		*bus;
> > +	struct vfio_iommu	*iommu;
> > +	struct list_head	device_list;
> > +	struct list_head	iommu_next;
> > +	struct list_head	group_next;
> > +	int			refcnt;
> > +};
> > +
> > +struct vfio_device {
> > +	struct device			*dev;
> > +	const struct vfio_device_ops	*ops;
> > +	struct vfio_iommu		*iommu;
> > +	struct vfio_group		*group;
> > +	struct list_head		device_next;
> > +	bool				attached;
> > +	int				refcnt;
> > +	void				*device_data;
> > +};
> 
> And perhaps move this above vfio_group. As vfio_group
> contains a list of these structures?

These are inter-linked, so chicken and egg.  The current ordering is
more function based than definition based.  struct vfio is the highest
level object, groups are next, iommus and devices are next, but we need
to share iommus with the other file, so that lands in the header.

> > +
> > +/*
> > + * Helper functions called under vfio.lock
> > + */
> > +
> > +/* Return true if any devices within a group are opened */
> > +static bool __vfio_group_devs_inuse(struct vfio_group *group)
> > +{
> > +	struct list_head *pos;
> > +
> > +	list_for_each(pos, &group->device_list) {
> > +		struct vfio_device *device;
> > +
> > +		device = list_entry(pos, struct vfio_device, device_next);
> > +		if (device->refcnt)
> > +			return true;
> > +	}
> > +	return false;
> > +}
> > +
> > +/* Return true if any of the groups attached to an iommu are opened.
> > + * We can only tear apart merged groups when nothing is left open. */
> > +static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *pos;
> > +
> > +	list_for_each(pos, &iommu->group_list) {
> > +		struct vfio_group *group;
> > +
> > +		group = list_entry(pos, struct vfio_group, iommu_next);
> > +		if (group->refcnt)
> > +			return true;
> > +	}
> > +	return false;
> > +}
> > +
> > +/* An iommu is "in use" if it has a file descriptor open or if any of
> > + * the groups assigned to the iommu have devices open. */
> > +static bool __vfio_iommu_inuse(struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *pos;
> > +
> > +	if (iommu->refcnt)
> > +		return true;
> > +
> > +	list_for_each(pos, &iommu->group_list) {
> > +		struct vfio_group *group;
> > +
> > +		group = list_entry(pos, struct vfio_group, iommu_next);
> > +
> > +		if (__vfio_group_devs_inuse(group))
> > +			return true;
> > +	}
> > +	return false;
> > +}
> > +
> > +static void __vfio_group_set_iommu(struct vfio_group *group,
> > +				   struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *pos;
> > +
> > +	if (group->iommu)
> > +		list_del(&group->iommu_next);
> > +	if (iommu)
> > +		list_add(&group->iommu_next, &iommu->group_list);
> > +
> > +	group->iommu = iommu;
> > +
> > +	list_for_each(pos, &group->device_list) {
> > +		struct vfio_device *device;
> > +
> > +		device = list_entry(pos, struct vfio_device, device_next);
> > +		device->iommu = iommu;
> > +	}
> > +}
> > +
> > +static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,
> > +				    struct vfio_device *device)
> > +{
> > +	BUG_ON(!iommu->domain && device->attached);
> 
> Whoa. Heavy hammer there.
> 
> Perhaps WARN_ON as you do check it later on.

I think it's warranted, internal consistency is broken if we have a
device that thinks it's attached to an iommu domain that doesn't exist.
It should, of course, never happen and this isn't a performance path.

> > +
> > +	if (!iommu->domain || !device->attached)
> > +		return;
> > +
> > +	iommu_detach_device(iommu->domain, device->dev);
> > +	device->attached = false;
> > +}
> > +
> > +static void __vfio_iommu_detach_group(struct vfio_iommu *iommu,
> > +				      struct vfio_group *group)
> > +{
> > +	struct list_head *pos;
> > +
> > +	list_for_each(pos, &group->device_list) {
> > +		struct vfio_device *device;
> > +
> > +		device = list_entry(pos, struct vfio_device, device_next);
> > +		__vfio_iommu_detach_dev(iommu, device);
> > +	}
> > +}
> > +
> > +static int __vfio_iommu_attach_dev(struct vfio_iommu *iommu,
> > +				   struct vfio_device *device)
> > +{
> > +	int ret;
> > +
> > +	BUG_ON(device->attached);
> 
> How about:
> 
> WARN_ON(device->attached, "The engineer who wrote the user-space device driver is trying to register
> the device again! Tell him/her to stop please.\n");

I would almost demote this one to a WARN_ON, but userspace isn't in
control of attaching and detaching devices from the iommu.  That's a
side effect of getting the iommu or device file descriptor.  So again,
this is an internal consistency check and it should never happen,
regardless of userspace.

> > +
> > +	if (!iommu || !iommu->domain)
> > +		return -EINVAL;
> > +
> > +	ret = iommu_attach_device(iommu->domain, device->dev);
> > +	if (!ret)
> > +		device->attached = true;
> > +
> > +	return ret;
> > +}
> > +
> > +static int __vfio_iommu_attach_group(struct vfio_iommu *iommu,
> > +				     struct vfio_group *group)
> > +{
> > +	struct list_head *pos;
> > +
> > +	list_for_each(pos, &group->device_list) {
> > +		struct vfio_device *device;
> > +		int ret;
> > +
> > +		device = list_entry(pos, struct vfio_device, device_next);
> > +		ret = __vfio_iommu_attach_dev(iommu, device);
> > +		if (ret) {
> > +			__vfio_iommu_detach_group(iommu, group);
> > +			return ret;
> > +		}
> > +	}
> > +	return 0;
> > +}
> > +
> > +/* The iommu is viable, ie. ready to be configured, when all the devices
> > + * for all the groups attached to the iommu are bound to their vfio device
> > + * drivers (ex. vfio-pci).  This sets the device_data private data pointer. */
> > +static bool __vfio_iommu_viable(struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *gpos, *dpos;
> > +
> > +	list_for_each(gpos, &iommu->group_list) {
> > +		struct vfio_group *group;
> > +		group = list_entry(gpos, struct vfio_group, iommu_next);
> > +
> > +		list_for_each(dpos, &group->device_list) {
> > +			struct vfio_device *device;
> > +			device = list_entry(dpos,
> > +					    struct vfio_device, device_next);
> > +
> > +			if (!device->device_data)
> > +				return false;
> > +		}
> > +	}
> > +	return true;
> > +}
> > +
> > +static void __vfio_close_iommu(struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *pos;
> > +
> > +	if (!iommu->domain)
> > +		return;
> > +
> > +	list_for_each(pos, &iommu->group_list) {
> > +		struct vfio_group *group;
> > +		group = list_entry(pos, struct vfio_group, iommu_next);
> > +
> > +		__vfio_iommu_detach_group(iommu, group);
> > +	}
> > +
> > +	vfio_iommu_unmapall(iommu);
> > +
> > +	iommu_domain_free(iommu->domain);
> > +	iommu->domain = NULL;
> > +	iommu->mm = NULL;
> > +}
> > +
> > +/* Open the IOMMU.  This gates all access to the iommu or device file
> > + * descriptors and sets current->mm as the exclusive user. */
> > +static int __vfio_open_iommu(struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *pos;
> > +	int ret;
> > +
> > +	if (!__vfio_iommu_viable(iommu))
> > +		return -EBUSY;
> > +
> > +	if (iommu->domain)
> > +		return -EINVAL;
> > +
> > +	iommu->domain = iommu_domain_alloc(iommu->bus);
> > +	if (!iommu->domain)
> > +		return -EFAULT;
> 
> ENOMEM?

Yeah, probably more appropriate.

> > +
> > +	list_for_each(pos, &iommu->group_list) {
> > +		struct vfio_group *group;
> > +		group = list_entry(pos, struct vfio_group, iommu_next);
> > +
> > +		ret = __vfio_iommu_attach_group(iommu, group);
> > +		if (ret) {
> > +			__vfio_close_iommu(iommu);
> > +			return ret;
> > +		}
> > +	}
> > +
> > +	if (!allow_unsafe_intrs &&
> > +	    !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
> > +		__vfio_close_iommu(iommu);
> > +		return -EFAULT;
> > +	}
> > +
> > +	iommu->cache = (iommu_domain_has_cap(iommu->domain,
> > +					     IOMMU_CAP_CACHE_COHERENCY) != 0);
> > +	iommu->mm = current->mm;
> > +
> > +	return 0;
> > +}
> > +
> > +/* Actively try to tear down the iommu and merged groups.  If there are no
> > + * open iommu or device fds, we close the iommu.  If we close the iommu and
> > + * there are also no open group fds, we can further dissolve the group to
> > + * iommu association and free the iommu data structure. */
> > +static int __vfio_try_dissolve_iommu(struct vfio_iommu *iommu)
> > +{
> > +
> > +	if (__vfio_iommu_inuse(iommu))
> > +		return -EBUSY;
> > +
> > +	__vfio_close_iommu(iommu);
> > +
> > +	if (!__vfio_iommu_groups_inuse(iommu)) {
> > +		struct list_head *pos, *ppos;
> > +
> > +		list_for_each_safe(pos, ppos, &iommu->group_list) {
> > +			struct vfio_group *group;
> > +
> > +			group = list_entry(pos, struct vfio_group, iommu_next);
> > +			__vfio_group_set_iommu(group, NULL);
> > +		}
> > +
> > +
> > +		kfree(iommu);
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static struct vfio_device *__vfio_lookup_dev(struct device *dev)
> > +{
> > +	struct list_head *gpos;
> > +	unsigned int groupid;
> > +
> > +	if (iommu_device_group(dev, &groupid))
> 
> Hmm, where is this defined? v3.2-rc1 does not seem to have it?

From patch header:

        Fingers crossed, this is the last RFC for VFIO, but we need
        the iommu group support before this can go upstream
        (http://lkml.indiana.edu/hypermail/linux/kernel/1110.2/02303.html),
        hoping this helps push that along.

hat's the one bit keeping me from doing a non-RFC of the core, besides
fixing all these comments ;)

> > +		return NULL;
> > +
> > +	list_for_each(gpos, &vfio.group_list) {
> > +		struct vfio_group *group;
> > +		struct list_head *dpos;
> > +
> > +		group = list_entry(gpos, struct vfio_group, group_next);
> > +
> > +		if (group->groupid != groupid)
> > +			continue;
> > +
> > +		list_for_each(dpos, &group->device_list) {
> > +			struct vfio_device *device;
> > +
> > +			device = list_entry(dpos,
> > +					    struct vfio_device, device_next);
> > +
> > +			if (device->dev == dev)
> > +				return device;
> > +		}
> > +	}
> > +	return NULL;
> > +}
> > +
> > +/* All release paths simply decrement the refcnt, attempt to teardown
> > + * the iommu and merged groups, and wakeup anything that might be
> > + * waiting if we successfully dissolve anything. */
> > +static int vfio_do_release(int *refcnt, struct vfio_iommu *iommu)
> > +{
> > +	bool wake;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	(*refcnt)--;
> > +	wake = (__vfio_try_dissolve_iommu(iommu) == 0);
> > +
> > +	mutex_unlock(&vfio.lock);
> > +
> > +	if (wake)
> > +		wake_up(&vfio.release_q);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Device fops - passthrough to vfio device driver w/ device_data
> > + */
> > +static int vfio_device_release(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	vfio_do_release(&device->refcnt, device->iommu);
> > +
> > +	device->ops->put(device->device_data);
> > +
> > +	return 0;
> > +}
> > +
> > +static long vfio_device_unl_ioctl(struct file *filep,
> > +				  unsigned int cmd, unsigned long arg)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	return device->ops->ioctl(device->device_data, cmd, arg);
> > +}
> > +
> > +static ssize_t vfio_device_read(struct file *filep, char __user *buf,
> > +				size_t count, loff_t *ppos)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	return device->ops->read(device->device_data, buf, count, ppos);
> > +}
> > +
> > +static ssize_t vfio_device_write(struct file *filep, const char __user *buf,
> > +				 size_t count, loff_t *ppos)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	return device->ops->write(device->device_data, buf, count, ppos);
> > +}
> > +
> > +static int vfio_device_mmap(struct file *filep, struct vm_area_struct *vma)
> > +{
> > +	struct vfio_device *device = filep->private_data;
> > +
> > +	return device->ops->mmap(device->device_data, vma);
> > +}
> > +	
> > +#ifdef CONFIG_COMPAT
> > +static long vfio_device_compat_ioctl(struct file *filep,
> > +				     unsigned int cmd, unsigned long arg)
> > +{
> > +	arg = (unsigned long)compat_ptr(arg);
> > +	return vfio_device_unl_ioctl(filep, cmd, arg);
> > +}
> > +#endif	/* CONFIG_COMPAT */
> > +
> > +const struct file_operations vfio_device_fops = {
> > +	.owner		= THIS_MODULE,
> > +	.release	= vfio_device_release,
> > +	.read		= vfio_device_read,
> > +	.write		= vfio_device_write,
> > +	.unlocked_ioctl	= vfio_device_unl_ioctl,
> > +#ifdef CONFIG_COMPAT
> > +	.compat_ioctl	= vfio_device_compat_ioctl,
> > +#endif
> > +	.mmap		= vfio_device_mmap,
> > +};
> > +
> > +/*
> > + * Group fops
> > + */
> > +static int vfio_group_open(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_group *group;
> > +	int ret = 0;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	group = idr_find(&vfio.idr, iminor(inode));
> > +
> > +	if (!group) {
> > +		ret = -ENODEV;
> > +		goto out;
> > +	}
> > +
> > +	filep->private_data = group;
> > +
> > +	if (!group->iommu) {
> > +		struct vfio_iommu *iommu;
> > +
> > +		iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> > +		if (!iommu) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +		INIT_LIST_HEAD(&iommu->group_list);
> > +		INIT_LIST_HEAD(&iommu->dm_list);
> > +		mutex_init(&iommu->dgate);
> > +		iommu->bus = group->bus;
> > +		__vfio_group_set_iommu(group, iommu);
> > +	}
> > +	group->refcnt++;
> > +
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +
> > +	return ret;
> > +}
> > +
> > +static int vfio_group_release(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_group *group = filep->private_data;
> > +
> > +	return vfio_do_release(&group->refcnt, group->iommu);
> > +}
> > +
> > +/* Attempt to merge the group pointed to by fd into group.  The merge-ee
> > + * group must not have an iommu or any devices open because we cannot
> > + * maintain that context across the merge.  The merge-er group can be
> > + * in use. */
> > +static int vfio_group_merge(struct vfio_group *group, int fd)
> > +{
> > +	struct vfio_group *new;
> > +	struct vfio_iommu *old_iommu;
> > +	struct file *file;
> > +	int ret = 0;
> > +	bool opened = false;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	file = fget(fd);
> > +	if (!file) {
> > +		ret = -EBADF;
> > +		goto out_noput;
> > +	}
> > +
> > +	/* Sanity check, is this really our fd? */
> > +	if (file->f_op != &vfio_group_fops) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	new = file->private_data;
> > +
> > +	if (!new || new == group || !new->iommu ||
> > +	    new->iommu->domain || new->bus != group->bus) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* We need to attach all the devices to each domain separately
> > +	 * in order to validate that the capabilities match for both.  */
> > +	ret = __vfio_open_iommu(new->iommu);
> > +	if (ret)
> > +		goto out;
> > +
> > +	if (!group->iommu->domain) {
> > +		ret = __vfio_open_iommu(group->iommu);
> > +		if (ret)
> > +			goto out;
> > +		opened = true;
> > +	}
> > +
> > +	/* If cache coherency doesn't match we'd potentialy need to
> > +	 * remap existing iommu mappings in the merge-er domain.
> > +	 * Poor return to bother trying to allow this currently. */
> > +	if (iommu_domain_has_cap(group->iommu->domain,
> > +				 IOMMU_CAP_CACHE_COHERENCY) !=
> > +	    iommu_domain_has_cap(new->iommu->domain,
> > +				 IOMMU_CAP_CACHE_COHERENCY)) {
> > +		__vfio_close_iommu(new->iommu);
> > +		if (opened)
> > +			__vfio_close_iommu(group->iommu);
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* Close the iommu for the merge-ee and attach all its devices
> > +	 * to the merge-er iommu. */
> > +	__vfio_close_iommu(new->iommu);
> > +
> > +	ret = __vfio_iommu_attach_group(group->iommu, new);
> > +	if (ret)
> > +		goto out;
> > +
> > +	/* set_iommu unlinks new from the iommu, so save a pointer to it */
> > +	old_iommu = new->iommu;
> > +	__vfio_group_set_iommu(new, group->iommu);
> > +	kfree(old_iommu);
> > +
> > +out:
> > +	fput(file);
> > +out_noput:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Unmerge the group pointed to by fd from group. */
> > +static int vfio_group_unmerge(struct vfio_group *group, int fd)
> > +{
> > +	struct vfio_group *new;
> > +	struct vfio_iommu *new_iommu;
> > +	struct file *file;
> > +	int ret = 0;
> > +
> > +	/* Since the merge-out group is already opened, it needs to
> > +	 * have an iommu struct associated with it. */
> > +	new_iommu = kzalloc(sizeof(*new_iommu), GFP_KERNEL);
> > +	if (!new_iommu)
> > +		return -ENOMEM;
> > +
> > +	INIT_LIST_HEAD(&new_iommu->group_list);
> > +	INIT_LIST_HEAD(&new_iommu->dm_list);
> > +	mutex_init(&new_iommu->dgate);
> > +	new_iommu->bus = group->bus;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	file = fget(fd);
> > +	if (!file) {
> > +		ret = -EBADF;
> > +		goto out_noput;
> > +	}
> > +
> > +	/* Sanity check, is this really our fd? */
> > +	if (file->f_op != &vfio_group_fops) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	new = file->private_data;
> > +	if (!new || new == group || new->iommu != group->iommu) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* We can't merge-out a group with devices still in use. */
> > +	if (__vfio_group_devs_inuse(new)) {
> > +		ret = -EBUSY;
> > +		goto out;
> > +	}
> > +
> > +	__vfio_iommu_detach_group(group->iommu, new);
> > +	__vfio_group_set_iommu(new, new_iommu);
> > +
> > +out:
> > +	fput(file);
> > +out_noput:
> > +	if (ret)
> > +		kfree(new_iommu);
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Get a new iommu file descriptor.  This will open the iommu, setting
> > + * the current->mm ownership if it's not already set. */
> > +static int vfio_group_get_iommu_fd(struct vfio_group *group)
> > +{
> > +	int ret = 0;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	if (!group->iommu->domain) {
> > +		ret = __vfio_open_iommu(group->iommu);
> > +		if (ret)
> > +			goto out;
> > +	}
> > +
> > +	ret = anon_inode_getfd("[vfio-iommu]", &vfio_iommu_fops,
> > +			       group->iommu, O_RDWR);
> > +	if (ret < 0)
> > +		goto out;
> > +
> > +	group->iommu->refcnt++;
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Get a new device file descriptor.  This will open the iommu, setting
> > + * the current->mm ownership if it's not already set.  It's difficult to
> > + * specify the requirements for matching a user supplied buffer to a
> > + * device, so we use a vfio driver callback to test for a match.  For
> > + * PCI, dev_name(dev) is unique, but other drivers may require including
> > + * a parent device string. */
> > +static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
> > +{
> > +	struct vfio_iommu *iommu = group->iommu;
> > +	struct list_head *gpos;
> > +	int ret = -ENODEV;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	if (!iommu->domain) {
> > +		ret = __vfio_open_iommu(iommu);
> > +		if (ret)
> > +			goto out;
> > +	}
> > +
> > +	list_for_each(gpos, &iommu->group_list) {
> > +		struct list_head *dpos;
> > +
> > +		group = list_entry(gpos, struct vfio_group, iommu_next);
> > +
> > +		list_for_each(dpos, &group->device_list) {
> > +			struct vfio_device *device;
> > +
> > +			device = list_entry(dpos,
> > +					    struct vfio_device, device_next);
> > +
> > +			if (device->ops->match(device->dev, buf)) {
> > +				struct file *file;
> > +
> > +				if (device->ops->get(device->device_data)) {
> > +					ret = -EFAULT;
> > +					goto out;
> > +				}
> > +
> > +				/* We can't use anon_inode_getfd(), like above
> > +				 * because we need to modify the f_mode flags
> > +				 * directly to allow more than just ioctls */
> > +				ret = get_unused_fd();
> > +				if (ret < 0) {
> > +					device->ops->put(device->device_data);
> > +					goto out;
> > +				}
> > +
> > +				file = anon_inode_getfile("[vfio-device]",
> > +							  &vfio_device_fops,
> > +							  device, O_RDWR);
> > +				if (IS_ERR(file)) {
> > +					put_unused_fd(ret);
> > +					ret = PTR_ERR(file);
> > +					device->ops->put(device->device_data);
> > +					goto out;
> > +				}
> > +
> > +				/* Todo: add an anon_inode interface to do
> > +				 * this.  Appears to be missing by lack of
> > +				 * need rather than explicitly prevented.
> > +				 * Now there's need. */
> > +				file->f_mode |= (FMODE_LSEEK |
> > +						 FMODE_PREAD |
> > +						 FMODE_PWRITE);
> > +
> > +				fd_install(ret, file);
> > +
> > +				device->refcnt++;
> > +				goto out;
> > +			}
> > +		}
> > +	}
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +static long vfio_group_unl_ioctl(struct file *filep,
> > +				 unsigned int cmd, unsigned long arg)
> > +{
> > +	struct vfio_group *group = filep->private_data;
> > +
> > +	if (cmd == VFIO_GROUP_GET_FLAGS) {
> > +		u64 flags = 0;
> > +
> > +		mutex_lock(&vfio.lock);
> > +		if (__vfio_iommu_viable(group->iommu))
> > +			flags |= VFIO_GROUP_FLAGS_VIABLE;
> > +		mutex_unlock(&vfio.lock);
> > +
> > +		if (group->iommu->mm)
> > +			flags |= VFIO_GROUP_FLAGS_MM_LOCKED;
> > +
> > +		return put_user(flags, (u64 __user *)arg);
> > +	}
> > +		
> > +	/* Below commands are restricted once the mm is set */
> > +	if (group->iommu->mm && group->iommu->mm != current->mm)
> > +		return -EPERM;
> > +
> > +	if (cmd == VFIO_GROUP_MERGE || cmd == VFIO_GROUP_UNMERGE) {
> > +		int fd;
> > +		
> > +		if (get_user(fd, (int __user *)arg))
> > +			return -EFAULT;
> > +		if (fd < 0)
> > +			return -EINVAL;
> > +
> > +		if (cmd == VFIO_GROUP_MERGE)
> > +			return vfio_group_merge(group, fd);
> > +		else
> > +			return vfio_group_unmerge(group, fd);
> > +	} else if (cmd == VFIO_GROUP_GET_IOMMU_FD) {
> > +		return vfio_group_get_iommu_fd(group);
> > +	} else if (cmd == VFIO_GROUP_GET_DEVICE_FD) {
> > +		char *buf;
> > +		int ret;
> > +
> > +		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
> > +		if (IS_ERR(buf))
> > +			return PTR_ERR(buf);
> > +
> > +		ret = vfio_group_get_device_fd(group, buf);
> > +		kfree(buf);
> > +		return ret;
> > +	}
> > +
> > +	return -ENOSYS;
> > +}
> > +
> > +#ifdef CONFIG_COMPAT
> > +static long vfio_group_compat_ioctl(struct file *filep,
> > +				    unsigned int cmd, unsigned long arg)
> > +{
> > +	arg = (unsigned long)compat_ptr(arg);
> > +	return vfio_group_unl_ioctl(filep, cmd, arg);
> > +}
> > +#endif	/* CONFIG_COMPAT */
> > +
> > +static const struct file_operations vfio_group_fops = {
> > +	.owner		= THIS_MODULE,
> > +	.open		= vfio_group_open,
> > +	.release	= vfio_group_release,
> > +	.unlocked_ioctl	= vfio_group_unl_ioctl,
> > +#ifdef CONFIG_COMPAT
> > +	.compat_ioctl	= vfio_group_compat_ioctl,
> > +#endif
> > +};
> > +
> > +/* iommu fd release hook */
> > +int vfio_release_iommu(struct vfio_iommu *iommu)
> > +{
> > +	return vfio_do_release(&iommu->refcnt, iommu);
> > +}
> > +
> > +/*
> > + * VFIO driver API
> > + */
> > +
> > +/* Add a new device to the vfio framework with associated vfio driver
> > + * callbacks.  This is the entry point for vfio drivers to register devices. */
> > +int vfio_group_add_dev(struct device *dev, const struct vfio_device_ops *ops)
> > +{
> > +	struct list_head *pos;
> > +	struct vfio_group *group = NULL;
> > +	struct vfio_device *device = NULL;
> > +	unsigned int groupid;
> > +	int ret = 0;
> > +	bool new_group = false;
> > +
> > +	if (!ops)
> > +		return -EINVAL;
> > +
> > +	if (iommu_device_group(dev, &groupid))
> > +		return -ENODEV;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	list_for_each(pos, &vfio.group_list) {
> > +		group = list_entry(pos, struct vfio_group, group_next);
> > +		if (group->groupid == groupid)
> > +			break;
> > +		group = NULL;
> > +	}
> > +
> > +	if (!group) {
> > +		int minor;
> > +
> > +		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +
> > +		group = kzalloc(sizeof(*group), GFP_KERNEL);
> > +		if (!group) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +
> > +		group->groupid = groupid;
> > +		INIT_LIST_HEAD(&group->device_list);
> > +
> > +		ret = idr_get_new(&vfio.idr, group, &minor);
> > +		if (ret == 0 && minor > MINORMASK) {
> > +			idr_remove(&vfio.idr, minor);
> > +			kfree(group);
> > +			ret = -ENOSPC;
> > +			goto out;
> > +		}
> > +
> > +		group->devt = MKDEV(MAJOR(vfio.devt), minor);
> > +		device_create(vfio.class, NULL, group->devt,
> > +			      group, "%u", groupid);
> > +
> > +		group->bus = dev->bus;
> 
> 
> Oh, so that is how the IOMMU iommu_ops get copied! You might
> want to mention that - I was not sure where the 'handoff'
> was done to insert a device so that it can do iommu_ops properly.
> 
> Ok, so the time when a device is detected whether it can do
> IOMMU is when we try to open it - as that is when iommu_domain_alloc
> is called which can return NULL if the iommu_ops is not set.
> 
> So what about devices that don't have an iommu_ops? Say they
> are using SWIOTLB? (like the AMD-Vi sometimes does if the
> device is not on its list).
> 
> Can we use iommu_present?

I'm not sure I'm following your revelation ;)  Take a look at the
pointer to iommu_device_group I pasted above, or these:

https://github.com/awilliam/linux-vfio/commit/37dd08c90d149caaed7779d4f38850a8f7ed0fa5
https://github.com/awilliam/linux-vfio/commit/63ca8543533d8130db23d7949133e548c3891c97
https://github.com/awilliam/linux-vfio/commit/8d7d70eb8e714fbf8710848a06f8cab0c741631e

That call includes an iommu_present() check, so if there's no iommu or
the iommu can't provide a groupid, the device is skipped over from vfio
(can't be used).

So the ordering is:

 - bus driver registers device
   - if it has an iommu group, add it to the vfio device/group tracking

 - group gets opened
   - user gets iommu or device fd results in iommu_domain_alloc

Devices without iommu_ops don't get to play in the vfio world.
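
In rough code terms, a bus driver plugs into this something like the
sketch below (not the actual vfio-pci code; the foo_* ops and data are
made-up placeholders):

static int foo_vfio_probe(struct device *dev)
{
	int ret;

	/* Fails (-ENODEV) if there's no iommu_ops/groupid for the device,
	 * otherwise finds or creates the group and adds the device to it. */
	ret = vfio_group_add_dev(dev, &foo_vfio_device_ops);
	if (ret)
		return ret;

	/* Marks the device viable and associates our private context,
	 * which is later handed back to the vfio_device_ops callbacks. */
	return vfio_bind_dev(dev, foo_device_data);
}

static void foo_vfio_remove(struct device *dev)
{
	/* May block until the group/iommu/device fds are released */
	void *device_data = vfio_unbind_dev(dev);

	vfio_group_del_dev(dev);
	/* ...free device_data... */
}

The iommu_domain_alloc() only happens later, when the user asks for the
iommu or a device fd.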

> > +		list_add(&group->group_next, &vfio.group_list);
> > +		new_group = true;
> > +	} else {
> > +		if (group->bus != dev->bus) {
> > +			printk(KERN_WARNING
> > +			       "Error: IOMMU group ID conflict.  Group ID %u "
> > +				"on both bus %s and %s\n", groupid,
> > +				group->bus->name, dev->bus->name);
> > +			ret = -EFAULT;
> > +			goto out;
> > +		}
> > +
> > +		list_for_each(pos, &group->device_list) {
> > +			device = list_entry(pos,
> > +					    struct vfio_device, device_next);
> > +			if (device->dev == dev)
> > +				break;
> > +			device = NULL;
> > +		}
> > +	}
> > +
> > +	if (!device) {
> > +		if (__vfio_group_devs_inuse(group) ||
> > +		    (group->iommu && group->iommu->refcnt)) {
> > +			printk(KERN_WARNING
> > +			       "Adding device %s to group %u while group is already in use!!\n",
> > +			       dev_name(dev), group->groupid);
> > +			/* XXX How to prevent other drivers from claiming? */
> > +		}
> > +
> > +		device = kzalloc(sizeof(*device), GFP_KERNEL);
> > +		if (!device) {
> > +			/* If we just created this group, tear it down */
> > +			if (new_group) {
> > +				list_del(&group->group_next);
> > +				device_destroy(vfio.class, group->devt);
> > +				idr_remove(&vfio.idr, MINOR(group->devt));
> > +				kfree(group);
> > +			}
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +
> > +		list_add(&device->device_next, &group->device_list);
> > +		device->dev = dev;
> > +		device->ops = ops;
> > +		device->iommu = group->iommu; /* NULL if new */
> > +		__vfio_iommu_attach_dev(group->iommu, device);
> > +	}
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_group_add_dev);
> > +
> > +/* Remove a device from the vfio framework */
> > +void vfio_group_del_dev(struct device *dev)
> > +{
> > +	struct list_head *pos;
> > +	struct vfio_group *group = NULL;
> > +	struct vfio_device *device = NULL;
> > +	unsigned int groupid;
> > +
> > +	if (iommu_device_group(dev, &groupid))
> > +		return;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	list_for_each(pos, &vfio.group_list) {
> > +		group = list_entry(pos, struct vfio_group, group_next);
> > +		if (group->groupid == groupid)
> > +			break;
> > +		group = NULL;
> > +	}
> > +
> > +	if (!group)
> > +		goto out;
> > +
> > +	list_for_each(pos, &group->device_list) {
> > +		device = list_entry(pos, struct vfio_device, device_next);
> > +		if (device->dev == dev)
> > +			break;
> > +		device = NULL;
> > +	}
> > +
> > +	if (!device)
> > +		goto out;
> > +
> > +	BUG_ON(device->refcnt);
> > +
> > +	if (device->attached)
> > +		__vfio_iommu_detach_dev(group->iommu, device);
> > +
> > +	list_del(&device->device_next);
> > +	kfree(device);
> > +
> > +	/* If this was the only device in the group, remove the group.
> > +	 * Note that we intentionally unmerge empty groups here if the
> > +	 * group fd isn't opened. */
> > +	if (list_empty(&group->device_list) && group->refcnt == 0) {
> > +		struct vfio_iommu *iommu = group->iommu;
> > +
> > +		if (iommu) {
> > +			__vfio_group_set_iommu(group, NULL);
> > +			__vfio_try_dissolve_iommu(iommu);
> > +		}
> > +
> > +		device_destroy(vfio.class, group->devt);
> > +		idr_remove(&vfio.idr, MINOR(group->devt));
> > +		list_del(&group->group_next);
> > +		kfree(group);
> > +	}
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_group_del_dev);
> > +
> > +/* When a device is bound to a vfio device driver (ex. vfio-pci), this
> > + * entry point is used to mark the device usable (viable).  The vfio
> > + * device driver associates a private device_data struct with the device
> > + * here, which will later be returned for vfio_device_fops callbacks. */
> > +int vfio_bind_dev(struct device *dev, void *device_data)
> > +{
> > +	struct vfio_device *device;
> > +	int ret = -EINVAL;
> > +
> > +	BUG_ON(!device_data);
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	device = __vfio_lookup_dev(dev);
> > +
> > +	BUG_ON(!device);
> > +
> > +	ret = dev_set_drvdata(dev, device);
> > +	if (!ret)
> > +		device->device_data = device_data;
> > +
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_bind_dev);
> > +
> > +/* A device is only removeable if the iommu for the group is not in use. */
> > +static bool vfio_device_removeable(struct vfio_device *device)
> > +{
> > +	bool ret = true;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	if (device->iommu && __vfio_iommu_inuse(device->iommu))
> > +		ret = false;
> > +
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Notify vfio that a device is being unbound from the vfio device driver
> > + * and return the device private device_data pointer.  If the group is
> > + * in use, we need to block or take other measures to make it safe for
> > + * the device to be removed from the iommu. */
> > +void *vfio_unbind_dev(struct device *dev)
> > +{
> > +	struct vfio_device *device = dev_get_drvdata(dev);
> > +	void *device_data;
> > +
> > +	BUG_ON(!device);
> > +
> > +again:
> > +	if (!vfio_device_removeable(device)) {
> > +		/* XXX signal for all devices in group to be removed or
> > +		 * resort to killing the process holding the device fds.
> > +		 * For now just block waiting for releases to wake us. */
> > +		wait_event(vfio.release_q, vfio_device_removeable(device));
> > +	}
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	/* Need to re-check that the device is still removeable under lock. */
> > +	if (device->iommu && __vfio_iommu_inuse(device->iommu)) {
> > +		mutex_unlock(&vfio.lock);
> > +		goto again;
> > +	}
> > +
> > +	device_data = device->device_data;
> > +
> > +	device->device_data = NULL;
> > +	dev_set_drvdata(dev, NULL);
> > +
> > +	mutex_unlock(&vfio.lock);
> > +	return device_data;
> > +}
> > +EXPORT_SYMBOL_GPL(vfio_unbind_dev);
> > +
> > +/*
> > + * Module/class support
> > + */
> > +static void vfio_class_release(struct kref *kref)
> > +{
> > +	class_destroy(vfio.class);
> > +	vfio.class = NULL;
> > +}
> > +
> > +static char *vfio_devnode(struct device *dev, mode_t *mode)
> > +{
> > +	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
> > +}
> > +
> > +static int __init vfio_init(void)
> > +{
> > +	int ret;
> > +
> > +	idr_init(&vfio.idr);
> > +	mutex_init(&vfio.lock);
> > +	INIT_LIST_HEAD(&vfio.group_list);
> > +	init_waitqueue_head(&vfio.release_q);
> > +
> > +	kref_init(&vfio.kref);
> > +	vfio.class = class_create(THIS_MODULE, "vfio");
> > +	if (IS_ERR(vfio.class)) {
> > +		ret = PTR_ERR(vfio.class);
> > +		goto err_class;
> > +	}
> > +
> > +	vfio.class->devnode = vfio_devnode;
> > +
> > +	/* FIXME - how many minors to allocate... all of them! */
> > +	ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
> > +	if (ret)
> > +		goto err_chrdev;
> > +
> > +	cdev_init(&vfio.cdev, &vfio_group_fops);
> > +	ret = cdev_add(&vfio.cdev, vfio.devt, MINORMASK);
> > +	if (ret)
> > +		goto err_cdev;
> > +
> > +	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
> > +
> > +	return 0;
> > +
> > +err_cdev:
> > +	unregister_chrdev_region(vfio.devt, MINORMASK);
> > +err_chrdev:
> > +	kref_put(&vfio.kref, vfio_class_release);
> > +err_class:
> > +	return ret;
> > +}
> > +
> > +static void __exit vfio_cleanup(void)
> > +{
> > +	struct list_head *gpos, *gppos;
> > +
> > +	list_for_each_safe(gpos, gppos, &vfio.group_list) {
> > +		struct vfio_group *group;
> > +		struct list_head *dpos, *dppos;
> > +
> > +		group = list_entry(gpos, struct vfio_group, group_next);
> > +
> > +		list_for_each_safe(dpos, dppos, &group->device_list) {
> > +			struct vfio_device *device;
> > +
> > +			device = list_entry(dpos,
> > +					    struct vfio_device, device_next);
> > +			vfio_group_del_dev(device->dev);
> > +		}
> > +	}
> > +
> > +	idr_destroy(&vfio.idr);
> > +	cdev_del(&vfio.cdev);
> > +	unregister_chrdev_region(vfio.devt, MINORMASK);
> > +	kref_put(&vfio.kref, vfio_class_release);
> > +}
> > +
> > +module_init(vfio_init);
> > +module_exit(vfio_cleanup);
> > +
> > +MODULE_VERSION(DRIVER_VERSION);
> > +MODULE_LICENSE("GPL v2");
> > +MODULE_AUTHOR(DRIVER_AUTHOR);
> > +MODULE_DESCRIPTION(DRIVER_DESC);
> > diff --git a/drivers/vfio/vfio_private.h b/drivers/vfio/vfio_private.h
> > new file mode 100644
> > index 0000000..350ad67
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_private.h
> > @@ -0,0 +1,34 @@
> > +/*
> > + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> > + *     Author: Alex Williamson <alex.williamson@redhat.com>
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + *
> > + * Derived from original vfio:
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs@cisco.com
> > + */
> > +
> > +#include <linux/list.h>
> > +#include <linux/mutex.h>
> > +
> > +#ifndef VFIO_PRIVATE_H
> > +#define VFIO_PRIVATE_H
> > +
> > +struct vfio_iommu {
> > +	struct iommu_domain		*domain;
> > +	struct bus_type			*bus;
> > +	struct mutex			dgate;
> > +	struct list_head		dm_list;
> > +	struct mm_struct		*mm;
> > +	struct list_head		group_list;
> > +	int				refcnt;
> > +	bool				cache;
> > +};
> > +
> > +extern int vfio_release_iommu(struct vfio_iommu *iommu);
> > +extern void vfio_iommu_unmapall(struct vfio_iommu *iommu);
> > +
> > +#endif /* VFIO_PRIVATE_H */
> > diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> > new file mode 100644
> > index 0000000..4269b08
> > --- /dev/null
> > +++ b/include/linux/vfio.h
> > @@ -0,0 +1,155 @@
> > +/*
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs@cisco.com
> > + *
> > + * This program is free software; you may redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; version 2 of the License.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > + * SOFTWARE.
> > + *
> > + * Portions derived from drivers/uio/uio.c:
> > + * Copyright(C) 2005, Benedikt Spranger <b.spranger@linutronix.de>
> > + * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
> > + * Copyright(C) 2006, Hans J. Koch <hjk@linutronix.de>
> > + * Copyright(C) 2006, Greg Kroah-Hartman <greg@kroah.com>
> > + *
> > + * Portions derived from drivers/uio/uio_pci_generic.c:
> > + * Copyright (C) 2009 Red Hat, Inc.
> > + * Author: Michael S. Tsirkin <mst@redhat.com>
> > + */
> > +#include <linux/types.h>
> > +
> > +#ifndef VFIO_H
> > +#define VFIO_H
> > +
> > +#ifdef __KERNEL__
> > +
> > +struct vfio_device_ops {
> > +	bool			(*match)(struct device *, char *);
> > +	int			(*get)(void *);
> > +	void			(*put)(void *);
> > +	ssize_t			(*read)(void *, char __user *,
> > +					size_t, loff_t *);
> > +	ssize_t			(*write)(void *, const char __user *,
> > +					 size_t, loff_t *);
> > +	long			(*ioctl)(void *, unsigned int, unsigned long);
> > +	int			(*mmap)(void *, struct vm_area_struct *);
> > +};
> > +
> > +extern int vfio_group_add_dev(struct device *device,
> > +			      const struct vfio_device_ops *ops);
> > +extern void vfio_group_del_dev(struct device *device);
> > +extern int vfio_bind_dev(struct device *device, void *device_data);
> > +extern void *vfio_unbind_dev(struct device *device);
> > +
> > +#endif /* __KERNEL__ */
> > +
> > +/*
> > + * VFIO driver - allow mapping and use of certain devices
> > + * in unprivileged user processes. (If IOMMU is present)
> > + * Especially useful for Virtual Function parts of SR-IOV devices
> > + */
> > +
> > +
> > +/* Kernel & User level defines for ioctls */
> > +
> > +#define VFIO_GROUP_GET_FLAGS		_IOR(';', 100, __u64)
> 
> > + #define VFIO_GROUP_FLAGS_VIABLE	(1 << 0)
> > + #define VFIO_GROUP_FLAGS_MM_LOCKED	(1 << 1)
> > +#define VFIO_GROUP_MERGE		_IOW(';', 101, int)
> > +#define VFIO_GROUP_UNMERGE		_IOW(';', 102, int)
> > +#define VFIO_GROUP_GET_IOMMU_FD		_IO(';', 103)
> > +#define VFIO_GROUP_GET_DEVICE_FD	_IOW(';', 104, char *)
> > +
> > +/*
> > + * Structure for DMA mapping of user buffers
> > + * vaddr, dmaaddr, and size must all be page aligned
> > + */
> > +struct vfio_dma_map {
> > +	__u64	len;		/* length of structure */
> > +	__u64	vaddr;		/* process virtual addr */
> > +	__u64	dmaaddr;	/* desired and/or returned dma address */
> > +	__u64	size;		/* size in bytes */
> > +	__u64	flags;
> > +#define	VFIO_DMA_MAP_FLAG_WRITE		(1 << 0) /* req writeable DMA mem */
> > +};
> > +
> > +#define	VFIO_IOMMU_GET_FLAGS		_IOR(';', 105, __u64)
> > + /* Does the IOMMU support mapping any IOVA to any virtual address? */
> > + #define VFIO_IOMMU_FLAGS_MAP_ANY	(1 << 0)
> > +#define	VFIO_IOMMU_MAP_DMA		_IOWR(';', 106, struct vfio_dma_map)
> > +#define	VFIO_IOMMU_UNMAP_DMA		_IOWR(';', 107, struct vfio_dma_map)
> > +
> > +#define VFIO_DEVICE_GET_FLAGS		_IOR(';', 108, __u64)
> > + #define VFIO_DEVICE_FLAGS_PCI		(1 << 0)
> > + #define VFIO_DEVICE_FLAGS_DT		(1 << 1)
> > + #define VFIO_DEVICE_FLAGS_RESET	(1 << 2)
> > +#define VFIO_DEVICE_GET_NUM_REGIONS	_IOR(';', 109, int)
> > +
> > +struct vfio_region_info {
> > +	__u32	len;		/* length of structure */
> > +	__u32	index;		/* region number */
> > +	__u64	size;		/* size in bytes of region */
> > +	__u64	offset;		/* start offset of region */
> > +	__u64	flags;
> > +#define VFIO_REGION_INFO_FLAG_MMAP		(1 << 0)
> > +#define VFIO_REGION_INFO_FLAG_RO		(1 << 1)
> > +#define VFIO_REGION_INFO_FLAG_PHYS_VALID	(1 << 2)
> > +	__u64	phys;		/* physical address of region */
> > +};
> > +
> > +#define VFIO_DEVICE_GET_REGION_INFO	_IOWR(';', 110, struct vfio_region_info)
> > +
> > +#define VFIO_DEVICE_GET_NUM_IRQS	_IOR(';', 111, int)
> > +
> > +struct vfio_irq_info {
> > +	__u32	len;		/* length of structure */
> > +	__u32	index;		/* IRQ number */
> > +	__u32	count;		/* number of individual IRQs */
> > +	__u32	flags;
> > +#define VFIO_IRQ_INFO_FLAG_LEVEL		(1 << 0)
> > +};
> > +
> > +#define VFIO_DEVICE_GET_IRQ_INFO	_IOWR(';', 112, struct vfio_irq_info)
> > +
> > +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
> > +#define VFIO_DEVICE_SET_IRQ_EVENTFDS	_IOW(';', 113, int)
> > +
> > +/* Unmask IRQ index, arg[0] = index */
> > +#define VFIO_DEVICE_UNMASK_IRQ		_IOW(';', 114, int)
> > +
> > +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
> > +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD	_IOW(';', 115, int)
> > +
> > +#define VFIO_DEVICE_RESET		_IO(';', 116)
> > +
> > +struct vfio_dtpath {
> > +	__u32	len;		/* length of structure */
> > +	__u32	index;
> > +	__u64	flags;
> > +#define VFIO_DTPATH_FLAGS_REGION	(1 << 0)
> > +#define VFIO_DTPATH_FLAGS_IRQ		(1 << 1)
> > +	char	*path;
> > +};
> > +#define VFIO_DEVICE_GET_DTPATH		_IOWR(';', 117, struct vfio_dtpath)
> > +
> > +struct vfio_dtindex {
> > +	__u32	len;		/* length of structure */
> > +	__u32	index;
> > +	__u32	prop_type;
> > +	__u32	prop_index;
> > +	__u64	flags;
> > +#define VFIO_DTINDEX_FLAGS_REGION	(1 << 0)
> > +#define VFIO_DTINDEX_FLAGS_IRQ		(1 << 1)
> > +};
> > +#define VFIO_DEVICE_GET_DTINDEX		_IOWR(';', 118, struct vfio_dtindex)
> > +
> > +#endif /* VFIO_H */
> 
> 
> So where is the vfio-pci? Is that a separate posting?

You can find it in the tree pointed to in the patch description:

https://github.com/awilliam/linux-vfio/commit/534725d327e2b7791a229ce72d2ae8a62ee0a4e5

I was hoping to get some consensus around the new core before spending
too much time polishing up the bus driver.  Thanks for the review, it's
very much appreciated!

Alex
Christian Benvenuti (benve) Nov. 11, 2011, 10:22 p.m. UTC | #11
> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Friday, November 11, 2011 10:04 AM
> To: Christian Benvenuti (benve)
> Cc: chrisw@sous-sol.org; aik@au1.ibm.com; pmac@au1.ibm.com;
> dwg@au1.ibm.com; joerg.roedel@amd.com; agraf@suse.de; Aaron Fabbri
> (aafabbri); B08248@freescale.com; B07421@freescale.com; avi@redhat.com;
> konrad.wilk@oracle.com; kvm@vger.kernel.org; qemu-devel@nongnu.org;
> iommu@lists.linux-foundation.org; linux-pci@vger.kernel.org
> Subject: RE: [RFC PATCH] vfio: VFIO Driver core framework
>
> On Wed, 2011-11-09 at 18:57 -0600, Christian Benvenuti (benve) wrote:
> > Here are few minor comments on vfio_iommu.c ...
>
> Sorry, I've been poking sticks at trying to figure out a clean way to
> solve the force vfio driver attach problem.

Attach or detach?

> > > diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
> > > new file mode 100644
> > > index 0000000..029dae3
> > > --- /dev/null
> > > +++ b/drivers/vfio/vfio_iommu.c
> <snip>
> > > +
> > > +#include "vfio_private.h"
> >
> > Doesn't the 'dma_'  prefix belong to the generic DMA code?
>
> Sure, we could make these more vfio-centric.

Like vfio_dma_map_page?

>
> > > +struct dma_map_page {
> > > +	struct list_head	list;
> > > +	dma_addr_t		daddr;
> > > +	unsigned long		vaddr;
> > > +	int			npage;
> > > +	int			rdwr;
> > > +};
> > > +
> > > +/*
> > > + * This code handles mapping and unmapping of user data buffers
> > > + * into DMA'ble space using the IOMMU
> > > + */
> > > +
> > > +#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
> > > +
> > > +struct vwork {
> > > +	struct mm_struct	*mm;
> > > +	int			npage;
> > > +	struct work_struct	work;
> > > +};
> > > +
> > > +/* delayed decrement for locked_vm */
> > > +static void vfio_lock_acct_bg(struct work_struct *work)
> > > +{
> > > +	struct vwork *vwork = container_of(work, struct vwork, work);
> > > +	struct mm_struct *mm;
> > > +
> > > +	mm = vwork->mm;
> > > +	down_write(&mm->mmap_sem);
> > > +	mm->locked_vm += vwork->npage;
> > > +	up_write(&mm->mmap_sem);
> > > +	mmput(mm);		/* unref mm */
> > > +	kfree(vwork);
> > > +}
> > > +
> > > +static void vfio_lock_acct(int npage)
> > > +{
> > > +	struct vwork *vwork;
> > > +	struct mm_struct *mm;
> > > +
> > > +	if (!current->mm) {
> > > +		/* process exited */
> > > +		return;
> > > +	}
> > > +	if (down_write_trylock(&current->mm->mmap_sem)) {
> > > +		current->mm->locked_vm += npage;
> > > +		up_write(&current->mm->mmap_sem);
> > > +		return;
> > > +	}
> > > +	/*
> > > +	 * Couldn't get mmap_sem lock, so must setup to decrement
> >                                                       ^^^^^^^^^
> >
> > Increment?
>
> Yep
>
> <snip>
> > > +int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t
> > > start,
> > > +			    size_t size, struct dma_map_page *mlp)
> > > +{
> > > +	struct dma_map_page *split;
> > > +	int npage_lo, npage_hi;
> > > +
> > > +	/* Existing dma region is completely covered, unmap all */
> >
> > This works. However, given how vfio_dma_map_dm implements the merging
> > logic, I think it is impossible to have
> >
> >     (start < mlp->daddr &&
> >      start + size > mlp->daddr + NPAGE_TO_SIZE(mlp->npage))
>
> It's quite possible.  This allows userspace to create a sparse mapping,
> then blow it all away with a single unmap from 0 to ~0.

I would prefer the user to use exact ranges in the unmap operations
because it would make it easier to detect bugs/leaks in the map/unmap
logic used by the callers.
My assumptions are that:

- the user always keeps track of the mappings

- the user either unmaps one specific mapping or 'all of them'.
  The 'all of them' case would also take care of those cases where
  the user does _not_ keep track of mappings and simply uses
  the "unmap from 0 to ~0" each time.

Because of this you could still provide an exact map/unmap logic
and allow such "unmap from 0 to ~0" by making the latter a special
case.
However, if we want to allow any arbitrary/inexact unmap request, then OK.

> > > +	if (start <= mlp->daddr &&
> > > +	    start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> > > +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> > > +		list_del(&mlp->list);
> > > +		npage_lo = mlp->npage;
> > > +		kfree(mlp);
> > > +		return npage_lo;
> > > +	}
> > > +
> > > +	/* Overlap low address of existing range */
> >
> > Same as above (ie, '<' is impossible)
>
> existing:   |<--- A --->|      |<--- B --->|
> unmap:                |<--- C --->|
>
> Maybe not good practice from userspace, but we shouldn't count on
> userspace to be well behaved.
>
> > > +	if (start <= mlp->daddr) {
> > > +		size_t overlap;
> > > +
> > > +		overlap = start + size - mlp->daddr;
> > > +		npage_lo = overlap >> PAGE_SHIFT;
> > > +		npage_hi = mlp->npage - npage_lo;
> > > +
> > > +		vfio_dma_unmap(iommu, mlp->daddr, npage_lo, mlp->rdwr);
> > > +		mlp->daddr += overlap;
> > > +		mlp->vaddr += overlap;
> > > +		mlp->npage -= npage_lo;
> > > +		return npage_lo;
> > > +	}
> >
> > Same as above (ie, '>' is impossible).
>
> Same example as above.
>
> > > +	/* Overlap high address of existing range */
> > > +	if (start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> > > +		size_t overlap;
> > > +
> > > +		overlap = mlp->daddr + NPAGE_TO_SIZE(mlp->npage) - start;
> > > +		npage_hi = overlap >> PAGE_SHIFT;
> > > +		npage_lo = mlp->npage - npage_hi;
> > > +
> > > +		vfio_dma_unmap(iommu, start, npage_hi, mlp->rdwr);
> > > +		mlp->npage -= npage_hi;
> > > +		return npage_hi;
> > > +	}
> <snip>
> > > +int vfio_dma_map_dm(struct vfio_iommu *iommu, struct vfio_dma_map
> > > *dmp)
> > > +{
> > > +	int npage;
> > > +	struct dma_map_page *mlp, *mmlp = NULL;
> > > +	dma_addr_t daddr = dmp->dmaaddr;
> > > +	unsigned long locked, lock_limit, vaddr = dmp->vaddr;
> > > +	size_t size = dmp->size;
> > > +	int ret = 0, rdwr = dmp->flags & VFIO_DMA_MAP_FLAG_WRITE;
> > > +
> > > +	if (vaddr & (PAGE_SIZE-1))
> > > +		return -EINVAL;
> > > +	if (daddr & (PAGE_SIZE-1))
> > > +		return -EINVAL;
> > > +	if (size & (PAGE_SIZE-1))
> > > +		return -EINVAL;
> > > +
> > > +	npage = size >> PAGE_SHIFT;
> > > +	if (!npage)
> > > +		return -EINVAL;
> > > +
> > > +	if (!iommu)
> > > +		return -EINVAL;
> > > +
> > > +	mutex_lock(&iommu->dgate);
> > > +
> > > +	if (vfio_find_dma(iommu, daddr, size)) {
> > > +		ret = -EBUSY;
> > > +		goto out_lock;
> > > +	}
> > > +
> > > +	/* account for locked pages */
> > > +	locked = current->mm->locked_vm + npage;
> > > +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > +	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> > > +		printk(KERN_WARNING "%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> > > +			__func__, rlimit(RLIMIT_MEMLOCK));
> > > +		ret = -ENOMEM;
> > > +		goto out_lock;
> > > +	}
> > > +
> > > +	ret = vfio_dma_map(iommu, daddr, vaddr, npage, rdwr);
> > > +	if (ret)
> > > +		goto out_lock;
> > > +
> > > +	/* Check if we abut a region below */
> >
> > Is !daddr possible?
>
> Sure, an IOVA of 0x0.  There's no region below if we start at zero.
>
> > > +	if (daddr) {
> > > +		mlp = vfio_find_dma(iommu, daddr - 1, 1);
> > > +		if (mlp && mlp->rdwr == rdwr &&
> > > +		    mlp->vaddr + NPAGE_TO_SIZE(mlp->npage) == vaddr) {
> > > +
> > > +			mlp->npage += npage;
> > > +			daddr = mlp->daddr;
> > > +			vaddr = mlp->vaddr;
> > > +			npage = mlp->npage;
> > > +			size = NPAGE_TO_SIZE(npage);
> > > +
> > > +			mmlp = mlp;
> > > +		}
> > > +	}
> >
> > Is !(daddr + size) possible?
>
> Same, there's no region above if this region goes to the top of the
> address space, ie. 0xffffffff_fffff000 + 0x1000
>
> Hmm, wonder if I'm missing a check for wrapping.
>
> > > +	if (daddr + size) {
> > > +		mlp = vfio_find_dma(iommu, daddr + size, 1);
> > > +		if (mlp && mlp->rdwr == rdwr && mlp->vaddr == vaddr + size)
> > > {
> > > +
> > > +			mlp->npage += npage;
> > > +			mlp->daddr = daddr;
> > > +			mlp->vaddr = vaddr;
> > > +
> > > +			/* If merged above and below, remove previously
> > > +			 * merged entry.  New entry covers it.  */
> > > +			if (mmlp) {
> > > +				list_del(&mmlp->list);
> > > +				kfree(mmlp);
> > > +			}
> > > +			mmlp = mlp;
> > > +		}
> > > +	}
> > > +
> > > +	if (!mmlp) {
> > > +		mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
> > > +		if (!mlp) {
> > > +			ret = -ENOMEM;
> > > +			vfio_dma_unmap(iommu, daddr, npage, rdwr);
> > > +			goto out_lock;
> > > +		}
> > > +
> > > +		mlp->npage = npage;
> > > +		mlp->daddr = daddr;
> > > +		mlp->vaddr = vaddr;
> > > +		mlp->rdwr = rdwr;
> > > +		list_add(&mlp->list, &iommu->dm_list);
> > > +	}
> > > +
> > > +out_lock:
> > > +	mutex_unlock(&iommu->dgate);
> > > +	return ret;
> > > +}
> > > +
> > > +static int vfio_iommu_release(struct inode *inode, struct file
> *filep)
> > > +{
> > > +	struct vfio_iommu *iommu = filep->private_data;
> > > +
> > > +	vfio_release_iommu(iommu);
> > > +	return 0;
> > > +}
> > > +
> > > +static long vfio_iommu_unl_ioctl(struct file *filep,
> > > +				 unsigned int cmd, unsigned long arg)
> > > +{
> > > +	struct vfio_iommu *iommu = filep->private_data;
> > > +	int ret = -ENOSYS;
> >
> > Any reason for not using "switch" ?
>
> It got ugly in vfio_main, so I decided to be consistent w/ it in the
> driver and use if/else here too.  I don't like the aesthetics of extra
> {}s to declare variables within a switch, nor do I like declaring all
> the variables for each case for the whole function.  Personal quirk.
>
> > > +        if (cmd == VFIO_IOMMU_GET_FLAGS) {
> > > +                u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;
> > > +
> > > +                ret = put_user(flags, (u64 __user *)arg);
> > > +
> > > +        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
> > > +		struct vfio_dma_map dm;
> > > +
> > > +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> > > +			return -EFAULT;
> >
> > What does the "_dm" suffix stand for?
>
> Inherited from Tom, but I figure _dma_map_dm = action(dma map),
> object(dm), which is a vfio_Dma_Map.

OK. The reason why I asked is that '_dm' does not add anything to 'vfio_dma_map'.

/Chris
Scott Wood Nov. 12, 2011, 12:14 a.m. UTC | #12
On 11/03/2011 03:12 PM, Alex Williamson wrote:
> +Many modern systems now provide DMA and interrupt remapping facilities
> +to help ensure I/O devices behave within the boundaries they've been
> +allotted.  This includes x86 hardware with AMD-Vi and Intel VT-d as
> +well as POWER systems with Partitionable Endpoints (PEs) and even
> +embedded powerpc systems (technology name unknown).  

Maybe replace "(technology name unknown)" with "(such as Freescale chips
with PAMU)" or similar?

Or just leave out the parenthetical.

> +As documented in linux/vfio.h, several ioctls are provided on the
> +group chardev:
> +
> +#define VFIO_GROUP_GET_FLAGS            _IOR(';', 100, __u64)
> + #define VFIO_GROUP_FLAGS_VIABLE        (1 << 0)
> + #define VFIO_GROUP_FLAGS_MM_LOCKED     (1 << 1)
> +#define VFIO_GROUP_MERGE                _IOW(';', 101, int)
> +#define VFIO_GROUP_UNMERGE              _IOW(';', 102, int)
> +#define VFIO_GROUP_GET_IOMMU_FD         _IO(';', 103)
> +#define VFIO_GROUP_GET_DEVICE_FD        _IOW(';', 104, char *)

This suggests the argument to VFIO_GROUP_GET_DEVICE_FD is a pointer to a
pointer to char rather than a pointer to an array of char (just as e.g.
VFIO_GROUP_MERGE takes a pointer to an int, not just an int).
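
In other words, I'd expect the userspace side to look roughly like this
(device name made up):

	/* hypothetical usage, given an already-open group fd */
	char name[] = "0000:06:0d.0";	/* some device within the group */
	int device_fd = ioctl(group_fd, VFIO_GROUP_GET_DEVICE_FD, name);

i.e. the argument is the string itself, not a pointer to a char pointer.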

> +The IOMMU file descriptor provides this set of ioctls:
> +
> +#define VFIO_IOMMU_GET_FLAGS            _IOR(';', 105, __u64)
> + #define VFIO_IOMMU_FLAGS_MAP_ANY       (1 << 0)
> +#define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
> +#define VFIO_IOMMU_UNMAP_DMA            _IOWR(';', 107, struct vfio_dma_map)

What is the implication if VFIO_IOMMU_FLAGS_MAP_ANY is clear?  Is such
an implementation supposed to add a new flag that describes its
restrictions?

Can we get a way to turn DMA access off and on, short of unmapping
everything, and then mapping it again?

> +The GET_FLAGS ioctl returns basic information about the IOMMU domain.
> +We currently only support IOMMU domains that are able to map any
> +virtual address to any IOVA.  This is indicated by the MAP_ANY flag.
> +
> +The (UN)MAP_DMA commands make use of struct vfio_dma_map for mapping
> +and unmapping IOVAs to process virtual addresses:
> +
> +struct vfio_dma_map {
> +        __u64   len;            /* length of structure */
> +        __u64   vaddr;          /* process virtual addr */
> +        __u64   dmaaddr;        /* desired and/or returned dma address */
> +        __u64   size;           /* size in bytes */
> +        __u64   flags;
> +#define VFIO_DMA_MAP_FLAG_WRITE         (1 << 0) /* req writeable DMA mem */
> +};

What are the semantics of "desired and/or returned dma address"?

Are we always supposed to provide a desired address, but it may be
different on return?  Or are there cases where we want to say "give me
whatever you want" or "give me this or fail"?

How much of this needs to be filled out for unmap?

Note that the "length of structure" approach means that ioctl numbers
will change whenever this grows -- perhaps we should avoid encoding the
struct size into these ioctls?

> +struct vfio_region_info {
> +        __u32   len;            /* length of structure */
> +        __u32   index;          /* region number */
> +        __u64   size;           /* size in bytes of region */
> +        __u64   offset;         /* start offset of region */
> +        __u64   flags;
> +#define VFIO_REGION_INFO_FLAG_MMAP              (1 << 0)
> +#define VFIO_REGION_INFO_FLAG_RO                (1 << 1)
> +#define VFIO_REGION_INFO_FLAG_PHYS_VALID        (1 << 2)
> +        __u64   phys;           /* physical address of region */
> +};
> +
> +#define VFIO_DEVICE_GET_REGION_INFO     _IOWR(';', 110, struct vfio_region_info)
> +
> +The offset indicates the offset into the device file descriptor which
> +accesses the given range (for read/write/mmap/seek).  Flags indicate the
> +available access types and validity of optional fields.  For instance
> +the phys field may only be valid for certain devices types.
> +
> +Interrupts are described using a similar interface.  GET_NUM_IRQS
> +reports the number of IRQ indexes for the device.
> +
> +#define VFIO_DEVICE_GET_NUM_IRQS        _IOR(';', 111, int)
> +
> +struct vfio_irq_info {
> +        __u32   len;            /* length of structure */
> +        __u32   index;          /* IRQ number */
> +        __u32   count;          /* number of individual IRQs */
> +        __u64   flags;
> +#define VFIO_IRQ_INFO_FLAG_LEVEL                (1 << 0)

Make sure flags is 64-bit aligned -- some 32-bit ABIs, such as x86, will
not do this, causing problems if the kernel is 64-bit and thus assumes a
different layout.
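
To make it concrete, with the layout as posted the compiler gives:

struct vfio_irq_info {
        __u32   len;
        __u32   index;
        __u32   count;
        __u64   flags;  /* offset 12 on i386 (4-byte u64 alignment), but
                         * offset 16 on x86_64, which inserts 4 bytes of
                         * padding after count */
};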

> +Information about each index can be retrieved using the GET_IRQ_INFO
> +ioctl, used much like GET_REGION_INFO.
> +
> +#define VFIO_DEVICE_GET_IRQ_INFO        _IOWR(';', 112, struct vfio_irq_info)
> +
> +Individual indexes can describe single or sets of IRQs.  This provides the
> +flexibility to describe PCI INTx, MSI, and MSI-X using a single interface.
> +
> +All VFIO interrupts are signaled to userspace via eventfds.  Integer arrays,
> +as shown below, are used to pass the IRQ info index, the number of eventfds,
> +and each eventfd to be signaled.  Using a count of 0 disables the interrupt.
> +
> +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
> +#define VFIO_DEVICE_SET_IRQ_EVENTFDS    _IOW(';', 113, int)
> +
> +When a level triggered interrupt is signaled, the interrupt is masked
> +on the host.  This prevents an unresponsive userspace driver from
> +continuing to interrupt the host system.

It's usually necessary even in the case of responsive userspace, just to
get to the point where userspace can execute (ignoring cases where
userspace runs on one core while the interrupt storms another).

For edge interrupts, will we mask if an interrupt comes in and the
previous interrupt hasn't been read out yet (and then unmask when the
last interrupt gets read out), to isolate us from a rapidly firing
interrupt source that userspace can't keep up with?

> +Device tree devices also include ioctls for further defining the
> +device tree properties of the device:
> +
> +struct vfio_dtpath {
> +        __u32   len;            /* length of structure */
> +        __u32   index;
> +        __u64   flags;
> +#define VFIO_DTPATH_FLAGS_REGION        (1 << 0)
> +#define VFIO_DTPATH_FLAGS_IRQ           (1 << 1)
> +        char    *path;
> +};
> +#define VFIO_DEVICE_GET_DTPATH          _IOWR(';', 117, struct vfio_dtpath)

Where is length of buffer (and description of associated semantics)?

> +struct vfio_device_ops {
> +	bool			(*match)(struct device *, char *);

const char *?

> +	int			(*get)(void *);
> +	void			(*put)(void *);
> +	ssize_t			(*read)(void *, char __user *,
> +					size_t, loff_t *);
> +	ssize_t			(*write)(void *, const char __user *,
> +					 size_t, loff_t *);
> +	long			(*ioctl)(void *, unsigned int, unsigned long);
> +	int			(*mmap)(void *, struct vm_area_struct *);
> +};

When defining an API, please do not omit parameter names.

Should specify what the driver is supposed to do with get/put -- I guess
not try to unbind when the count is nonzero?  Races could still lead the
unbinder to be blocked, but I guess it lets the driver know when it's
likely to succeed.

> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> new file mode 100644
> index 0000000..9acb1e7
> --- /dev/null
> +++ b/drivers/vfio/Kconfig
> @@ -0,0 +1,8 @@
> +menuconfig VFIO
> +	tristate "VFIO Non-Privileged userspace driver framework"
> +	depends on IOMMU_API
> +	help
> +	  VFIO provides a framework for secure userspace device drivers.
> +	  See Documentation/vfio.txt for more details.
> +
> +	  If you don't know what to do here, say N.

Can we limit the IOMMU_API dependency to the IOMMU parts of VFIO?  It
would still be useful for devices which don't do DMA, or where we accept
the lack of protection/translation (e.g. we have a customer that wants
to do KVM device assignment on one of our lower-end chips that lacks an
IOMMU).

> +struct dma_map_page {
> +	struct list_head	list;
> +	dma_addr_t		daddr;
> +	unsigned long		vaddr;
> +	int			npage;
> +	int			rdwr;
> +};

npage should be long.

What is "rdwr"?  non-zero for write?  non-zero for read? :-)
is_write would be a better name.

> +	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
> +		unsigned long pfn = 0;
> +
> +		ret = vaddr_get_pfn(vaddr, rdwr, &pfn);
> +		if (ret) {
> +			__vfio_dma_unmap(iommu, start, i, rdwr);
> +			return ret;
> +		}
> +
> +		/* Only add actual locked pages to accounting */
> +		if (!is_invalid_reserved_pfn(pfn))
> +			locked++;
> +
> +		ret = iommu_map(iommu->domain, iova,
> +				(phys_addr_t)pfn << PAGE_SHIFT, 0, prot);
> +		if (ret) {
> +			/* Back out mappings on error */
> +			put_pfn(pfn, rdwr);
> +			__vfio_dma_unmap(iommu, start, i, rdwr);
> +			return ret;
> +		}
> +	}

There's no way to hand this stuff to the IOMMU driver in chunks larger
than a page?  That's going to be a problem for our IOMMU, which wants to
deal with large windows.

> +	vfio_lock_acct(locked);
> +	return 0;
> +}
> +
> +static inline int ranges_overlap(unsigned long start1, size_t size1,
> +				 unsigned long start2, size_t size2)
> +{
> +	return !(start1 + size1 <= start2 || start2 + size2 <= start1);
> +}

You pass DMA addresses to this, so use dma_addr_t.  unsigned long is not
always large enough.

What if one of the ranges wraps around (including the legitimate
possibility of start + size == 0)?
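
One way to write it that tolerates start + size wrapping to 0 (untested
sketch):

static inline bool ranges_overlap(dma_addr_t start1, size_t size1,
				  dma_addr_t start2, size_t size2)
{
	if (!size1 || !size2)
		return false;
	/*
	 * An address x lies in [start, start + size) iff
	 * x - start < size in modular arithmetic, as long as the range
	 * itself doesn't wrap, so start + size is never formed.
	 */
	return (dma_addr_t)(start2 - start1) < size1 ||
	       (dma_addr_t)(start1 - start2) < size2;
}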

> +static long vfio_iommu_unl_ioctl(struct file *filep,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct vfio_iommu *iommu = filep->private_data;
> +	int ret = -ENOSYS;

-ENOIOCTLCMD or -ENOTTY?

> +
> +        if (cmd == VFIO_IOMMU_GET_FLAGS) {
> +                u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;
> +
> +                ret = put_user(flags, (u64 __user *)arg);
> +
> +        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
> +		struct vfio_dma_map dm;

Whitespace.

Any reason not to use a switch?

> +/* Return true if any devices within a group are opened */
> +static bool __vfio_group_devs_inuse(struct vfio_group *group)
[snip]
> +static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu)
[snip]
> +static bool __vfio_iommu_inuse(struct vfio_iommu *iommu)
[snip]
> +static void __vfio_group_set_iommu(struct vfio_group *group,
> +				   struct vfio_iommu *iommu)

...and so on.

Why all the leading underscores?  Doesn't look like you're trying to
distinguish between this and a more public version with the same name.

> +/* Get a new device file descriptor.  This will open the iommu, setting
> + * the current->mm ownership if it's not already set.  It's difficult to
> + * specify the requirements for matching a user supplied buffer to a
> + * device, so we use a vfio driver callback to test for a match.  For
> + * PCI, dev_name(dev) is unique, but other drivers may require including
> + * a parent device string. */
> +static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
> +{
> +	struct vfio_iommu *iommu = group->iommu;
> +	struct list_head *gpos;
> +	int ret = -ENODEV;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	if (!iommu->domain) {
> +		ret = __vfio_open_iommu(iommu);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	list_for_each(gpos, &iommu->group_list) {
> +		struct list_head *dpos;
> +
> +		group = list_entry(gpos, struct vfio_group, iommu_next);
> +
> +		list_for_each(dpos, &group->device_list) {
> +			struct vfio_device *device;
> +
> +			device = list_entry(dpos,
> +					    struct vfio_device, device_next);
> +
> +			if (device->ops->match(device->dev, buf)) {

If there's a match, we're done with the loop -- might as well break out
now rather than indent everything else.
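
e.g. with a 'found' pointer (sketch):

	struct vfio_device *found = NULL;

	list_for_each(gpos, &iommu->group_list) {
		struct list_head *dpos;

		group = list_entry(gpos, struct vfio_group, iommu_next);

		list_for_each(dpos, &group->device_list) {
			struct vfio_device *device;

			device = list_entry(dpos,
					    struct vfio_device, device_next);
			if (device->ops->match(device->dev, buf)) {
				found = device;
				goto match;
			}
		}
	}
match:
	if (!found)
		goto out;	/* ret is still -ENODEV */

	/* fd setup for 'found' continues here at the top indent level */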

> +				struct file *file;
> +
> +				if (device->ops->get(device->device_data)) {
> +					ret = -EFAULT;
> +					goto out;
> +				}

Why does a failure of get() result in -EFAULT?  -EFAULT is for bad user
addresses.

> +
> +				/* We can't use anon_inode_getfd(), like above
> +				 * because we need to modify the f_mode flags
> +				 * directly to allow more than just ioctls */
> +				ret = get_unused_fd();
> +				if (ret < 0) {
> +					device->ops->put(device->device_data);
> +					goto out;
> +				}
> +
> +				file = anon_inode_getfile("[vfio-device]",
> +							  &vfio_device_fops,
> +							  device, O_RDWR);
> +				if (IS_ERR(file)) {
> +					put_unused_fd(ret);
> +					ret = PTR_ERR(file);
> +					device->ops->put(device->device_data);
> +					goto out;
> +				}

Maybe cleaner with goto-based error management?
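
e.g. something along these lines (sketch, with an fd local added):

	fd = get_unused_fd();
	if (fd < 0) {
		ret = fd;
		goto err_put;
	}

	file = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				  device, O_RDWR);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto err_fd;
	}

	file->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
	fd_install(fd, file);
	device->refcnt++;
	ret = fd;
	goto out;

err_fd:
	put_unused_fd(fd);
err_put:
	device->ops->put(device->device_data);
	goto out;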

> +/* Add a new device to the vfio framework with associated vfio driver
> + * callbacks.  This is the entry point for vfio drivers to register devices. */
> +int vfio_group_add_dev(struct device *dev, const struct vfio_device_ops *ops)
> +{
> +	struct list_head *pos;
> +	struct vfio_group *group = NULL;
> +	struct vfio_device *device = NULL;
> +	unsigned int groupid;
> +	int ret = 0;
> +	bool new_group = false;
> +
> +	if (!ops)
> +		return -EINVAL;
> +
> +	if (iommu_device_group(dev, &groupid))
> +		return -ENODEV;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	list_for_each(pos, &vfio.group_list) {
> +		group = list_entry(pos, struct vfio_group, group_next);
> +		if (group->groupid == groupid)
> +			break;
> +		group = NULL;
> +	}

Factor this into vfio_dev_to_group() (and likewise for other such lookups)?
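
i.e. something like (sketch, caller holds vfio.lock):

static struct vfio_group *vfio_dev_to_group(struct device *dev)
{
	struct list_head *pos;
	unsigned int groupid;

	if (iommu_device_group(dev, &groupid))
		return NULL;

	list_for_each(pos, &vfio.group_list) {
		struct vfio_group *group;

		group = list_entry(pos, struct vfio_group, group_next);
		if (group->groupid == groupid)
			return group;
	}

	return NULL;
}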

> +	if (!group) {
> +		int minor;
> +
> +		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		group = kzalloc(sizeof(*group), GFP_KERNEL);
> +		if (!group) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		group->groupid = groupid;
> +		INIT_LIST_HEAD(&group->device_list);
> +
> +		ret = idr_get_new(&vfio.idr, group, &minor);
> +		if (ret == 0 && minor > MINORMASK) {
> +			idr_remove(&vfio.idr, minor);
> +			kfree(group);
> +			ret = -ENOSPC;
> +			goto out;
> +		}
> +
> +		group->devt = MKDEV(MAJOR(vfio.devt), minor);
> +		device_create(vfio.class, NULL, group->devt,
> +			      group, "%u", groupid);
> +
> +		group->bus = dev->bus;
> +		list_add(&group->group_next, &vfio.group_list);

Factor out into vfio_create_group()?

> +		new_group = true;
> +	} else {
> +		if (group->bus != dev->bus) {
> +			printk(KERN_WARNING
> +			       "Error: IOMMU group ID conflict.  Group ID %u "
> +				"on both bus %s and %s\n", groupid,
> +				group->bus->name, dev->bus->name);
> +			ret = -EFAULT;
> +			goto out;
> +		}

It took me a little while to figure out that this was comparing bus
types, not actual bus instances (which would be an inappropriate
restriction). :-P

Still, isn't it what we really care about that it's the same IOMMU
domain?  Couldn't different bus types share an iommu_ops?

And again, -EFAULT isn't the right error.

-Scott
Alex Williamson Nov. 14, 2011, 8:54 p.m. UTC | #13
On Fri, 2011-11-11 at 18:14 -0600, Scott Wood wrote:
> On 11/03/2011 03:12 PM, Alex Williamson wrote:
> > +Many modern systems now provide DMA and interrupt remapping facilities
> > +to help ensure I/O devices behave within the boundaries they've been
> > +allotted.  This includes x86 hardware with AMD-Vi and Intel VT-d as
> > +well as POWER systems with Partitionable Endpoints (PEs) and even
> > +embedded powerpc systems (technology name unknown).  
> 
> Maybe replace "(technology name unknown)" with "(such as Freescale chips
> with PAMU)" or similar?
> 
> Or just leave out the parenthetical.

I was hoping that comment would lead to an answer.  Thanks for the
info ;)

> > +As documented in linux/vfio.h, several ioctls are provided on the
> > +group chardev:
> > +
> > +#define VFIO_GROUP_GET_FLAGS            _IOR(';', 100, __u64)
> > + #define VFIO_GROUP_FLAGS_VIABLE        (1 << 0)
> > + #define VFIO_GROUP_FLAGS_MM_LOCKED     (1 << 1)
> > +#define VFIO_GROUP_MERGE                _IOW(';', 101, int)
> > +#define VFIO_GROUP_UNMERGE              _IOW(';', 102, int)
> > +#define VFIO_GROUP_GET_IOMMU_FD         _IO(';', 103)
> > +#define VFIO_GROUP_GET_DEVICE_FD        _IOW(';', 104, char *)
> 
> This suggests the argument to VFIO_GROUP_GET_DEVICE_FD is a pointer to a
> pointer to char rather than a pointer to an array of char (just as e.g.
> VFIO_GROUP_MERGE takes a pointer to an int, not just an int).

I believe I was following the UI_SET_PHYS ioctl as an example, which is
defined as a char *.  I'll change to char and verify.

> > +The IOMMU file descriptor provides this set of ioctls:
> > +
> > +#define VFIO_IOMMU_GET_FLAGS            _IOR(';', 105, __u64)
> > + #define VFIO_IOMMU_FLAGS_MAP_ANY       (1 << 0)
> > +#define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
> > +#define VFIO_IOMMU_UNMAP_DMA            _IOWR(';', 107, struct vfio_dma_map)
> 
> What is the implication if VFIO_IOMMU_FLAGS_MAP_ANY is clear?  Is such
> an implementation supposed to add a new flag that describes its
> restrictions?

If MAP_ANY is clear then I would expect a new flag is set defining a new
mapping paradigm, probably with an ioctl to describe the
restrictions/parameters.  MAP_ANY effectively means there are no
restrictions.

> Can we get a way to turn DMA access off and on, short of unmapping
> everything, and then mapping it again?

iommu_ops doesn't support such an interface, so no, not currently.

> > +The GET_FLAGS ioctl returns basic information about the IOMMU domain.
> > +We currently only support IOMMU domains that are able to map any
> > +virtual address to any IOVA.  This is indicated by the MAP_ANY flag.
> > +
> > +The (UN)MAP_DMA commands make use of struct vfio_dma_map for mapping
> > +and unmapping IOVAs to process virtual addresses:
> > +
> > +struct vfio_dma_map {
> > +        __u64   len;            /* length of structure */
> > +        __u64   vaddr;          /* process virtual addr */
> > +        __u64   dmaaddr;        /* desired and/or returned dma address */
> > +        __u64   size;           /* size in bytes */
> > +        __u64   flags;
> > +#define VFIO_DMA_MAP_FLAG_WRITE         (1 << 0) /* req writeable DMA mem */
> > +};
> 
> What are the semantics of "desired and/or returned dma address"?

I believe the original intention was that a user could leave dmaaddr
clear and let the iommu layer provide an iova address.  The iommu api
has since evolved and that mapping scheme really isn't present anymore.
We'll currently fail if we can't map the requested address.  I'll update
the docs to make that be the definition.

> Are we always supposed to provide a desired address, but it may be
> different on return?  Or are there cases where we want to say "give me
> whatever you want" or "give me this or fail"?

Exactly, that's what it used to be, but we don't really implement that
any more.

> How much of this needs to be filled out for unmap?

dmaaddr & size, will update docs.

> Note that the "length of structure" approach means that ioctl numbers
> will change whenever this grows -- perhaps we should avoid encoding the
> struct size into these ioctls?

How so?  What's described here is effectively the base size.  If we
later add feature foo requiring additional fields, we set a flag, change
the size, and tack those fields onto the end.  The kernel side should
balk if the size doesn't match what it expects from the flags it
understands (which I think I probably need to be more strict about).
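
e.g. purely hypothetically:

struct vfio_dma_map {
	__u64	len;		/* userspace sets this to sizeof its struct */
	__u64	vaddr;
	__u64	dmaaddr;
	__u64	size;
	__u64	flags;
#define VFIO_DMA_MAP_FLAG_WRITE	(1 << 0)
#define VFIO_DMA_MAP_FLAG_FOO	(1 << 1)	/* made-up new feature */
	__u64	foo_arg;	/* only present when FLAG_FOO is set; len
				 * must then cover this field as well */
};

Old userspace keeps passing the smaller len without FLAG_FOO and continues
to work unchanged.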

> > +struct vfio_region_info {
> > +        __u32   len;            /* length of structure */
> > +        __u32   index;          /* region number */
> > +        __u64   size;           /* size in bytes of region */
> > +        __u64   offset;         /* start offset of region */
> > +        __u64   flags;
> > +#define VFIO_REGION_INFO_FLAG_MMAP              (1 << 0)
> > +#define VFIO_REGION_INFO_FLAG_RO                (1 << 1)
> > +#define VFIO_REGION_INFO_FLAG_PHYS_VALID        (1 << 2)
> > +        __u64   phys;           /* physical address of region */
> > +};

In light of the above, this struct should not include phys.  In fact, I
should probably remove the PHYS_VALID flag as well until we have a bus
driver implementation that actually makes use of it.

> > +
> > +#define VFIO_DEVICE_GET_REGION_INFO     _IOWR(';', 110, struct vfio_region_info)
> > +
> > +The offset indicates the offset into the device file descriptor which
> > +accesses the given range (for read/write/mmap/seek).  Flags indicate the
> > +available access types and validity of optional fields.  For instance
> > +the phys field may only be valid for certain devices types.
> > +
> > +Interrupts are described using a similar interface.  GET_NUM_IRQS
> > +reports the number of IRQ indexes for the device.
> > +
> > +#define VFIO_DEVICE_GET_NUM_IRQS        _IOR(';', 111, int)
> > +
> > +struct vfio_irq_info {
> > +        __u32   len;            /* length of structure */
> > +        __u32   index;          /* IRQ number */
> > +        __u32   count;          /* number of individual IRQs */
> > +        __u64   flags;
> > +#define VFIO_IRQ_INFO_FLAG_LEVEL                (1 << 0)
> 
> Make sure flags is 64-bit aligned -- some 32-bit ABIs, such as x86, will
> not do this, causing problems if the kernel is 64-bit and thus assumes a
> different layout.

Shoot, I'll push flags up above count to get it aligned.
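
i.e. (sketch; the explicit pad is just to keep sizeof identical across
32 and 64-bit):

struct vfio_irq_info {
	__u32	len;		/* length of structure */
	__u32	index;		/* IRQ number */
	__u64	flags;		/* now naturally aligned at offset 8 */
#define VFIO_IRQ_INFO_FLAG_LEVEL	(1 << 0)
	__u32	count;		/* number of individual IRQs */
	__u32	pad;
};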

> > +Information about each index can be retrieved using the GET_IRQ_INFO
> > +ioctl, used much like GET_REGION_INFO.
> > +
> > +#define VFIO_DEVICE_GET_IRQ_INFO        _IOWR(';', 112, struct vfio_irq_info)
> > +
> > +Individual indexes can describe single or sets of IRQs.  This provides the
> > +flexibility to describe PCI INTx, MSI, and MSI-X using a single interface.
> > +
> > +All VFIO interrupts are signaled to userspace via eventfds.  Integer arrays,
> > +as shown below, are used to pass the IRQ info index, the number of eventfds,
> > +and each eventfd to be signaled.  Using a count of 0 disables the interrupt.
> > +
> > +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
> > +#define VFIO_DEVICE_SET_IRQ_EVENTFDS    _IOW(';', 113, int)
> > +
> > +When a level triggered interrupt is signaled, the interrupt is masked
> > +on the host.  This prevents an unresponsive userspace driver from
> > +continuing to interrupt the host system.
> 
> It's usually necessary even in the case of responsive userspace, just to
> get to the point where userspace can execute (ignoring cases where
> userspace runs on one core while the interrupt storms another).

Right, I'll try to clarify.

> For edge interrupts, will we mask if an interrupt comes in and the
> previous interrupt hasn't been read out yet (and then unmask when the
> last interrupt gets read out), to isolate us from a rapidly firing
> interrupt source that userspace can't keep up with?

We don't do that currently and I haven't seen a need to.  Seems like
there'd be no API change needed to do that if we want to at some point.

> > +Device tree devices also invlude ioctls for further defining the
> > +device tree properties of the device:
> > +
> > +struct vfio_dtpath {
> > +        __u32   len;            /* length of structure */
> > +        __u32   index;
> > +        __u64   flags;
> > +#define VFIO_DTPATH_FLAGS_REGION        (1 << 0)
> > +#define VFIO_DTPATH_FLAGS_IRQ           (1 << 1)
> > +        char    *path;
> > +};
> > +#define VFIO_DEVICE_GET_DTPATH          _IOWR(';', 117, struct vfio_dtpath)
> 
> Where is length of buffer (and description of associated semantics)?

I think I should probably take the same approach as with the phys field
above and leave it to the dt bus driver to add these ioctls and fields,
as I'm almost certain to get it wrong trying to predict what it's going to
need.  Likewise, VFIO_DEVICE_FLAGS_PCI should be defined as part of the
pci bus driver patch, even though it doesn't need any extra
ioctls/fields.

> > +struct vfio_device_ops {
> > +	bool			(*match)(struct device *, char *);
> 
> const char *?

will fix

> > +	int			(*get)(void *);
> > +	void			(*put)(void *);
> > +	ssize_t			(*read)(void *, char __user *,
> > +					size_t, loff_t *);
> > +	ssize_t			(*write)(void *, const char __user *,
> > +					 size_t, loff_t *);
> > +	long			(*ioctl)(void *, unsigned int, unsigned long);
> > +	int			(*mmap)(void *, struct vm_area_struct *);
> > +};
> 
> When defining an API, please do not omit parameter names.

ok

> Should specify what the driver is supposed to do with get/put -- I guess
> not try to unbind when the count is nonzero?  Races could still lead the
> unbinder to be blocked, but I guess it lets the driver know when it's
> likely to succeed.

Right, for the pci bus driver, it's mainly for reference counting,
including the module_get to prevent vfio-pci from being unloaded.  On
the first get for a device we also do a pci_enable(), and a pci_disable()
on the last put.  I'll try to clarify in the docs.
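
Roughly the shape of what's being described, as a sketch only (the
vfio_pci_device structure and its fields are hypothetical here, not the
actual vfio-pci code):

static int vfio_pci_get(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;
	int ret = 0;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	if (atomic_inc_return(&vdev->refcnt) == 1) {
		ret = pci_enable_device(vdev->pdev);	/* first reference */
		if (ret) {
			atomic_dec(&vdev->refcnt);
			module_put(THIS_MODULE);
		}
	}

	return ret;
}

static void vfio_pci_put(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;

	if (atomic_dec_and_test(&vdev->refcnt))
		pci_disable_device(vdev->pdev);		/* last reference */

	module_put(THIS_MODULE);
}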

> > diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> > new file mode 100644
> > index 0000000..9acb1e7
> > --- /dev/null
> > +++ b/drivers/vfio/Kconfig
> > @@ -0,0 +1,8 @@
> > +menuconfig VFIO
> > +	tristate "VFIO Non-Privileged userspace driver framework"
> > +	depends on IOMMU_API
> > +	help
> > +	  VFIO provides a framework for secure userspace device drivers.
> > +	  See Documentation/vfio.txt for more details.
> > +
> > +	  If you don't know what to do here, say N.
> 
> Can we limit the IOMMU_API dependency to the IOMMU parts of VFIO?  It
> would still be useful for devices which don't do DMA, or where we accept
> the lack of protection/translation (e.g. we have a customer that wants
> to do KVM device assignment on one of our lower-end chips that lacks an
> IOMMU).

Ugh.  I'm not really onboard with it given that we're trying to sell
vfio as a secure user space driver interface with iommu-based
protection.  That said, vfio_iommu.c is already its own file, with the
thought that other platforms might need to manage the iommu differently.
Theoretically the IOMMU_API requirement could be tied specifically to
vfio_iommu and another iommu backend added.

> > +struct dma_map_page {
> > +	struct list_head	list;
> > +	dma_addr_t		daddr;
> > +	unsigned long		vaddr;
> > +	int			npage;
> > +	int			rdwr;
> > +};
> 
> npage should be long.

Seems like I went back and forth on that a couple of times; I'll see if I
can remember why I landed on int, or just change it.  Practically, int is
"big enough", but that's not a good answer.

> What is "rdwr"?  non-zero for write?  non-zero for read? :-)
> is_write would be a better name.

Others commented on this too; I'll switch to a bool and rename it so it's
obvious that it means write access is enabled.

> 
> > +	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
> > +		unsigned long pfn = 0;
> > +
> > +		ret = vaddr_get_pfn(vaddr, rdwr, &pfn);
> > +		if (ret) {
> > +			__vfio_dma_unmap(iommu, start, i, rdwr);
> > +			return ret;
> > +		}
> > +
> > +		/* Only add actual locked pages to accounting */
> > +		if (!is_invalid_reserved_pfn(pfn))
> > +			locked++;
> > +
> > +		ret = iommu_map(iommu->domain, iova,
> > +				(phys_addr_t)pfn << PAGE_SHIFT, 0, prot);
> > +		if (ret) {
> > +			/* Back out mappings on error */
> > +			put_pfn(pfn, rdwr);
> > +			__vfio_dma_unmap(iommu, start, i, rdwr);
> > +			return ret;
> > +		}
> > +	}
> 
> There's no way to hand this stuff to the IOMMU driver in chunks larger
> than a page?  That's going to be a problem for our IOMMU, which wants to
> deal with large windows.

There is, this is just a simple implementation that maps individual
pages.  We "just" need to determine physically contiguous chunks and
mlock them instead of using get_user_pages.  The current implementation
is much like how KVM maps iommu pages, but there shouldn't be a user API
change to try to use larger chunks.  We want this for IOMMU large page
support too.

> > +	vfio_lock_acct(locked);
> > +	return 0;
> > +}
> > +
> > +static inline int ranges_overlap(unsigned long start1, size_t size1,
> > +				 unsigned long start2, size_t size2)
> > +{
> > +	return !(start1 + size1 <= start2 || start2 + size2 <= start1);
> > +}
> 
> You pass DMA addresses to this, so use dma_addr_t.  unsigned long is not
> always large enough.

ok

> What if one of the ranges wraps around (including the legitimate
> possibility of start + size == 0)?

Looks like a bug.
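
One way to close that hole, as a sketch: compare inclusive range ends, so
that start + size wrapping to exactly 0 is still handled (ranges that
truly wrap around the address space would still need to be rejected by
the caller):

static inline int ranges_overlap(dma_addr_t start1, size_t size1,
				 dma_addr_t start2, size_t size2)
{
	if (!size1 || !size2)
		return 0;

	return start1 <= start2 + (size2 - 1) &&
	       start2 <= start1 + (size1 - 1);
}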

> > +static long vfio_iommu_unl_ioctl(struct file *filep,
> > +				 unsigned int cmd, unsigned long arg)
> > +{
> > +	struct vfio_iommu *iommu = filep->private_data;
> > +	int ret = -ENOSYS;
> 
> -ENOIOCTLCMD or -ENOTTY?

ok

> > +
> > +        if (cmd == VFIO_IOMMU_GET_FLAGS) {
> > +                u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;
> > +
> > +                ret = put_user(flags, (u64 __user *)arg);
> > +
> > +        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
> > +		struct vfio_dma_map dm;
> 
> Whitespace.

yep, will fix

> Any reason not to use a switch?

Personal preference.  It got ugly using a switch in vfio_main while
trying to keep variable scope local to each case, so I followed suit here
for consistency.

> > +/* Return true if any devices within a group are opened */
> > +static bool __vfio_group_devs_inuse(struct vfio_group *group)
> [snip]
> > +static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu)
> [snip]
> > +static bool __vfio_iommu_inuse(struct vfio_iommu *iommu)
> [snip]
> > +static void __vfio_group_set_iommu(struct vfio_group *group,
> > +				   struct vfio_iommu *iommu)
> 
> ...and so on.
> 
> Why all the leading underscores?  Doesn't look like you're trying to
> distinguish between this and a more public version with the same name.

__ implies it should be called under vfio.lock.
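
In other words, the intended calling convention is roughly (sketch; the
unlocked wrapper below is hypothetical and not part of the patch):

static bool vfio_iommu_inuse(struct vfio_iommu *iommu)
{
	bool inuse;

	mutex_lock(&vfio.lock);
	inuse = __vfio_iommu_inuse(iommu);
	mutex_unlock(&vfio.lock);

	return inuse;
}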

> > +/* Get a new device file descriptor.  This will open the iommu, setting
> > + * the current->mm ownership if it's not already set.  It's difficult to
> > + * specify the requirements for matching a user supplied buffer to a
> > + * device, so we use a vfio driver callback to test for a match.  For
> > + * PCI, dev_name(dev) is unique, but other drivers may require including
> > + * a parent device string. */
> > +static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
> > +{
> > +	struct vfio_iommu *iommu = group->iommu;
> > +	struct list_head *gpos;
> > +	int ret = -ENODEV;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	if (!iommu->domain) {
> > +		ret = __vfio_open_iommu(iommu);
> > +		if (ret)
> > +			goto out;
> > +	}
> > +
> > +	list_for_each(gpos, &iommu->group_list) {
> > +		struct list_head *dpos;
> > +
> > +		group = list_entry(gpos, struct vfio_group, iommu_next);
> > +
> > +		list_for_each(dpos, &group->device_list) {
> > +			struct vfio_device *device;
> > +
> > +			device = list_entry(dpos,
> > +					    struct vfio_device, device_next);
> > +
> > +			if (device->ops->match(device->dev, buf)) {
> 
> If there's a match, we're done with the loop -- might as well break out
> now rather than indent everything else.

Sure, even just changing the polarity and making this a continue would
help the formatting below.
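
For example, the inner loop could be reshuffled roughly like this (sketch
only; the fd setup after the match stays as-is, just one indent level
shallower):

		list_for_each(dpos, &group->device_list) {
			struct vfio_device *device;

			device = list_entry(dpos,
					    struct vfio_device, device_next);

			if (!device->ops->match(device->dev, buf))
				continue;

			ret = device->ops->get(device->device_data);
			if (ret)
				goto out;

			/* get_unused_fd()/anon_inode_getfile() setup
			 * continues here as before */
		}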

> > +				struct file *file;
> > +
> > +				if (device->ops->get(device->device_data)) {
> > +					ret = -EFAULT;
> > +					goto out;
> > +				}
> 
> Why does a failure of get() result in -EFAULT?  -EFAULT is for bad user
> addresses.

I'll just return what get() returns.

> > +
> > +				/* We can't use anon_inode_getfd(), like above
> > +				 * because we need to modify the f_mode flags
> > +				 * directly to allow more than just ioctls */
> > +				ret = get_unused_fd();
> > +				if (ret < 0) {
> > +					device->ops->put(device->device_data);
> > +					goto out;
> > +				}
> > +
> > +				file = anon_inode_getfile("[vfio-device]",
> > +							  &vfio_device_fops,
> > +							  device, O_RDWR);
> > +				if (IS_ERR(file)) {
> > +					put_unused_fd(ret);
> > +					ret = PTR_ERR(file);
> > +					device->ops->put(device->device_data);
> > +					goto out;
> > +				}
> 
> Maybe cleaner with goto-based error management?

I didn't see enough duplication creeping in to try that here.

> > +/* Add a new device to the vfio framework with associated vfio driver
> > + * callbacks.  This is the entry point for vfio drivers to register devices. */
> > +int vfio_group_add_dev(struct device *dev, const struct vfio_device_ops *ops)
> > +{
> > +	struct list_head *pos;
> > +	struct vfio_group *group = NULL;
> > +	struct vfio_device *device = NULL;
> > +	unsigned int groupid;
> > +	int ret = 0;
> > +	bool new_group = false;
> > +
> > +	if (!ops)
> > +		return -EINVAL;
> > +
> > +	if (iommu_device_group(dev, &groupid))
> > +		return -ENODEV;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	list_for_each(pos, &vfio.group_list) {
> > +		group = list_entry(pos, struct vfio_group, group_next);
> > +		if (group->groupid == groupid)
> > +			break;
> > +		group = NULL;
> > +	}
> 
> Factor this into vfio_dev_to_group() (and likewise for other such lookups)?

Yeah, this ends up getting duplicated in a few places.
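
A sketch of what such a helper might look like, reusing the lookup already
open-coded above (assumes the caller holds vfio.lock, hence the __ prefix):

static struct vfio_group *__vfio_dev_to_group(struct device *dev)
{
	struct list_head *pos;
	unsigned int groupid;

	if (iommu_device_group(dev, &groupid))
		return NULL;

	list_for_each(pos, &vfio.group_list) {
		struct vfio_group *group;

		group = list_entry(pos, struct vfio_group, group_next);
		if (group->groupid == groupid)
			return group;
	}

	return NULL;
}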

> > +	if (!group) {
> > +		int minor;
> > +
> > +		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +
> > +		group = kzalloc(sizeof(*group), GFP_KERNEL);
> > +		if (!group) {
> > +			ret = -ENOMEM;
> > +			goto out;
> > +		}
> > +
> > +		group->groupid = groupid;
> > +		INIT_LIST_HEAD(&group->device_list);
> > +
> > +		ret = idr_get_new(&vfio.idr, group, &minor);
> > +		if (ret == 0 && minor > MINORMASK) {
> > +			idr_remove(&vfio.idr, minor);
> > +			kfree(group);
> > +			ret = -ENOSPC;
> > +			goto out;
> > +		}
> > +
> > +		group->devt = MKDEV(MAJOR(vfio.devt), minor);
> > +		device_create(vfio.class, NULL, group->devt,
> > +			      group, "%u", groupid);
> > +
> > +		group->bus = dev->bus;
> > +		list_add(&group->group_next, &vfio.group_list);
> 
> Factor out into vfio_create_group()?

sounds good

> > +		new_group = true;
> > +	} else {
> > +		if (group->bus != dev->bus) {
> > +			printk(KERN_WARNING
> > +			       "Error: IOMMU group ID conflict.  Group ID %u "
> > +				"on both bus %s and %s\n", groupid,
> > +				group->bus->name, dev->bus->name);
> > +			ret = -EFAULT;
> > +			goto out;
> > +		}
> 
> It took me a little while to figure out that this was comparing bus
> types, not actual bus instances (which would be an inappropriate
> restriction). :-P
> 
> Still, isn't it what we really care about that it's the same IOMMU
> domain?  Couldn't different bus types share an iommu_ops?

Nope, iommu_ops registration is now per bus_type.  Also, Christian
pointed out that groupid is really only guaranteed to be unique per
bus_type, so I've been updating groupid comparisons to compare the
(groupid, bus_type) pair.

> And again, -EFAULT isn't the right error.

Ok.

Thank you very much for the comments,

Alex
Alex Williamson Nov. 14, 2011, 9:46 p.m. UTC | #14
On Mon, 2011-11-14 at 13:54 -0700, Alex Williamson wrote:
> On Fri, 2011-11-11 at 18:14 -0600, Scott Wood wrote:
> > On 11/03/2011 03:12 PM, Alex Williamson wrote: 
> > > +	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
> > > +		unsigned long pfn = 0;
> > > +
> > > +		ret = vaddr_get_pfn(vaddr, rdwr, &pfn);
> > > +		if (ret) {
> > > +			__vfio_dma_unmap(iommu, start, i, rdwr);
> > > +			return ret;
> > > +		}
> > > +
> > > +		/* Only add actual locked pages to accounting */
> > > +		if (!is_invalid_reserved_pfn(pfn))
> > > +			locked++;
> > > +
> > > +		ret = iommu_map(iommu->domain, iova,
> > > +				(phys_addr_t)pfn << PAGE_SHIFT, 0, prot);
> > > +		if (ret) {
> > > +			/* Back out mappings on error */
> > > +			put_pfn(pfn, rdwr);
> > > +			__vfio_dma_unmap(iommu, start, i, rdwr);
> > > +			return ret;
> > > +		}
> > > +	}
> > 
> > There's no way to hand this stuff to the IOMMU driver in chunks larger
> > than a page?  That's going to be a problem for our IOMMU, which wants to
> > deal with large windows.
> 
> There is, this is just a simple implementation that maps individual
> pages.  We "just" need to determine physically contiguous chunks and
> mlock them instead of using get_user_pages.  The current implementation
> is much like how KVM maps iommu pages, but there shouldn't be a user API
> change to try to use larger chunks.  We want this for IOMMU large page
> support too.

Also, at one point intel-iommu didn't allow sub-ranges to be unmapped;
an unmap of a single page would unmap the entire original mapping that
contained that page.  That made it easier to map each page individually
for the flexibility it provided on unmap.  I need to see if we still
have that restriction.  Thanks,

Alex
Scott Wood Nov. 14, 2011, 10:26 p.m. UTC | #15
On 11/14/2011 02:54 PM, Alex Williamson wrote:
> On Fri, 2011-11-11 at 18:14 -0600, Scott Wood wrote:
>> What are the semantics of "desired and/or returned dma address"?
> 
> I believe the original intention was that a user could leave dmaaddr
> clear and let the iommu layer provide an iova address.  The iommu api
> has since evolved and that mapping scheme really isn't present anymore.
> We'll currently fail if we can map the requested address.  I'll update
> the docs to make that be the definition.

OK... if there is any desire in the future to have the kernel pick an
address (which could be useful for IOMMUs that don't set
VFIO_IOMMU_FLAGS_MAP_ANY), there should be an explicit flag for this,
since zero could be a valid address to request (doesn't mean "clear").

>> Note that the "length of structure" approach means that ioctl numbers
>> will change whenever this grows -- perhaps we should avoid encoding the
>> struct size into these ioctls?
> 
> How so?  What's described here is effectively the base size.  If we
> later add feature foo requiring additional fields, we set a flag, change
> the size, and tack those fields onto the end.  The kernel side should
> balk if the size doesn't match what it expects from the flags it
> understands (which I think I probably need to be more strict about).

The size of the struct is encoded into the ioctl number via the _IOWR()
macro.  If we want the struct to be growable in the future, we should
leave that out and just use _IO().  Otherwise if the size of the struct
changes, the ioctl number changes.  This is annoying for old userspace
plus new kernel (have to add compat entries to the switch), and broken
for old kernel plus new userspace.
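
To make that concrete: _IOWR() folds sizeof() of its argument type into
the resulting constant, so an otherwise identical definition changes value
when the struct grows (struct and macro names below are purely
illustrative):

struct dma_map_v1 { __u64 len, vaddr, dmaaddr, size, flags; };
struct dma_map_v2 { __u64 len, vaddr, dmaaddr, size, flags, extra; };

/* Different ioctl numbers, because sizeof() differs: */
#define MAP_DMA_V1	_IOWR(';', 106, struct dma_map_v1)
#define MAP_DMA_V2	_IOWR(';', 106, struct dma_map_v2)

/* Size-agnostic alternative; the struct can grow without renumbering: */
#define MAP_DMA		_IO(';', 106)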

>> Can we limit the IOMMU_API dependency to the IOMMU parts of VFIO?  It
>> would still be useful for devices which don't do DMA, or where we accept
>> the lack of protection/translation (e.g. we have a customer that wants
>> to do KVM device assignment on one of our lower-end chips that lacks an
>> IOMMU).
> 
> Ugh.  I'm not really onboard with it given that we're trying to sell
> vfio as a secure user space driver interface with iommu-based
> protection.

That's its main use case, but it doesn't make much sense to duplicate
the non-iommu-related bits for other use cases.

This applies at runtime too: some devices don't do DMA at all (and thus
may not be part of an IOMMU group, even if there is an IOMMU present for
other devices -- could be considered a standalone group of one device,
with a null IOMMU backend).  Support for such devices can wait, but it's
good to keep the possibility in mind.

-Scott
Alexander Graf Nov. 14, 2011, 10:48 p.m. UTC | #16
On 14.11.2011 at 23:26, Scott Wood <scottwood@freescale.com> wrote:

> On 11/14/2011 02:54 PM, Alex Williamson wrote:
>> On Fri, 2011-11-11 at 18:14 -0600, Scott Wood wrote:
>>> What are the semantics of "desired and/or returned dma address"?
>> 
>> I believe the original intention was that a user could leave dmaaddr
>> clear and let the iommu layer provide an iova address.  The iommu api
>> has since evolved and that mapping scheme really isn't present anymore.
>> We'll currently fail if we can map the requested address.  I'll update
>> the docs to make that be the definition.
> 
> OK... if there is any desire in the future to have the kernel pick an
> address (which could be useful for IOMMUs that don't set
> VFIO_IOMMU_FLAGS_MAP_ANY), there should be an explicit flag for this,
> since zero could be a valid address to request (doesn't mean "clear").
> 
>>> Note that the "length of structure" approach means that ioctl numbers
>>> will change whenever this grows -- perhaps we should avoid encoding the
>>> struct size into these ioctls?
>> 
>> How so?  What's described here is effectively the base size.  If we
>> later add feature foo requiring additional fields, we set a flag, change
>> the size, and tack those fields onto the end.  The kernel side should
>> balk if the size doesn't match what it expects from the flags it
>> understands (which I think I probably need to be more strict about).
> 
> The size of the struct is encoded into the ioctl number via the _IOWR()
> macro.  If we want the struct to be growable in the future, we should
> leave that out and just use _IO().  Otherwise if the size of the struct
> changes, the ioctl number changes.  This is annoying for old userspace
> plus new kernel (have to add compat entries to the switch), and broken
> for old kernel plus new userspace.

Avi wanted to write up a patch for this to allow ioctls with arbitrary size, for exactly this purpose.

> 
>>> Can we limit the IOMMU_API dependency to the IOMMU parts of VFIO?  It
>>> would still be useful for devices which don't do DMA, or where we accept
>>> the lack of protection/translation (e.g. we have a customer that wants
>>> to do KVM device assignment on one of our lower-end chips that lacks an
>>> IOMMU).
>> 
>> Ugh.  I'm not really onboard with it given that we're trying to sell
>> vfio as a secure user space driver interface with iommu-based
>> protection.
> 
> That's its main use case, but it doesn't make much sense to duplicate
> the non-iommu-related bits for other use cases.
> 
> This applies at runtime too, some devices don't do DMA at all (and thus
> may not be part of an IOMMU group, even if there is an IOMMU present for
> other devices -- could be considered a standalone group of one device,
> with a null IOMMU backend).  Support for such devices can wait, but it's
> good to keep the possibility in mind.

I agree. Potentially backing a device with a nop iommu also makes testing easier.

Alex

>
Alex Williamson Nov. 14, 2011, 10:59 p.m. UTC | #17
On Fri, 2011-11-11 at 16:22 -0600, Christian Benvenuti (benve) wrote:
> > -----Original Message-----
> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Friday, November 11, 2011 10:04 AM
> > To: Christian Benvenuti (benve)
> > Cc: chrisw@sous-sol.org; aik@au1.ibm.com; pmac@au1.ibm.com;
> > dwg@au1.ibm.com; joerg.roedel@amd.com; agraf@suse.de; Aaron Fabbri
> > (aafabbri); B08248@freescale.com; B07421@freescale.com; avi@redhat.com;
> > konrad.wilk@oracle.com; kvm@vger.kernel.org; qemu-devel@nongnu.org;
> > iommu@lists.linux-foundation.org; linux-pci@vger.kernel.org
> > Subject: RE: [RFC PATCH] vfio: VFIO Driver core framework
> > 
> > On Wed, 2011-11-09 at 18:57 -0600, Christian Benvenuti (benve) wrote:
> > > Here are few minor comments on vfio_iommu.c ...
> > 
> > Sorry, I've been poking sticks at trying to figure out a clean way to
> > solve the force vfio driver attach problem.
> 
> Attach or detach?

Attach.  For the case when a new device appears that belongs to a group
that's already in use.  I'll probably add a claim() operation to the
vfio_device_ops that tells the driver to grab it.  I was hoping that for
pci this would just mean adding it to the dynamic ids, but that hits
device lock problems.

> > > > diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
> > > > new file mode 100644
> > > > index 0000000..029dae3
> > > > --- /dev/null
> > > > +++ b/drivers/vfio/vfio_iommu.c
> > <snip>
> > > > +
> > > > +#include "vfio_private.h"
> > >
> > > Doesn't the 'dma_'  prefix belong to the generic DMA code?
> > 
> > Sure, we could make these more vfio-centric.
> 
> Like vfio_dma_map_page?

Something like that, though _page doesn't seem appropriate as it tracks
a region.

> > 
> > > > +struct dma_map_page {
> > > > +	struct list_head	list;
> > > > +	dma_addr_t		daddr;
> > > > +	unsigned long		vaddr;
> > > > +	int			npage;
> > > > +	int			rdwr;
> > > > +};
> > > > +
> > > > +/*
> > > > + * This code handles mapping and unmapping of user data buffers
> > > > + * into DMA'ble space using the IOMMU
> > > > + */
> > > > +
> > > > +#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
> > > > +
> > > > +struct vwork {
> > > > +	struct mm_struct	*mm;
> > > > +	int			npage;
> > > > +	struct work_struct	work;
> > > > +};
> > > > +
> > > > +/* delayed decrement for locked_vm */
> > > > +static void vfio_lock_acct_bg(struct work_struct *work)
> > > > +{
> > > > +	struct vwork *vwork = container_of(work, struct vwork, work);
> > > > +	struct mm_struct *mm;
> > > > +
> > > > +	mm = vwork->mm;
> > > > +	down_write(&mm->mmap_sem);
> > > > +	mm->locked_vm += vwork->npage;
> > > > +	up_write(&mm->mmap_sem);
> > > > +	mmput(mm);		/* unref mm */
> > > > +	kfree(vwork);
> > > > +}
> > > > +
> > > > +static void vfio_lock_acct(int npage)
> > > > +{
> > > > +	struct vwork *vwork;
> > > > +	struct mm_struct *mm;
> > > > +
> > > > +	if (!current->mm) {
> > > > +		/* process exited */
> > > > +		return;
> > > > +	}
> > > > +	if (down_write_trylock(&current->mm->mmap_sem)) {
> > > > +		current->mm->locked_vm += npage;
> > > > +		up_write(&current->mm->mmap_sem);
> > > > +		return;
> > > > +	}
> > > > +	/*
> > > > +	 * Couldn't get mmap_sem lock, so must setup to decrement
> > >                                                       ^^^^^^^^^
> > >
> > > Increment?
> > 
> > Yep

Actually, side note, this is increment/decrement depending on the sign
of the parameter.  So "update" may be more appropriate.  I think Tom
originally used increment in one place and decrement in another to show
its dual use.

> > <snip>
> > > > +int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t
> > > > start,
> > > > +			    size_t size, struct dma_map_page *mlp)
> > > > +{
> > > > +	struct dma_map_page *split;
> > > > +	int npage_lo, npage_hi;
> > > > +
> > > > +	/* Existing dma region is completely covered, unmap all */
> > >
> > > This works. However, given how vfio_dma_map_dm implements the merging
> > > logic, I think it is impossible to have
> > >
> > >     (start < mlp->daddr &&
> > >      start + size > mlp->daddr + NPAGE_TO_SIZE(mlp->npage))
> > 
> > It's quite possible.  This allows userspace to create a sparse mapping,
> > then blow it all away with a single unmap from 0 to ~0.
> 
> I would prefer the user to use exact ranges in the unmap operations
> because it would make it easier to detect bugs/leaks in the map/unmap
> logic used by the callers.
> My assumptions are that:
> 
> - the user always keeps track of the mappings

My qemu code plays a little on the loose side here, acting as a
passthrough for the internal memory client.  But even there, worst case
would probably be trying to unmap a non-existent entry, not unmapping a
sparse range.

> - the user either unmaps one specific mapping or 'all of them'.
>   The 'all of them' case would also take care of those cases where
>   the user does _not_ keep track of mappings and simply uses
>   the "unmap from 0 to ~0" each time.
> 
> Because of this you could still provide an exact map/unmap logic
> and allow such "unmap from 0 to ~0" by making the latter a special
> case.
> However, if we want to allow any arbitrary/inexact unmap request, then OK.

I can't think of any good reasons we shouldn't be more strict.  I think
it was primarily just convenient to hit all the corner cases since we
merge all the requests together for tracking and need to be able to
split them back apart.  It does feel a little awkward to have a 0/~0
special case though, but I don't think it's worth adding another ioctl
to handle it.

<snip>
> > > > +        if (cmd == VFIO_IOMMU_GET_FLAGS) {
> > > > +                u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;
> > > > +
> > > > +                ret = put_user(flags, (u64 __user *)arg);
> > > > +
> > > > +        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
> > > > +		struct vfio_dma_map dm;
> > > > +
> > > > +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> > > > +			return -EFAULT;
> > >
> > > What does the "_dm" suffix stand for?
> > 
> > Inherited from Tom, but I figure _dma_map_dm = action(dma map),
> > object(dm), which is a vfio_Dma_Map.
> 
> OK. The reason why I asked is that '_dm' does not add anything to 'vfio_dma_map'.

Yep.  Thanks,

Alex
David Gibson Nov. 15, 2011, midnight UTC | #18
On Fri, Nov 11, 2011 at 03:10:56PM -0700, Alex Williamson wrote:
> Thanks Konrad!  Comments inline.
> On Fri, 2011-11-11 at 12:51 -0500, Konrad Rzeszutek Wilk wrote:
> > On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
[snip]
> > > +The GET_NUM_REGIONS ioctl tells us how many regions the device supports:
> > > +
> > > +#define VFIO_DEVICE_GET_NUM_REGIONS     _IOR(';', 109, int)
> > 
> > Don't want __u32?
> 
> It could be; not sure if it buys us anything, and it might even restrict us.
> We likely don't need 2^32 regions (famous last words?), so we could
> later define <0 to mean something?

As a rule, it's best to use explicit fixed width types for all ioctl()
arguments, to avoid compat hell for 32-bit userland on 64-bit kernel
setups.
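
E.g., echoing Konrad's suggestion above, the count ioctls could take an
explicitly sized argument rather than a bare int:

#define VFIO_DEVICE_GET_NUM_REGIONS	_IOR(';', 109, __u32)
#define VFIO_DEVICE_GET_NUM_IRQS	_IOR(';', 111, __u32)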

[snip]
> > > +Again, zero count entries are allowed (vfio-pci uses a static interrupt
> > > +type to index mapping).
> > 
> > I am not really sure what that means.
> 
> This is so PCI can expose:
> 
> enum {
>         VFIO_PCI_INTX_IRQ_INDEX,
>         VFIO_PCI_MSI_IRQ_INDEX,
>         VFIO_PCI_MSIX_IRQ_INDEX,
>         VFIO_PCI_NUM_IRQS
> };
> 
> So like regions it always exposes 3 IRQ indexes where count=0 if the
> device doesn't actually support that type of interrupt.  I just want to
> spell out that bus drivers have this kind of flexibility.

I knew what you were aiming for, so I could see what you meant here,
but I don't think the doco is very clearly expressed at all.
David Gibson Nov. 15, 2011, 12:05 a.m. UTC | #19
On Mon, Nov 14, 2011 at 03:59:00PM -0700, Alex Williamson wrote:
> On Fri, 2011-11-11 at 16:22 -0600, Christian Benvenuti (benve) wrote:
[snip]

> > - the user either unmaps one specific mapping or 'all of them'.
> >   The 'all of them' case would also take care of those cases where
> >   the user does _not_ keep track of mappings and simply uses
> >   the "unmap from 0 to ~0" each time.
> > 
> > Because of this you could still provide an exact map/unmap logic
> > and allow such "unmap from 0 to ~0" by making the latter a special
> > case.
> > However, if we want to allow any arbitrary/inexact unmap request, then OK.
> 
> I can't think of any good reasons we shouldn't be more strict.  I think
> it was primarily just convenient to hit all the corner cases since we
> merge all the requests together for tracking and need to be able to
> split them back apart.  It does feel a little awkward to have a 0/~0
> special case though, but I don't think it's worth adding another ioctl
> to handle it.

Being strict, or at least enforcing strictness, requires that the
infrastructure track all the maps, so that the unmaps can be
matching.  This is not a natural thing with the data structures you
want for all IOMMUs.  For example on POWER, the IOMMU (aka TCE table)
is a simple 1-level pagetable.  One pointer with a couple of
permission bits per IOMMU page.  Handling oddly overlapping operations
on that data structure is natural, enforcing strict matching of maps
and unmaps is not and would require extra information to be stored by
vfio.  On POWER, the IOMMU operations often *are* a hot path, so
manipulating those structures would have a real cost, too.
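
As a rough illustration of why (entirely schematic, not the actual POWER
code): the TCE table is little more than a flat array of per-IOMMU-page
entries, so there is nowhere to record how a mapping was originally
grouped:

/* Schematic only -- names and bit layout are made up for illustration */
#define TCE_READ	0x1ULL
#define TCE_WRITE	0x2ULL

struct tce_table {
	u64		*entries;	/* one entry per IOMMU page in the window */
	unsigned long	nr_entries;
};

static void tce_set(struct tce_table *tbl, unsigned long idx,
		    u64 real_addr, u64 perms)
{
	/* page-aligned real address plus permission bits in the low bits */
	tbl->entries[idx] = real_addr | perms;
}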
Benjamin Herrenschmidt Nov. 15, 2011, 12:49 a.m. UTC | #20
On Tue, 2011-11-15 at 11:05 +1100, David Gibson wrote:
> Being strict, or at least enforcing strictness, requires that the
> infrastructure track all the maps, so that the unmaps can be
> matching.  This is not a natural thing with the data structures you
> want for all IOMMUs.  For example on POWER, the IOMMU (aka TCE table)
> is a simple 1-level pagetable.  One pointer with a couple of
> permission bits per IOMMU page.  Handling oddly overlapping operations
> on that data structure is natural, enforcing strict matching of maps
> and unmaps is not and would require extra information to be stored by
> vfio.  On POWER, the IOMMU operations often *are* a hot path, so
> manipulating those structures would have a real cost, too. 

In fact they are a very hot path.  There's no way we can afford the
cost of tracking per-page mapping/unmapping (other than bumping the page
count on a page that's currently mapped, or via some debug-only feature).

Cheers,
Ben.
Alex Williamson Nov. 15, 2011, 2:29 a.m. UTC | #21
On Mon, 2011-11-14 at 13:54 -0700, Alex Williamson wrote:
> On Fri, 2011-11-11 at 18:14 -0600, Scott Wood wrote:
> > On 11/03/2011 03:12 PM, Alex Williamson wrote:
> > > +	int			(*get)(void *);
> > > +	void			(*put)(void *);
> > > +	ssize_t			(*read)(void *, char __user *,
> > > +					size_t, loff_t *);
> > > +	ssize_t			(*write)(void *, const char __user *,
> > > +					 size_t, loff_t *);
> > > +	long			(*ioctl)(void *, unsigned int, unsigned long);
> > > +	int			(*mmap)(void *, struct vm_area_struct *);
> > > +};
> > 
> > When defining an API, please do not omit parameter names.
> 
> ok
> 
> > Should specify what the driver is supposed to do with get/put -- I guess
> > not try to unbind when the count is nonzero?  Races could still lead the
> > unbinder to be blocked, but I guess it lets the driver know when it's
> > likely to succeed.
> 
> Right, for the pci bus driver, it's mainly for reference counting,
> including the module_get to prevent vfio-pci from being unloaded.  On
> the first get for a device, we also do a pci_enable() and pci_disable()
> on last put.  I'll try to clarify in the docs.

Looking at these again, I should just rename them to open/release.  That
matches the points when they're called.  I suspect I started with just
reference counting and it grew to more of a full blown open/release.
Thanks,

Alex
David Gibson Nov. 15, 2011, 6:34 a.m. UTC | #22
On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
> VFIO provides a secure, IOMMU based interface for user space
> drivers, including device assignment to virtual machines.
> This provides the base management of IOMMU groups, devices,
> and IOMMU objects.  See Documentation/vfio.txt included in
> this patch for user and kernel API description.
> 
> Note, this implements the new API discussed at KVM Forum
> 2011, as represented by the drvier version 0.2.  It's hoped
> that this provides a modular enough interface to support PCI
> and non-PCI userspace drivers across various architectures
> and IOMMU implementations.
> 
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
> Fingers crossed, this is the last RFC for VFIO, but we need
> the iommu group support before this can go upstream
> (http://lkml.indiana.edu/hypermail/linux/kernel/1110.2/02303.html),
> hoping this helps push that along.
> 
> Since the last posting, this version completely modularizes
> the device backends and better defines the APIs between the
> core VFIO code and the device backends.  I expect that we
> might also adopt a modular IOMMU interface as iommu_ops learns
> about different types of hardware.  Also many, many cleanups.
> Check the complete git history for details:
> 
> git://github.com/awilliam/linux-vfio.git vfio-ng
> 
> (matching qemu tree: git://github.com/awilliam/qemu-vfio.git)
> 
> This version, along with the supporting VFIO PCI backend can
> be found here:
> 
> git://github.com/awilliam/linux-vfio.git vfio-next-20111103
> 
> I've held off on implementing a kernel->user signaling
> mechanism for now since the previous netlink version produced
> too many gag reflexes.  It's easy enough to set a bit in the
> group flags too indicate such support in the future, so I
> think we can move ahead without it.
> 
> Appreciate any feedback or suggestions.  Thanks,
> 
> Alex
> 
>  Documentation/ioctl/ioctl-number.txt |    1 
>  Documentation/vfio.txt               |  304 +++++++++
>  MAINTAINERS                          |    8 
>  drivers/Kconfig                      |    2 
>  drivers/Makefile                     |    1 
>  drivers/vfio/Kconfig                 |    8 
>  drivers/vfio/Makefile                |    3 
>  drivers/vfio/vfio_iommu.c            |  530 ++++++++++++++++
>  drivers/vfio/vfio_main.c             | 1151 ++++++++++++++++++++++++++++++++++
>  drivers/vfio/vfio_private.h          |   34 +
>  include/linux/vfio.h                 |  155 +++++
>  11 files changed, 2197 insertions(+), 0 deletions(-)
>  create mode 100644 Documentation/vfio.txt
>  create mode 100644 drivers/vfio/Kconfig
>  create mode 100644 drivers/vfio/Makefile
>  create mode 100644 drivers/vfio/vfio_iommu.c
>  create mode 100644 drivers/vfio/vfio_main.c
>  create mode 100644 drivers/vfio/vfio_private.h
>  create mode 100644 include/linux/vfio.h
> 
> diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
> index 54078ed..59d01e4 100644
> --- a/Documentation/ioctl/ioctl-number.txt
> +++ b/Documentation/ioctl/ioctl-number.txt
> @@ -88,6 +88,7 @@ Code  Seq#(hex)	Include File		Comments
>  		and kernel/power/user.c
>  '8'	all				SNP8023 advanced NIC card
>  					<mailto:mcr@solidum.com>
> +';'	64-76	linux/vfio.h
>  '@'	00-0F	linux/radeonfb.h	conflict!
>  '@'	00-0F	drivers/video/aty/aty128fb.c	conflict!
>  'A'	00-1F	linux/apm_bios.h	conflict!
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> new file mode 100644
> index 0000000..5866896
> --- /dev/null
> +++ b/Documentation/vfio.txt
> @@ -0,0 +1,304 @@
> +VFIO - "Virtual Function I/O"[1]
> +-------------------------------------------------------------------------------
> +Many modern system now provide DMA and interrupt remapping facilities
> +to help ensure I/O devices behave within the boundaries they've been
> +allotted.  This includes x86 hardware with AMD-Vi and Intel VT-d as
> +well as POWER systems with Partitionable Endpoints (PEs) and even
> +embedded powerpc systems (technology name unknown).  The VFIO driver
> +is an IOMMU/device agnostic framework for exposing direct device
> +access to userspace, in a secure, IOMMU protected environment.  In
> +other words, this allows safe, non-privileged, userspace drivers.

It's perhaps worth emphasising that "safe" depends on the hardware
being sufficiently well behaved.  BenH, I know, thinks there are a
*lot* of cards that, e.g. have debug registers that allow a backdoor
to their own config space via MMIO, which would bypass vfio's
filtering of config space access.  And that's before we even get into
the varying degrees of completeness in the isolation provided by
different IOMMUs.

> +Why do we want that?  Virtual machines often make use of direct device
> +access ("device assignment") when configured for the highest possible
> +I/O performance.  From a device and host perspective, this simply turns
> +the VM into a userspace driver, with the benefits of significantly
> +reduced latency, higher bandwidth, and direct use of bare-metal device
> +drivers[2].
> +
> +Some applications, particularly in the high performance computing
> +field, also benefit from low-overhead, direct device access from
> +userspace.  Examples include network adapters (often non-TCP/IP based)
> +and compute accelerators.  Previous to VFIO, these drivers needed to

s/Previous/Prior/  although that may be a .us vs .au usage thing.

> +go through the full development cycle to become proper upstream driver,
> +be maintained out of tree, or make use of the UIO framework, which
> +has no notion of IOMMU protection, limited interrupt support, and
> +requires root privileges to access things like PCI configuration space.
> +
> +The VFIO driver framework intends to unify these, replacing both the
> +KVM PCI specific device assignment currently used as well as provide
> +a more secure, more featureful userspace driver environment than UIO.
> +
> +Groups, Devices, IOMMUs, oh my
> +-------------------------------------------------------------------------------
> +
> +A fundamental component of VFIO is the notion of IOMMU groups.  IOMMUs
> +can't always distinguish transactions from each individual device in
> +the system.  Sometimes this is because of the IOMMU design, such as with
> +PEs, other times it's caused by the I/O topology, for instance a
> +PCIe-to-PCI bridge masking all devices behind it.  We call the sets of
> +devices created by these restictions IOMMU groups (or just "groups" for
> +this document).
> +
> +The IOMMU cannot distiguish transactions between the individual devices
> +within the group, therefore the group is the basic unit of ownership for
> +a userspace process.  Because of this, groups are also the primary
> +interface to both devices and IOMMU domains in VFIO.
> +
> +The VFIO representation of groups is created as devices are added into
> +the framework by a VFIO bus driver.  The vfio-pci module is an example
> +of a bus driver.  This module registers devices along with a set of bus
> +specific callbacks with the VFIO core.  These callbacks provide the
> +interfaces later used for device access.  As each new group is created,
> +as determined by iommu_device_group(), VFIO creates a /dev/vfio/$GROUP
> +character device.

Ok.. so, the fact that it's called "vfio-pci" suggests that the VFIO
bus driver is per bus type, not per bus instance.   But grouping
constraints could be per bus instance, if you have a couple of
different models of PCI host bridge with IOMMUs of different
capabilities built in, for example.

> +In addition to the device enumeration and callbacks, the VFIO bus driver
> +also provides a traditional device driver and is able to bind to devices
> +on it's bus.  When a device is bound to the bus driver it's available to
> +VFIO.  When all the devices within a group are bound to their bus drivers,
> +the group becomes "viable" and a user with sufficient access to the VFIO
> +group chardev can obtain exclusive access to the set of group devices.
> +
> +As documented in linux/vfio.h, several ioctls are provided on the
> +group chardev:
> +
> +#define VFIO_GROUP_GET_FLAGS            _IOR(';', 100, __u64)
> + #define VFIO_GROUP_FLAGS_VIABLE        (1 << 0)
> + #define VFIO_GROUP_FLAGS_MM_LOCKED     (1 << 1)
> +#define VFIO_GROUP_MERGE                _IOW(';', 101, int)
> +#define VFIO_GROUP_UNMERGE              _IOW(';', 102, int)
> +#define VFIO_GROUP_GET_IOMMU_FD         _IO(';', 103)
> +#define VFIO_GROUP_GET_DEVICE_FD        _IOW(';', 104, char *)
> +
> +The last two ioctls return new file descriptors for accessing
> +individual devices within the group and programming the IOMMU.  Each of
> +these new file descriptors provide their own set of file interfaces.
> +These ioctls will fail if any of the devices within the group are not
> +bound to their VFIO bus driver.  Additionally, when either of these
> +interfaces are used, the group is then bound to the struct_mm of the
> +caller.  The GET_FLAGS ioctl can be used to view the state of the group.
> +
> +When either the GET_IOMMU_FD or GET_DEVICE_FD ioctls are invoked, a
> +new IOMMU domain is created and all of the devices in the group are
> +attached to it.  This is the only way to ensure full IOMMU isolation
> +of the group, but potentially wastes resources and cycles if the user
> +intends to manage multiple groups with the same set of IOMMU mappings.
> +VFIO therefore provides a group MERGE and UNMERGE interface, which
> +allows multiple groups to share an IOMMU domain.  Not all IOMMUs allow
> +arbitrary groups to be merged, so the user should assume merging is
> +opportunistic.

I do not think "opportunistic" means what you think it means..

>  A new group, with no open device or IOMMU file
> +descriptors, can be merged into an existing, in-use, group using the
> +MERGE ioctl.  A merged group can be unmerged using the UNMERGE ioctl
> +once all of the device file descriptors for the group being merged
> +"out" are closed.
> +
> +When groups are merged, the GET_IOMMU_FD and GET_DEVICE_FD ioctls are
> +essentially fungible between group file descriptors (ie. if device
> A

IDNT "fungible" MWYTIM, either.

> +is in group X, and X is merged with Y, a file descriptor for A can be
> +retrieved using GET_DEVICE_FD on Y.  Likewise, GET_IOMMU_FD returns a
> +file descriptor referencing the same internal IOMMU object from either
> +X or Y).  Merged groups can be dissolved either explictly with UNMERGE
> +or automatically when ALL file descriptors for the merged group are
> +closed (all IOMMUs, all devices, all groups).

Blech.  I'm really not liking this merge/unmerge API as it stands,
it's horribly confusing.  At the very least, we need some better
terminology.  We need some term for the metagroups; supergroups; iommu
domains or-at-least-they-will-be-once-we-open-the-iommu or
whathaveyous.

The first confusing thing about this interface is that each open group
handle actually refers to two different things; the original group you
opened and the metagroup it's a part of.  For the GET_IOMMU_FD and
GET_DEVICE_FD operations, you're using the metagroup and two "merged"
group handles are interchangeable.  For other MERGE and especially
UNMERGE operations, it matters which is the original group.

The semantics of "merge" and "unmerge" under those names are really
non-obvious.  Merge kind of has to merge two whole metagroups, but
it's unclear if unmerge reverses one merge, or just takes out one
(atom) group.  These operations need better names, at least.

Then it's unclear what order you can do various operations, and which
order you can open and close various things.  You can kind of figure
it out but it takes far more thinking than it should.


So at the _very_ least, we need to invent new terminology and find a
much better way of describing this API's semantics.  I still think an
entirely different interface, where metagroups are created from
outside with a lifetime that's not tied to an fd would be a better
idea.



Now, you specify that you can't use a group as the second argument of
a merge if it already has an open iommu, but it's not clear from the
doc if you can merge things into a group with an open iommu.  Banning
this would make life simpler, because the IOMMU's effective
capabilities may change if you add more devices to the domain.  That's
yet another non-obvious constraint in the interface ordering, though.

> +The IOMMU file descriptor provides this set of ioctls:
> +
> +#define VFIO_IOMMU_GET_FLAGS            _IOR(';', 105, __u64)
> + #define VFIO_IOMMU_FLAGS_MAP_ANY       (1 << 0)
> +#define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
> +#define VFIO_IOMMU_UNMAP_DMA            _IOWR(';', 107, struct vfio_dma_map)
> +
> +The GET_FLAGS ioctl returns basic information about the IOMMU domain.
> +We currently only support IOMMU domains that are able to map any
> +virtual address to any IOVA.  This is indicated by the MAP_ANY
> flag.

So.  I tend to think of an IOMMU mapping IOVAs to memory pages, rather
than memory pages to IOVAs.  The IOMMU itself, of course maps to
physical addresses, and the meaning of "virtual address" in this
context is not really clear.  I think you would be better off saying
the IOMMU can map any IOVA to any memory page.  From a hardware POV
that means any physical address, but of course for a VFIO user a page
is specified by its process virtual address.

I think we need to pin down exactly what "MAP_ANY" means.  Now,
VFIO is pretty much a lost cause if you can't map any normal process
memory page into the IOMMU, so I think the only thing that is really
covered is IOVAs.  But saying "can map any IOVA" is not clear, because
if you can't map it, it's not a (valid) IOVA.  Better to say that
IOVAs can be any 64-bit value, which I think is what you really mean
here.

Of course, since POWER is a platform where this is *not* true, I'd
prefer to have something giving the range of valid IOVAs in the core
to start with.

> +
> +The (UN)MAP_DMA commands make use of struct vfio_dma_map for mapping
> +and unmapping IOVAs to process virtual addresses:
> +
> +struct vfio_dma_map {
> +        __u64   len;            /* length of structure */

Thanks for adding these structure length fields.  But I think they
should be called something other than 'len', which is likely to be
confused with size (or some other length that's actually related to
the operation's parameters).  Better to call it 'structlen' or
'argslen' or something.

> +        __u64   vaddr;          /* process virtual addr */
> +        __u64   dmaaddr;        /* desired and/or returned dma address */
> +        __u64   size;           /* size in bytes */
> +        __u64   flags;
> +#define VFIO_DMA_MAP_FLAG_WRITE         (1 << 0) /* req writeable DMA mem */

Make it independent READ and WRITE flags from the start.  Not all
combinations will be be valid on all hardware, but that way we have
the possibilities covered without having to use strange encodings
later.

> +};
> +
> +Current users of VFIO use relatively static DMA mappings, not requiring
> +high frequency turnover.  As new users are added, it's expected that the
> +IOMMU file descriptor will evolve to support new mapping interfaces, this
> +will be reflected in the flags and may present new ioctls and file
> +interfaces.
> +
> +The device GET_FLAGS ioctl is intended to return basic device type and
> +indicate support for optional capabilities.  Flags currently include whether
> +the device is PCI or described by Device Tree, and whether the RESET ioctl
> +is supported:
> +
> +#define VFIO_DEVICE_GET_FLAGS           _IOR(';', 108, __u64)
> + #define VFIO_DEVICE_FLAGS_PCI          (1 << 0)
> + #define VFIO_DEVICE_FLAGS_DT           (1 << 1)

TBH, I don't think the VFIO for DT stuff is mature enough yet to be in
an initial infrastructure patch, though we should certainly be
discussing it as an add-on patch.

> + #define VFIO_DEVICE_FLAGS_RESET        (1 << 2)
> +
> +The MMIO and IOP resources used by a device are described by regions.
> +The GET_NUM_REGIONS ioctl tells us how many regions the device supports:
> +
> +#define VFIO_DEVICE_GET_NUM_REGIONS     _IOR(';', 109, int)
> +
> +Regions are described by a struct vfio_region_info, which is retrieved by
> +using the GET_REGION_INFO ioctl with vfio_region_info.index field set to
> +the desired region (0 based index).  Note that devices may implement zero
> +sized regions (vfio-pci does this to provide a 1:1 BAR to region index
> +mapping).

So, I think you're saying that a zero-sized region is used to encode a
NOP region, that is, to basically put a "no region here" in between
valid region indices.  You should spell that out.

[Incidentally, any chance you could borrow one of RH's tech writers
for this?  I'm afraid you seem to lack the knack for clear and easily
read documentation]

> +struct vfio_region_info {
> +        __u32   len;            /* length of structure */
> +        __u32   index;          /* region number */
> +        __u64   size;           /* size in bytes of region */
> +        __u64   offset;         /* start offset of region */
> +        __u64   flags;
> +#define VFIO_REGION_INFO_FLAG_MMAP              (1 << 0)
> +#define VFIO_REGION_INFO_FLAG_RO                (1 << 1)

Again having separate read and write bits from the start will save
strange encodings later.

> +#define VFIO_REGION_INFO_FLAG_PHYS_VALID        (1 << 2)
> +        __u64   phys;           /* physical address of region */
> +};

I notice there is no field for "type" e.g. MMIO vs. PIO vs. config
space for PCI.  If you added that having a NONE type might be a
clearer way of encoding a non-region than just having size==0.

> +
> +#define VFIO_DEVICE_GET_REGION_INFO     _IOWR(';', 110, struct vfio_region_info)
> +
> +The offset indicates the offset into the device file descriptor which
> +accesses the given range (for read/write/mmap/seek).  Flags indicate the
> +available access types and validity of optional fields.  For instance
> +the phys field may only be valid for certain devices types.
> +
> +Interrupts are described using a similar interface.  GET_NUM_IRQS
> +reports the number or IRQ indexes for the device.
> +
> +#define VFIO_DEVICE_GET_NUM_IRQS        _IOR(';', 111, int)
> +
> +struct vfio_irq_info {
> +        __u32   len;            /* length of structure */
> +        __u32   index;          /* IRQ number */
> +        __u32   count;          /* number of individual IRQs */

Is there a reason for allowing irqs in batches like this, rather than
having each MSI be reflected by a separate irq_info?

> +        __u64   flags;
> +#define VFIO_IRQ_INFO_FLAG_LEVEL                (1 << 0)
> +};
> +
> +Again, zero count entries are allowed (vfio-pci uses a static interrupt
> +type to index mapping).

I know what you mean, but you need a clearer way to express it.

> +Information about each index can be retrieved using the GET_IRQ_INFO
> +ioctl, used much like GET_REGION_INFO.
> +
> +#define VFIO_DEVICE_GET_IRQ_INFO        _IOWR(';', 112, struct vfio_irq_info)
> +
> +Individual indexes can describe single or sets of IRQs.  This provides the
> +flexibility to describe PCI INTx, MSI, and MSI-X using a single interface.
> +
> +All VFIO interrupts are signaled to userspace via eventfds.  Integer arrays,
> +as shown below, are used to pass the IRQ info index, the number of eventfds,
> +and each eventfd to be signaled.  Using a count of 0 disables the interrupt.
> +
> +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
> +#define VFIO_DEVICE_SET_IRQ_EVENTFDS    _IOW(';', 113, int)
> +
> +When a level triggered interrupt is signaled, the interrupt is masked
> +on the host.  This prevents an unresponsive userspace driver from
> +continuing to interrupt the host system.  After servicing the interrupt,
> +UNMASK_IRQ is used to allow the interrupt to retrigger.  Note that level
> +triggered interrupts implicitly have a count of 1 per index.

This is a silly restriction.  Even PCI devices can have up to 4 LSIs
on a function in theory, though no-one ever does.  Embedded devices
can and do have multiple level interrupts.

> +
> +/* Unmask IRQ index, arg[0] = index */
> +#define VFIO_DEVICE_UNMASK_IRQ          _IOW(';', 114, int)
> +
> +Level triggered interrupts can also be unmasked using an irqfd.  Use
> +SET_UNMASK_IRQ_EVENTFD to set the file descriptor for this.
> +
> +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
> +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD      _IOW(';', 115, int)
> +
> +When supported, as indicated by the device flags, reset the device.
> +
> +#define VFIO_DEVICE_RESET               _IO(';', 116)
> +
> +Device tree devices also invlude ioctls for further defining the
> +device tree properties of the device:
> +
> +struct vfio_dtpath {
> +        __u32   len;            /* length of structure */
> +        __u32   index;
> +        __u64   flags;
> +#define VFIO_DTPATH_FLAGS_REGION        (1 << 0)
> +#define VFIO_DTPATH_FLAGS_IRQ           (1 << 1)
> +        char    *path;
> +};
> +#define VFIO_DEVICE_GET_DTPATH          _IOWR(';', 117, struct vfio_dtpath)
> +
> +struct vfio_dtindex {
> +        __u32   len;            /* length of structure */
> +        __u32   index;
> +        __u32   prop_type;
> +        __u32   prop_index;
> +        __u64   flags;
> +#define VFIO_DTINDEX_FLAGS_REGION       (1 << 0)
> +#define VFIO_DTINDEX_FLAGS_IRQ          (1 << 1)
> +};
> +#define VFIO_DEVICE_GET_DTINDEX         _IOWR(';', 118, struct vfio_dtindex)
> +
> +
> +VFIO bus driver API
> +-------------------------------------------------------------------------------
> +
> +Bus drivers, such as PCI, have three jobs:
> + 1) Add/remove devices from vfio
> + 2) Provide vfio_device_ops for device access
> + 3) Device binding and unbinding
> +
> +When initialized, the bus driver should enumerate the devices on it's

s/it's/its/

> +bus and call vfio_group_add_dev() for each device.  If the bus supports
> +hotplug, notifiers should be enabled to track devices being added and
> +removed.  vfio_group_del_dev() removes a previously added device from
> +vfio.
> +
> +Adding a device registers a vfio_device_ops function pointer structure
> +for the device:
> +
> +struct vfio_device_ops {
> +	bool			(*match)(struct device *, char *);
> +	int			(*get)(void *);
> +	void			(*put)(void *);
> +	ssize_t			(*read)(void *, char __user *,
> +					size_t, loff_t *);
> +	ssize_t			(*write)(void *, const char __user *,
> +					 size_t, loff_t *);
> +	long			(*ioctl)(void *, unsigned int, unsigned long);
> +	int			(*mmap)(void *, struct vm_area_struct *);
> +};
> +
> +When a device is bound to the bus driver, the bus driver indicates this
> +to vfio using the vfio_bind_dev() interface.  The device_data parameter
> +is a pointer to an opaque data structure for use only by the bus driver.
> +The get, put, read, write, ioctl, and mmap vfio_device_ops all pass
> +this data structure back to the bus driver.  When a device is unbound
> +from the bus driver, the vfio_unbind_dev() interface signals this to
> +vfio.  This function returns the pointer to the device_data structure
> +registered for the device.
> +
> +As noted previously, a group contains one or more devices, so
> +GROUP_GET_DEVICE_FD needs to identify the specific device being requested.
> +The vfio_device_ops.match callback is used to allow bus drivers to determine
> +the match.  For drivers like vfio-pci, it's a simple match to dev_name(),
> +which is unique in the system due to the PCI bus topology, other bus drivers
> +may need to include parent devices to create a unique match, so this is
> +left as a bus driver interface.
> +
> +-------------------------------------------------------------------------------
> +
> +[1] VFIO was originally an acronym for "Virtual Function I/O" in it's
> +initial implementation by Tom Lyon while as Cisco.  We've since outgrown
> +the acronym, but it's catchy.
> +
> +[2] As always there are trade-offs to virtual machine device
> +assignment that are beyond the scope of VFIO.  It's expected that
> +future IOMMU technologies will reduce some, but maybe not all, of
> +these trade-offs.
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f05f5f6..4bd5aa0 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -7106,6 +7106,14 @@ S:	Maintained
>  F:	Documentation/filesystems/vfat.txt
>  F:	fs/fat/
>  
> +VFIO DRIVER
> +M:	Alex Williamson <alex.williamson@redhat.com>
> +L:	kvm@vger.kernel.org
> +S:	Maintained
> +F:	Documentation/vfio.txt
> +F:	drivers/vfio/
> +F:	include/linux/vfio.h
> +
>  VIDEOBUF2 FRAMEWORK
>  M:	Pawel Osciak <pawel@osciak.com>
>  M:	Marek Szyprowski <m.szyprowski@samsung.com>
> diff --git a/drivers/Kconfig b/drivers/Kconfig
> index b5e6f24..e15578b 100644
> --- a/drivers/Kconfig
> +++ b/drivers/Kconfig
> @@ -112,6 +112,8 @@ source "drivers/auxdisplay/Kconfig"
>  
>  source "drivers/uio/Kconfig"
>  
> +source "drivers/vfio/Kconfig"
> +
>  source "drivers/vlynq/Kconfig"
>  
>  source "drivers/virtio/Kconfig"
> diff --git a/drivers/Makefile b/drivers/Makefile
> index 1b31421..5f138b5 100644
> --- a/drivers/Makefile
> +++ b/drivers/Makefile
> @@ -58,6 +58,7 @@ obj-$(CONFIG_ATM)		+= atm/
>  obj-$(CONFIG_FUSION)		+= message/
>  obj-y				+= firewire/
>  obj-$(CONFIG_UIO)		+= uio/
> +obj-$(CONFIG_VFIO)		+= vfio/
>  obj-y				+= cdrom/
>  obj-y				+= auxdisplay/
>  obj-$(CONFIG_PCCARD)		+= pcmcia/
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> new file mode 100644
> index 0000000..9acb1e7
> --- /dev/null
> +++ b/drivers/vfio/Kconfig
> @@ -0,0 +1,8 @@
> +menuconfig VFIO
> +	tristate "VFIO Non-Privileged userspace driver framework"
> +	depends on IOMMU_API
> +	help
> +	  VFIO provides a framework for secure userspace device drivers.
> +	  See Documentation/vfio.txt for more details.
> +
> +	  If you don't know what to do here, say N.
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> new file mode 100644
> index 0000000..088faf1
> --- /dev/null
> +++ b/drivers/vfio/Makefile
> @@ -0,0 +1,3 @@
> +vfio-y := vfio_main.o vfio_iommu.o
> +
> +obj-$(CONFIG_VFIO) := vfio.o
> diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
> new file mode 100644
> index 0000000..029dae3
> --- /dev/null
> +++ b/drivers/vfio/vfio_iommu.c
> @@ -0,0 +1,530 @@
> +/*
> + * VFIO: IOMMU DMA mapping support
> + *
> + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio:
> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> + * Author: Tom Lyon, pugs@cisco.com
> + */
> +
> +#include <linux/compat.h>
> +#include <linux/device.h>
> +#include <linux/fs.h>
> +#include <linux/iommu.h>
> +#include <linux/module.h>
> +#include <linux/mm.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +#include <linux/vfio.h>
> +#include <linux/workqueue.h>
> +
> +#include "vfio_private.h"
> +
> +struct dma_map_page {
> +	struct list_head	list;
> +	dma_addr_t		daddr;
> +	unsigned long		vaddr;
> +	int			npage;
> +	int			rdwr;
> +};
> +
> +/*
> + * This code handles mapping and unmapping of user data buffers
> + * into DMA'ble space using the IOMMU
> + */
> +
> +#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
> +
> +struct vwork {
> +	struct mm_struct	*mm;
> +	int			npage;
> +	struct work_struct	work;
> +};
> +
> +/* delayed decrement for locked_vm */
> +static void vfio_lock_acct_bg(struct work_struct *work)
> +{
> +	struct vwork *vwork = container_of(work, struct vwork, work);
> +	struct mm_struct *mm;
> +
> +	mm = vwork->mm;
> +	down_write(&mm->mmap_sem);
> +	mm->locked_vm += vwork->npage;
> +	up_write(&mm->mmap_sem);
> +	mmput(mm);		/* unref mm */
> +	kfree(vwork);
> +}
> +
> +static void vfio_lock_acct(int npage)
> +{
> +	struct vwork *vwork;
> +	struct mm_struct *mm;
> +
> +	if (!current->mm) {
> +		/* process exited */
> +		return;
> +	}
> +	if (down_write_trylock(&current->mm->mmap_sem)) {
> +		current->mm->locked_vm += npage;
> +		up_write(&current->mm->mmap_sem);
> +		return;
> +	}
> +	/*
> +	 * Couldn't get mmap_sem lock, so must setup to decrement
> +	 * mm->locked_vm later. If locked_vm were atomic, we wouldn't
> +	 * need this silliness
> +	 */
> +	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> +	if (!vwork)
> +		return;
> +	mm = get_task_mm(current);	/* take ref mm */
> +	if (!mm) {
> +		kfree(vwork);
> +		return;
> +	}
> +	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
> +	vwork->mm = mm;
> +	vwork->npage = npage;
> +	schedule_work(&vwork->work);
> +}
> +
> +/* Some mappings aren't backed by a struct page, for example an mmap'd
> + * MMIO range for our own or another device.  These use a different
> + * pfn conversion and shouldn't be tracked as locked pages. */
> +static int is_invalid_reserved_pfn(unsigned long pfn)
> +{
> +	if (pfn_valid(pfn)) {
> +		int reserved;
> +		struct page *tail = pfn_to_page(pfn);
> +		struct page *head = compound_trans_head(tail);
> +		reserved = PageReserved(head);
> +		if (head != tail) {
> +			/* "head" is not a dangling pointer
> +			 * (compound_trans_head takes care of that)
> +			 * but the hugepage may have been split
> +			 * from under us (and we may not hold a
> +			 * reference count on the head page so it can
> +			 * be reused before we run PageReferenced), so
> +			 * we've to check PageTail before returning
> +			 * what we just read.
> +			 */
> +			smp_rmb();
> +			if (PageTail(tail))
> +				return reserved;
> +		}
> +		return PageReserved(tail);
> +	}
> +
> +	return true;
> +}
> +
> +static int put_pfn(unsigned long pfn, int rdwr)
> +{
> +	if (!is_invalid_reserved_pfn(pfn)) {
> +		struct page *page = pfn_to_page(pfn);
> +		if (rdwr)
> +			SetPageDirty(page);
> +		put_page(page);
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +/* Unmap DMA region */
> +/* dgate must be held */
> +static int __vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
> +			    int npage, int rdwr)

Use of "read" and "write" in DMA can often be confusing, since it's
not always clear if you're talking from the perspective of the CPU or
the device (_writing_ data to a device will usually involve it doing
DMA _reads_ from memory).  It's often best to express things as DMA
direction, 'to device', and 'from device' instead.
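
E.g. using the core enum (just a sketch, not insisting on this
particular form):

#include <linux/dma-mapping.h>

/* the IOMMU_* prot bits are from the device's point of view */
static inline int vfio_dma_dir_to_prot(enum dma_data_direction dir)
{
	switch (dir) {
	case DMA_BIDIRECTIONAL:
		return IOMMU_READ | IOMMU_WRITE;
	case DMA_TO_DEVICE:
		return IOMMU_READ;	/* device reads from memory */
	case DMA_FROM_DEVICE:
		return IOMMU_WRITE;	/* device writes to memory */
	default:
		return 0;
	}
}

which also gives a natural place to decide whether put_pfn() should
dirty the page (only for directions where the device may have written
to it).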

> +{
> +	int i, unlocked = 0;
> +
> +	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> +		unsigned long pfn;
> +
> +		pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
> +		if (pfn) {
> +			iommu_unmap(iommu->domain, iova, 0);
> +			unlocked += put_pfn(pfn, rdwr);
> +		}
> +	}
> +	return unlocked;
> +}
> +
> +static void vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
> +			   unsigned long npage, int rdwr)
> +{
> +	int unlocked;
> +
> +	unlocked = __vfio_dma_unmap(iommu, iova, npage, rdwr);
> +	vfio_lock_acct(-unlocked);

Have you checked that your accounting will work out if the user maps
the same memory page to multiple IOVAs?
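
For example, if I'm reading the accounting right, with one user page
at vaddr V:

  MAP_DMA   { vaddr=V, dmaaddr=A, size=4k }   ->  locked_vm += 1
  MAP_DMA   { vaddr=V, dmaaddr=B, size=4k }   ->  locked_vm += 1

The page is really only locked once, but it now counts twice against
RLIMIT_MEMLOCK, so heavily aliased mappings hit the limit well before
that much memory is actually pinned.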

> +}
> +
> +/* Unmap ALL DMA regions */
> +void vfio_iommu_unmapall(struct vfio_iommu *iommu)
> +{
> +	struct list_head *pos, *pos2;
> +	struct dma_map_page *mlp;
> +
> +	mutex_lock(&iommu->dgate);
> +	list_for_each_safe(pos, pos2, &iommu->dm_list) {
> +		mlp = list_entry(pos, struct dma_map_page, list);
> +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> +		list_del(&mlp->list);
> +		kfree(mlp);
> +	}
> +	mutex_unlock(&iommu->dgate);

Ouch, no good at all.  Keeping track of every DMA map is no good on
POWER or other systems where IOMMU operations are a hot path.  I think
you'll need an iommu specific hook for this instead, which uses
whatever data structures are natural for the IOMMU.  For example a
1-level pagetable, like we use on POWER will just zero every entry.
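
Roughly something like a per-backend ops table (names invented, just
to sketch the shape):

struct vfio_iommu_ops {
	int	(*map)(struct vfio_iommu *iommu, dma_addr_t iova,
		       unsigned long vaddr, long npage, int prot);
	int	(*unmap)(struct vfio_iommu *iommu, dma_addr_t iova,
			 long npage);
	void	(*unmapall)(struct vfio_iommu *iommu);
};

Then the x86-style backend keeps the list/pinning bookkeeping it
needs, and a POWER backend can implement unmapall() by simply
clearing its one-level table.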

> +}
> +
> +static int vaddr_get_pfn(unsigned long vaddr, int rdwr, unsigned long *pfn)
> +{
> +	struct page *page[1];
> +	struct vm_area_struct *vma;
> +	int ret = -EFAULT;
> +
> +	if (get_user_pages_fast(vaddr, 1, rdwr, page) == 1) {
> +		*pfn = page_to_pfn(page[0]);
> +		return 0;
> +	}
> +
> +	down_read(&current->mm->mmap_sem);
> +
> +	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> +
> +	if (vma && vma->vm_flags & VM_PFNMAP) {
> +		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> +		if (is_invalid_reserved_pfn(*pfn))
> +			ret = 0;
> +	}

It's kind of nasty that you take gup_fast(), already designed to grab
pointers for multiple user pages, then just use it one page at a time,
even for a big map.
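
Even keeping the page-at-a-time iommu_map() loop, the pinning itself
could be batched, e.g. (untested sketch):

#define VFIO_GUP_BATCH	64

/* fill up to VFIO_GUP_BATCH pfns starting at vaddr; caller loops */
static long vaddr_get_pfns(unsigned long vaddr, long npage, int rdwr,
			   unsigned long *pfn)
{
	struct page *pages[VFIO_GUP_BATCH];
	long i, ret;

	ret = get_user_pages_fast(vaddr, min_t(long, npage, VFIO_GUP_BATCH),
				  rdwr, pages);
	for (i = 0; i < ret; i++)
		pfn[i] = page_to_pfn(pages[i]);

	return ret;
}

(with the VM_PFNMAP fallback only taken when gup fails, as now).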

> +	up_read(&current->mm->mmap_sem);
> +
> +	return ret;
> +}
> +
> +/* Map DMA region */
> +/* dgate must be held */
> +static int vfio_dma_map(struct vfio_iommu *iommu, unsigned long iova,
> +			unsigned long vaddr, int npage, int rdwr)

iova should be a dma_addr_t.  Bus address size need not match virtual
address size, and may not fit in an unsigned long.

> +{
> +	unsigned long start = iova;
> +	int i, ret, locked = 0, prot = IOMMU_READ;
> +
> +	/* Verify pages are not already mapped */
> +	for (i = 0; i < npage; i++, iova += PAGE_SIZE)
> +		if (iommu_iova_to_phys(iommu->domain, iova))
> +			return -EBUSY;
> +
> +	iova = start;
> +
> +	if (rdwr)
> +		prot |= IOMMU_WRITE;
> +	if (iommu->cache)
> +		prot |= IOMMU_CACHE;
> +
> +	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
> +		unsigned long pfn = 0;
> +
> +		ret = vaddr_get_pfn(vaddr, rdwr, &pfn);
> +		if (ret) {
> +			__vfio_dma_unmap(iommu, start, i, rdwr);
> +			return ret;
> +		}
> +
> +		/* Only add actual locked pages to accounting */
> +		if (!is_invalid_reserved_pfn(pfn))
> +			locked++;
> +
> +		ret = iommu_map(iommu->domain, iova,
> +				(phys_addr_t)pfn << PAGE_SHIFT, 0, prot);
> +		if (ret) {
> +			/* Back out mappings on error */
> +			put_pfn(pfn, rdwr);
> +			__vfio_dma_unmap(iommu, start, i, rdwr);
> +			return ret;
> +		}
> +	}
> +	vfio_lock_acct(locked);
> +	return 0;
> +}
> +
> +static inline int ranges_overlap(unsigned long start1, size_t size1,
> +				 unsigned long start2, size_t size2)
> +{
> +	return !(start1 + size1 <= start2 || start2 + size2 <= start1);

Needs overflow safety.
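
e.g. comparing distances rather than sums avoids the wrap (sketch):

static inline bool ranges_overlap(dma_addr_t start1, size_t size1,
				  dma_addr_t start2, size_t size2)
{
	if (!size1 || !size2)
		return false;
	if (start1 < start2)
		return start2 - start1 < size1;
	return start1 - start2 < size2;
}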

> +}
> +
> +static struct dma_map_page *vfio_find_dma(struct vfio_iommu *iommu,
> +					  dma_addr_t start, size_t size)
> +{
> +	struct list_head *pos;
> +	struct dma_map_page *mlp;
> +
> +	list_for_each(pos, &iommu->dm_list) {
> +		mlp = list_entry(pos, struct dma_map_page, list);
> +		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> +				   start, size))
> +			return mlp;
> +	}
> +	return NULL;
> +}

Again, keeping track of each dma map operation is no good for
performance.

> +
> +int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
> +			    size_t size, struct dma_map_page *mlp)
> +{
> +	struct dma_map_page *split;
> +	int npage_lo, npage_hi;
> +
> +	/* Existing dma region is completely covered, unmap all */
> +	if (start <= mlp->daddr &&
> +	    start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> +		list_del(&mlp->list);
> +		npage_lo = mlp->npage;
> +		kfree(mlp);
> +		return npage_lo;
> +	}
> +
> +	/* Overlap low address of existing range */
> +	if (start <= mlp->daddr) {
> +		size_t overlap;
> +
> +		overlap = start + size - mlp->daddr;
> +		npage_lo = overlap >> PAGE_SHIFT;
> +		npage_hi = mlp->npage - npage_lo;
> +
> +		vfio_dma_unmap(iommu, mlp->daddr, npage_lo, mlp->rdwr);
> +		mlp->daddr += overlap;
> +		mlp->vaddr += overlap;
> +		mlp->npage -= npage_lo;
> +		return npage_lo;
> +	}
> +
> +	/* Overlap high address of existing range */
> +	if (start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> +		size_t overlap;
> +
> +		overlap = mlp->daddr + NPAGE_TO_SIZE(mlp->npage) - start;
> +		npage_hi = overlap >> PAGE_SHIFT;
> +		npage_lo = mlp->npage - npage_hi;
> +
> +		vfio_dma_unmap(iommu, start, npage_hi, mlp->rdwr);
> +		mlp->npage -= npage_hi;
> +		return npage_hi;
> +	}
> +
> +	/* Split existing */
> +	npage_lo = (start - mlp->daddr) >> PAGE_SHIFT;
> +	npage_hi = mlp->npage - (size >> PAGE_SHIFT) - npage_lo;
> +
> +	split = kzalloc(sizeof *split, GFP_KERNEL);
> +	if (!split)
> +		return -ENOMEM;
> +
> +	vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, mlp->rdwr);
> +
> +	mlp->npage = npage_lo;
> +
> +	split->npage = npage_hi;
> +	split->daddr = start + size;
> +	split->vaddr = mlp->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
> +	split->rdwr = mlp->rdwr;
> +	list_add(&split->list, &iommu->dm_list);
> +	return size >> PAGE_SHIFT;
> +}
> +
> +int vfio_dma_unmap_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> +{
> +	int ret = 0;
> +	size_t npage = dmp->size >> PAGE_SHIFT;
> +	struct list_head *pos, *n;
> +
> +	if (dmp->dmaaddr & ~PAGE_MASK)
> +		return -EINVAL;
> +	if (dmp->size & ~PAGE_MASK)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->dgate);
> +
> +	list_for_each_safe(pos, n, &iommu->dm_list) {
> +		struct dma_map_page *mlp;
> +
> +		mlp = list_entry(pos, struct dma_map_page, list);
> +		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> +				   dmp->dmaaddr, dmp->size)) {
> +			ret = vfio_remove_dma_overlap(iommu, dmp->dmaaddr,
> +						      dmp->size, mlp);
> +			if (ret > 0)
> +				npage -= NPAGE_TO_SIZE(ret);
> +			if (ret < 0 || npage == 0)
> +				break;
> +		}
> +	}
> +	mutex_unlock(&iommu->dgate);
> +	return ret > 0 ? 0 : ret;
> +}
> +
> +int vfio_dma_map_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> +{
> +	int npage;
> +	struct dma_map_page *mlp, *mmlp = NULL;
> +	dma_addr_t daddr = dmp->dmaaddr;
> +	unsigned long locked, lock_limit, vaddr = dmp->vaddr;
> +	size_t size = dmp->size;
> +	int ret = 0, rdwr = dmp->flags & VFIO_DMA_MAP_FLAG_WRITE;
> +
> +	if (vaddr & (PAGE_SIZE-1))
> +		return -EINVAL;
> +	if (daddr & (PAGE_SIZE-1))
> +		return -EINVAL;
> +	if (size & (PAGE_SIZE-1))
> +		return -EINVAL;
> +
> +	npage = size >> PAGE_SHIFT;
> +	if (!npage)
> +		return -EINVAL;
> +
> +	if (!iommu)
> +		return -EINVAL;
> +
> +	mutex_lock(&iommu->dgate);
> +
> +	if (vfio_find_dma(iommu, daddr, size)) {
> +		ret = -EBUSY;
> +		goto out_lock;
> +	}
> +
> +	/* account for locked pages */
> +	locked = current->mm->locked_vm + npage;
> +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> +		printk(KERN_WARNING "%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> +			__func__, rlimit(RLIMIT_MEMLOCK));
> +		ret = -ENOMEM;
> +		goto out_lock;
> +	}
> +
> +	ret = vfio_dma_map(iommu, daddr, vaddr, npage, rdwr);
> +	if (ret)
> +		goto out_lock;
> +
> +	/* Check if we abut a region below */
> +	if (daddr) {
> +		mlp = vfio_find_dma(iommu, daddr - 1, 1);
> +		if (mlp && mlp->rdwr == rdwr &&
> +		    mlp->vaddr + NPAGE_TO_SIZE(mlp->npage) == vaddr) {
> +
> +			mlp->npage += npage;
> +			daddr = mlp->daddr;
> +			vaddr = mlp->vaddr;
> +			npage = mlp->npage;
> +			size = NPAGE_TO_SIZE(npage);
> +
> +			mmlp = mlp;
> +		}
> +	}
> +
> +	if (daddr + size) {
> +		mlp = vfio_find_dma(iommu, daddr + size, 1);
> +		if (mlp && mlp->rdwr == rdwr && mlp->vaddr == vaddr + size) {
> +
> +			mlp->npage += npage;
> +			mlp->daddr = daddr;
> +			mlp->vaddr = vaddr;
> +
> +			/* If merged above and below, remove previously
> +			 * merged entry.  New entry covers it.  */
> +			if (mmlp) {
> +				list_del(&mmlp->list);
> +				kfree(mmlp);
> +			}
> +			mmlp = mlp;
> +		}
> +	}
> +
> +	if (!mmlp) {
> +		mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
> +		if (!mlp) {
> +			ret = -ENOMEM;
> +			vfio_dma_unmap(iommu, daddr, npage, rdwr);
> +			goto out_lock;
> +		}
> +
> +		mlp->npage = npage;
> +		mlp->daddr = daddr;
> +		mlp->vaddr = vaddr;
> +		mlp->rdwr = rdwr;
> +		list_add(&mlp->list, &iommu->dm_list);
> +	}
> +
> +out_lock:
> +	mutex_unlock(&iommu->dgate);
> +	return ret;
> +}

This whole tracking infrastructure is way too complex to impose on
every IOMMU.  We absolutely don't want to do all this when just
updating a 1-level pagetable.

> +static int vfio_iommu_release(struct inode *inode, struct file *filep)
> +{
> +	struct vfio_iommu *iommu = filep->private_data;
> +
> +	vfio_release_iommu(iommu);
> +	return 0;
> +}
> +
> +static long vfio_iommu_unl_ioctl(struct file *filep,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct vfio_iommu *iommu = filep->private_data;
> +	int ret = -ENOSYS;
> +
> +        if (cmd == VFIO_IOMMU_GET_FLAGS) {
> +                u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;
> +
> +                ret = put_user(flags, (u64 __user *)arg);

Um.. flags surely have to come from the IOMMU driver.
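
i.e. presumably something more like (hand-waving, the cap name is
invented):

		u64 flags = 0;

		if (iommu_domain_has_cap(iommu->domain, IOMMU_CAP_MAP_ANY))
			flags |= VFIO_IOMMU_FLAGS_MAP_ANY;

		ret = put_user(flags, (u64 __user *)arg);

so that each IOMMU backend reports what it can actually do, rather
than unconditionally advertising MAP_ANY here.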

> +        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
> +		struct vfio_dma_map dm;
> +
> +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> +			return -EFAULT;
> +
> +		ret = vfio_dma_map_dm(iommu, &dm);
> +
> +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> +			ret = -EFAULT;
> +
> +	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
> +		struct vfio_dma_map dm;
> +
> +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> +			return -EFAULT;
> +
> +		ret = vfio_dma_unmap_dm(iommu, &dm);
> +
> +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> +			ret = -EFAULT;
> +	}
> +	return ret;
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static long vfio_iommu_compat_ioctl(struct file *filep,
> +				    unsigned int cmd, unsigned long arg)
> +{
> +	arg = (unsigned long)compat_ptr(arg);
> +	return vfio_iommu_unl_ioctl(filep, cmd, arg);

Um, this only works if the structures are exactly compatible between
32-bit and 64-bit ABIs.  I don't think that is always true.
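
struct vfio_dma_map is all __u64s, so it's probably ok here, but e.g.
struct vfio_dtpath later in the patch carries a bare "char *path",
which changes size and layout for a 32-bit userspace.  The usual fix
is to carry pointers as a __u64, e.g. (sketch):

struct vfio_dtpath {
	__u32	len;		/* length of structure */
	__u32	index;
	__u64	flags;
	__u64	path;		/* userspace pointer, passed as a u64 */
};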

> +}
> +#endif	/* CONFIG_COMPAT */
> +
> +const struct file_operations vfio_iommu_fops = {
> +	.owner		= THIS_MODULE,
> +	.release	= vfio_iommu_release,
> +	.unlocked_ioctl	= vfio_iommu_unl_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= vfio_iommu_compat_ioctl,
> +#endif
> +};
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> new file mode 100644
> index 0000000..6169356
> --- /dev/null
> +++ b/drivers/vfio/vfio_main.c
> @@ -0,0 +1,1151 @@
> +/*
> + * VFIO framework
> + *
> + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio:
> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> + * Author: Tom Lyon, pugs@cisco.com
> + */
> +
> +#include <linux/cdev.h>
> +#include <linux/compat.h>
> +#include <linux/device.h>
> +#include <linux/file.h>
> +#include <linux/anon_inodes.h>
> +#include <linux/fs.h>
> +#include <linux/idr.h>
> +#include <linux/iommu.h>
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>
> +#include <linux/uaccess.h>
> +#include <linux/vfio.h>
> +#include <linux/wait.h>
> +
> +#include "vfio_private.h"
> +
> +#define DRIVER_VERSION	"0.2"
> +#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
> +#define DRIVER_DESC	"VFIO - User Level meta-driver"
> +
> +static int allow_unsafe_intrs;
> +module_param(allow_unsafe_intrs, int, 0);
> +MODULE_PARM_DESC(allow_unsafe_intrs,
> +        "Allow use of IOMMUs which do not support interrupt remapping");

This should not be a global option, but part of the AMD/Intel IOMMU
specific code.  In general it's a question of how strict the IOMMU
driver is about isolation when it determines what the groups are, and
only the IOMMU driver can know what the possibilities are for its
class of hardware.

> +
> +static struct vfio {
> +	dev_t			devt;
> +	struct cdev		cdev;
> +	struct list_head	group_list;
> +	struct mutex		lock;
> +	struct kref		kref;
> +	struct class		*class;
> +	struct idr		idr;
> +	wait_queue_head_t	release_q;
> +} vfio;
> +
> +static const struct file_operations vfio_group_fops;
> +extern const struct file_operations vfio_iommu_fops;
> +
> +struct vfio_group {
> +	dev_t			devt;
> +	unsigned int		groupid;
> +	struct bus_type		*bus;
> +	struct vfio_iommu	*iommu;
> +	struct list_head	device_list;
> +	struct list_head	iommu_next;
> +	struct list_head	group_next;
> +	int			refcnt;
> +};
> +
> +struct vfio_device {
> +	struct device			*dev;
> +	const struct vfio_device_ops	*ops;
> +	struct vfio_iommu		*iommu;
> +	struct vfio_group		*group;
> +	struct list_head		device_next;
> +	bool				attached;
> +	int				refcnt;
> +	void				*device_data;
> +};
> +
> +/*
> + * Helper functions called under vfio.lock
> + */
> +
> +/* Return true if any devices within a group are opened */
> +static bool __vfio_group_devs_inuse(struct vfio_group *group)
> +{
> +	struct list_head *pos;
> +
> +	list_for_each(pos, &group->device_list) {
> +		struct vfio_device *device;
> +
> +		device = list_entry(pos, struct vfio_device, device_next);
> +		if (device->refcnt)
> +			return true;
> +	}
> +	return false;
> +}
> +
> +/* Return true if any of the groups attached to an iommu are opened.
> + * We can only tear apart merged groups when nothing is left open. */
> +static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu)
> +{
> +	struct list_head *pos;
> +
> +	list_for_each(pos, &iommu->group_list) {
> +		struct vfio_group *group;
> +
> +		group = list_entry(pos, struct vfio_group, iommu_next);
> +		if (group->refcnt)
> +			return true;
> +	}
> +	return false;
> +}
> +
> +/* An iommu is "in use" if it has a file descriptor open or if any of
> + * the groups assigned to the iommu have devices open. */
> +static bool __vfio_iommu_inuse(struct vfio_iommu *iommu)
> +{
> +	struct list_head *pos;
> +
> +	if (iommu->refcnt)
> +		return true;
> +
> +	list_for_each(pos, &iommu->group_list) {
> +		struct vfio_group *group;
> +
> +		group = list_entry(pos, struct vfio_group, iommu_next);
> +
> +		if (__vfio_group_devs_inuse(group))
> +			return true;
> +	}
> +	return false;
> +}
> +
> +static void __vfio_group_set_iommu(struct vfio_group *group,
> +				   struct vfio_iommu *iommu)
> +{
> +	struct list_head *pos;
> +
> +	if (group->iommu)
> +		list_del(&group->iommu_next);
> +	if (iommu)
> +		list_add(&group->iommu_next, &iommu->group_list);
> +
> +	group->iommu = iommu;
> +
> +	list_for_each(pos, &group->device_list) {
> +		struct vfio_device *device;
> +
> +		device = list_entry(pos, struct vfio_device, device_next);
> +		device->iommu = iommu;
> +	}
> +}
> +
> +static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,
> +				    struct vfio_device *device)
> +{
> +	BUG_ON(!iommu->domain && device->attached);
> +
> +	if (!iommu->domain || !device->attached)
> +		return;
> +
> +	iommu_detach_device(iommu->domain, device->dev);
> +	device->attached = false;
> +}
> +
> +static void __vfio_iommu_detach_group(struct vfio_iommu *iommu,
> +				      struct vfio_group *group)
> +{
> +	struct list_head *pos;
> +
> +	list_for_each(pos, &group->device_list) {
> +		struct vfio_device *device;
> +
> +		device = list_entry(pos, struct vfio_device, device_next);
> +		__vfio_iommu_detach_dev(iommu, device);
> +	}
> +}
> +
> +static int __vfio_iommu_attach_dev(struct vfio_iommu *iommu,
> +				   struct vfio_device *device)
> +{
> +	int ret;
> +
> +	BUG_ON(device->attached);
> +
> +	if (!iommu || !iommu->domain)
> +		return -EINVAL;
> +
> +	ret = iommu_attach_device(iommu->domain, device->dev);
> +	if (!ret)
> +		device->attached = true;
> +
> +	return ret;
> +}
> +
> +static int __vfio_iommu_attach_group(struct vfio_iommu *iommu,
> +				     struct vfio_group *group)
> +{
> +	struct list_head *pos;
> +
> +	list_for_each(pos, &group->device_list) {
> +		struct vfio_device *device;
> +		int ret;
> +
> +		device = list_entry(pos, struct vfio_device, device_next);
> +		ret = __vfio_iommu_attach_dev(iommu, device);
> +		if (ret) {
> +			__vfio_iommu_detach_group(iommu, group);
> +			return ret;
> +		}
> +	}
> +	return 0;
> +}
> +
> +/* The iommu is viable, ie. ready to be configured, when all the devices
> + * for all the groups attached to the iommu are bound to their vfio device
> + * drivers (ex. vfio-pci).  This sets the device_data private data pointer. */
> +static bool __vfio_iommu_viable(struct vfio_iommu *iommu)
> +{
> +	struct list_head *gpos, *dpos;
> +
> +	list_for_each(gpos, &iommu->group_list) {
> +		struct vfio_group *group;
> +		group = list_entry(gpos, struct vfio_group, iommu_next);
> +
> +		list_for_each(dpos, &group->device_list) {
> +			struct vfio_device *device;
> +			device = list_entry(dpos,
> +					    struct vfio_device, device_next);
> +
> +			if (!device->device_data)
> +				return false;
> +		}
> +	}
> +	return true;
> +}
> +
> +static void __vfio_close_iommu(struct vfio_iommu *iommu)
> +{
> +	struct list_head *pos;
> +
> +	if (!iommu->domain)
> +		return;
> +
> +	list_for_each(pos, &iommu->group_list) {
> +		struct vfio_group *group;
> +		group = list_entry(pos, struct vfio_group, iommu_next);
> +
> +		__vfio_iommu_detach_group(iommu, group);
> +	}
> +
> +	vfio_iommu_unmapall(iommu);
> +
> +	iommu_domain_free(iommu->domain);
> +	iommu->domain = NULL;
> +	iommu->mm = NULL;
> +}
> +
> +/* Open the IOMMU.  This gates all access to the iommu or device file
> + * descriptors and sets current->mm as the exclusive user. */
> +static int __vfio_open_iommu(struct vfio_iommu *iommu)
> +{
> +	struct list_head *pos;
> +	int ret;
> +
> +	if (!__vfio_iommu_viable(iommu))
> +		return -EBUSY;
> +
> +	if (iommu->domain)
> +		return -EINVAL;
> +
> +	iommu->domain = iommu_domain_alloc(iommu->bus);
> +	if (!iommu->domain)
> +		return -EFAULT;
> +
> +	list_for_each(pos, &iommu->group_list) {
> +		struct vfio_group *group;
> +		group = list_entry(pos, struct vfio_group, iommu_next);
> +
> +		ret = __vfio_iommu_attach_group(iommu, group);
> +		if (ret) {
> +			__vfio_close_iommu(iommu);
> +			return ret;
> +		}
> +	}
> +
> +	if (!allow_unsafe_intrs &&
> +	    !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
> +		__vfio_close_iommu(iommu);
> +		return -EFAULT;
> +	}
> +
> +	iommu->cache = (iommu_domain_has_cap(iommu->domain,
> +					     IOMMU_CAP_CACHE_COHERENCY) != 0);
> +	iommu->mm = current->mm;
> +
> +	return 0;
> +}
> +
> +/* Actively try to tear down the iommu and merged groups.  If there are no
> + * open iommu or device fds, we close the iommu.  If we close the iommu and
> + * there are also no open group fds, we can further dissolve the group to
> + * iommu association and free the iommu data structure. */
> +static int __vfio_try_dissolve_iommu(struct vfio_iommu *iommu)
> +{
> +
> +	if (__vfio_iommu_inuse(iommu))
> +		return -EBUSY;
> +
> +	__vfio_close_iommu(iommu);
> +
> +	if (!__vfio_iommu_groups_inuse(iommu)) {
> +		struct list_head *pos, *ppos;
> +
> +		list_for_each_safe(pos, ppos, &iommu->group_list) {
> +			struct vfio_group *group;
> +
> +			group = list_entry(pos, struct vfio_group, iommu_next);
> +			__vfio_group_set_iommu(group, NULL);
> +		}
> +
> +
> +		kfree(iommu);
> +	}
> +
> +	return 0;
> +}
> +
> +static struct vfio_device *__vfio_lookup_dev(struct device *dev)
> +{
> +	struct list_head *gpos;
> +	unsigned int groupid;
> +
> +	if (iommu_device_group(dev, &groupid))
> +		return NULL;
> +
> +	list_for_each(gpos, &vfio.group_list) {
> +		struct vfio_group *group;
> +		struct list_head *dpos;
> +
> +		group = list_entry(gpos, struct vfio_group, group_next);
> +
> +		if (group->groupid != groupid)
> +			continue;
> +
> +		list_for_each(dpos, &group->device_list) {
> +			struct vfio_device *device;
> +
> +			device = list_entry(dpos,
> +					    struct vfio_device, device_next);
> +
> +			if (device->dev == dev)
> +				return device;
> +		}
> +	}
> +	return NULL;
> +}
> +
> +/* All release paths simply decrement the refcnt, attempt to teardown
> + * the iommu and merged groups, and wakeup anything that might be
> + * waiting if we successfully dissolve anything. */
> +static int vfio_do_release(int *refcnt, struct vfio_iommu *iommu)
> +{
> +	bool wake;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	(*refcnt)--;
> +	wake = (__vfio_try_dissolve_iommu(iommu) == 0);
> +
> +	mutex_unlock(&vfio.lock);
> +
> +	if (wake)
> +		wake_up(&vfio.release_q);
> +
> +	return 0;
> +}
> +
> +/*
> + * Device fops - passthrough to vfio device driver w/ device_data
> + */
> +static int vfio_device_release(struct inode *inode, struct file *filep)
> +{
> +	struct vfio_device *device = filep->private_data;
> +
> +	vfio_do_release(&device->refcnt, device->iommu);
> +
> +	device->ops->put(device->device_data);
> +
> +	return 0;
> +}
> +
> +static long vfio_device_unl_ioctl(struct file *filep,
> +				  unsigned int cmd, unsigned long arg)
> +{
> +	struct vfio_device *device = filep->private_data;
> +
> +	return device->ops->ioctl(device->device_data, cmd, arg);
> +}
> +
> +static ssize_t vfio_device_read(struct file *filep, char __user *buf,
> +				size_t count, loff_t *ppos)
> +{
> +	struct vfio_device *device = filep->private_data;
> +
> +	return device->ops->read(device->device_data, buf, count, ppos);
> +}
> +
> +static ssize_t vfio_device_write(struct file *filep, const char __user *buf,
> +				 size_t count, loff_t *ppos)
> +{
> +	struct vfio_device *device = filep->private_data;
> +
> +	return device->ops->write(device->device_data, buf, count, ppos);
> +}
> +
> +static int vfio_device_mmap(struct file *filep, struct vm_area_struct *vma)
> +{
> +	struct vfio_device *device = filep->private_data;
> +
> +	return device->ops->mmap(device->device_data, vma);
> +}
> +	
> +#ifdef CONFIG_COMPAT
> +static long vfio_device_compat_ioctl(struct file *filep,
> +				     unsigned int cmd, unsigned long arg)
> +{
> +	arg = (unsigned long)compat_ptr(arg);
> +	return vfio_device_unl_ioctl(filep, cmd, arg);
> +}
> +#endif	/* CONFIG_COMPAT */
> +
> +const struct file_operations vfio_device_fops = {
> +	.owner		= THIS_MODULE,
> +	.release	= vfio_device_release,
> +	.read		= vfio_device_read,
> +	.write		= vfio_device_write,
> +	.unlocked_ioctl	= vfio_device_unl_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= vfio_device_compat_ioctl,
> +#endif
> +	.mmap		= vfio_device_mmap,
> +};
> +
> +/*
> + * Group fops
> + */
> +static int vfio_group_open(struct inode *inode, struct file *filep)
> +{
> +	struct vfio_group *group;
> +	int ret = 0;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	group = idr_find(&vfio.idr, iminor(inode));
> +
> +	if (!group) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
> +
> +	filep->private_data = group;
> +
> +	if (!group->iommu) {
> +		struct vfio_iommu *iommu;
> +
> +		iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
> +		if (!iommu) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +		INIT_LIST_HEAD(&iommu->group_list);
> +		INIT_LIST_HEAD(&iommu->dm_list);
> +		mutex_init(&iommu->dgate);
> +		iommu->bus = group->bus;
> +		__vfio_group_set_iommu(group, iommu);
> +	}
> +	group->refcnt++;
> +
> +out:
> +	mutex_unlock(&vfio.lock);
> +
> +	return ret;
> +}
> +
> +static int vfio_group_release(struct inode *inode, struct file *filep)
> +{
> +	struct vfio_group *group = filep->private_data;
> +
> +	return vfio_do_release(&group->refcnt, group->iommu);
> +}
> +
> +/* Attempt to merge the group pointed to by fd into group.  The merge-ee
> + * group must not have an iommu or any devices open because we cannot
> + * maintain that context across the merge.  The merge-er group can be
> + * in use. */

Yeah, so merge-er group in use still has its problems, because it
could affect what the IOMMU is capable of.

> +static int vfio_group_merge(struct vfio_group *group, int fd)
> +{
> +	struct vfio_group *new;
> +	struct vfio_iommu *old_iommu;
> +	struct file *file;
> +	int ret = 0;
> +	bool opened = false;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	file = fget(fd);
> +	if (!file) {
> +		ret = -EBADF;
> +		goto out_noput;
> +	}
> +
> +	/* Sanity check, is this really our fd? */
> +	if (file->f_op != &vfio_group_fops) {

This should be a WARN_ON or BUG_ON rather than just an error return, surely.

> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	new = file->private_data;
> +
> +	if (!new || new == group || !new->iommu ||
> +	    new->iommu->domain || new->bus != group->bus) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	/* We need to attach all the devices to each domain separately
> +	 * in order to validate that the capabilities match for both.  */
> +	ret = __vfio_open_iommu(new->iommu);
> +	if (ret)
> +		goto out;
> +
> +	if (!group->iommu->domain) {
> +		ret = __vfio_open_iommu(group->iommu);
> +		if (ret)
> +			goto out;
> +		opened = true;
> +	}
> +
> +	/* If cache coherency doesn't match we'd potentially need to
> +	 * remap existing iommu mappings in the merge-er domain.
> +	 * Poor return to bother trying to allow this currently. */
> +	if (iommu_domain_has_cap(group->iommu->domain,
> +				 IOMMU_CAP_CACHE_COHERENCY) !=
> +	    iommu_domain_has_cap(new->iommu->domain,
> +				 IOMMU_CAP_CACHE_COHERENCY)) {
> +		__vfio_close_iommu(new->iommu);
> +		if (opened)
> +			__vfio_close_iommu(group->iommu);
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	/* Close the iommu for the merge-ee and attach all its devices
> +	 * to the merge-er iommu. */
> +	__vfio_close_iommu(new->iommu);
> +
> +	ret = __vfio_iommu_attach_group(group->iommu, new);
> +	if (ret)
> +		goto out;
> +
> +	/* set_iommu unlinks new from the iommu, so save a pointer to it */
> +	old_iommu = new->iommu;
> +	__vfio_group_set_iommu(new, group->iommu);
> +	kfree(old_iommu);
> +
> +out:
> +	fput(file);
> +out_noput:
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +
> +/* Unmerge the group pointed to by fd from group. */
> +static int vfio_group_unmerge(struct vfio_group *group, int fd)
> +{
> +	struct vfio_group *new;
> +	struct vfio_iommu *new_iommu;
> +	struct file *file;
> +	int ret = 0;
> +
> +	/* Since the merge-out group is already opened, it needs to
> +	 * have an iommu struct associated with it. */
> +	new_iommu = kzalloc(sizeof(*new_iommu), GFP_KERNEL);
> +	if (!new_iommu)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&new_iommu->group_list);
> +	INIT_LIST_HEAD(&new_iommu->dm_list);
> +	mutex_init(&new_iommu->dgate);
> +	new_iommu->bus = group->bus;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	file = fget(fd);
> +	if (!file) {
> +		ret = -EBADF;
> +		goto out_noput;
> +	}
> +
> +	/* Sanity check, is this really our fd? */
> +	if (file->f_op != &vfio_group_fops) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	new = file->private_data;
> +	if (!new || new == group || new->iommu != group->iommu) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	/* We can't merge-out a group with devices still in use. */
> +	if (__vfio_group_devs_inuse(new)) {
> +		ret = -EBUSY;
> +		goto out;
> +	}
> +
> +	__vfio_iommu_detach_group(group->iommu, new);
> +	__vfio_group_set_iommu(new, new_iommu);
> +
> +out:
> +	fput(file);
> +out_noput:
> +	if (ret)
> +		kfree(new_iommu);
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +
> +/* Get a new iommu file descriptor.  This will open the iommu, setting
> + * the current->mm ownership if it's not already set. */

I know I've had this explained to me several times before, but I've
forgotten again.  Why do we need to wire the iommu to an mm?

> +static int vfio_group_get_iommu_fd(struct vfio_group *group)
> +{
> +	int ret = 0;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	if (!group->iommu->domain) {
> +		ret = __vfio_open_iommu(group->iommu);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	ret = anon_inode_getfd("[vfio-iommu]", &vfio_iommu_fops,
> +			       group->iommu, O_RDWR);
> +	if (ret < 0)
> +		goto out;
> +
> +	group->iommu->refcnt++;
> +out:
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +
> +/* Get a new device file descriptor.  This will open the iommu, setting
> + * the current->mm ownership if it's not already set.  It's difficult to
> + * specify the requirements for matching a user supplied buffer to a
> + * device, so we use a vfio driver callback to test for a match.  For
> + * PCI, dev_name(dev) is unique, but other drivers may require including
> + * a parent device string. */

At some point we probably want an interface to enumerate the devices
too, but that can probably wait.
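
That could probably be as simple as something like the below on the
group fd (numbers and names invented):

#define VFIO_GROUP_GET_NUM_DEVICES	_IOR(';', 119, int)

struct vfio_device_name {
	__u32	len;		/* length of structure + buf */
	__u32	index;		/* 0 .. num_devices - 1 */
	char	buf[0];		/* filled with the GET_DEVICE_FD string */
};
#define VFIO_GROUP_GET_DEVICE_NAME	_IOWR(';', 120, struct vfio_device_name)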

> +static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
> +{
> +	struct vfio_iommu *iommu = group->iommu;
> +	struct list_head *gpos;
> +	int ret = -ENODEV;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	if (!iommu->domain) {
> +		ret = __vfio_open_iommu(iommu);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	list_for_each(gpos, &iommu->group_list) {
> +		struct list_head *dpos;
> +
> +		group = list_entry(gpos, struct vfio_group, iommu_next);
> +
> +		list_for_each(dpos, &group->device_list) {
> +			struct vfio_device *device;
> +
> +			device = list_entry(dpos,
> +					    struct vfio_device, device_next);
> +
> +			if (device->ops->match(device->dev, buf)) {
> +				struct file *file;
> +
> +				if (device->ops->get(device->device_data)) {
> +					ret = -EFAULT;
> +					goto out;
> +				}
> +
> +				/* We can't use anon_inode_getfd(), like above
> +				 * because we need to modify the f_mode flags
> +				 * directly to allow more than just ioctls */
> +				ret = get_unused_fd();
> +				if (ret < 0) {
> +					device->ops->put(device->device_data);
> +					goto out;
> +				}
> +
> +				file = anon_inode_getfile("[vfio-device]",
> +							  &vfio_device_fops,
> +							  device, O_RDWR);
> +				if (IS_ERR(file)) {
> +					put_unused_fd(ret);
> +					ret = PTR_ERR(file);
> +					device->ops->put(device->device_data);
> +					goto out;
> +				}
> +
> +				/* Todo: add an anon_inode interface to do
> +				 * this.  Appears to be missing by lack of
> +				 * need rather than explicitly prevented.
> +				 * Now there's need. */
> +				file->f_mode |= (FMODE_LSEEK |
> +						 FMODE_PREAD |
> +						 FMODE_PWRITE);
> +
> +				fd_install(ret, file);
> +
> +				device->refcnt++;
> +				goto out;
> +			}
> +		}
> +	}
> +out:
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +
> +static long vfio_group_unl_ioctl(struct file *filep,
> +				 unsigned int cmd, unsigned long arg)
> +{
> +	struct vfio_group *group = filep->private_data;
> +
> +	if (cmd == VFIO_GROUP_GET_FLAGS) {
> +		u64 flags = 0;
> +
> +		mutex_lock(&vfio.lock);
> +		if (__vfio_iommu_viable(group->iommu))
> +			flags |= VFIO_GROUP_FLAGS_VIABLE;
> +		mutex_unlock(&vfio.lock);
> +
> +		if (group->iommu->mm)
> +			flags |= VFIO_GROUP_FLAGS_MM_LOCKED;
> +
> +		return put_user(flags, (u64 __user *)arg);
> +	}
> +		
> +	/* Below commands are restricted once the mm is set */
> +	if (group->iommu->mm && group->iommu->mm != current->mm)
> +		return -EPERM;
> +
> +	if (cmd == VFIO_GROUP_MERGE || cmd == VFIO_GROUP_UNMERGE) {
> +		int fd;
> +		
> +		if (get_user(fd, (int __user *)arg))
> +			return -EFAULT;
> +		if (fd < 0)
> +			return -EINVAL;
> +
> +		if (cmd == VFIO_GROUP_MERGE)
> +			return vfio_group_merge(group, fd);
> +		else
> +			return vfio_group_unmerge(group, fd);
> +	} else if (cmd == VFIO_GROUP_GET_IOMMU_FD) {
> +		return vfio_group_get_iommu_fd(group);
> +	} else if (cmd == VFIO_GROUP_GET_DEVICE_FD) {
> +		char *buf;
> +		int ret;
> +
> +		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
> +		if (IS_ERR(buf))
> +			return PTR_ERR(buf);
> +
> +		ret = vfio_group_get_device_fd(group, buf);
> +		kfree(buf);
> +		return ret;
> +	}
> +
> +	return -ENOSYS;
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static long vfio_group_compat_ioctl(struct file *filep,
> +				    unsigned int cmd, unsigned long arg)
> +{
> +	arg = (unsigned long)compat_ptr(arg);
> +	return vfio_group_unl_ioctl(filep, cmd, arg);
> +}
> +#endif	/* CONFIG_COMPAT */
> +
> +static const struct file_operations vfio_group_fops = {
> +	.owner		= THIS_MODULE,
> +	.open		= vfio_group_open,
> +	.release	= vfio_group_release,
> +	.unlocked_ioctl	= vfio_group_unl_ioctl,
> +#ifdef CONFIG_COMPAT
> +	.compat_ioctl	= vfio_group_compat_ioctl,
> +#endif
> +};
> +
> +/* iommu fd release hook */
> +int vfio_release_iommu(struct vfio_iommu *iommu)
> +{
> +	return vfio_do_release(&iommu->refcnt, iommu);
> +}
> +
> +/*
> + * VFIO driver API
> + */
> +
> +/* Add a new device to the vfio framework with associated vfio driver
> + * callbacks.  This is the entry point for vfio drivers to register devices. */
> +int vfio_group_add_dev(struct device *dev, const struct vfio_device_ops *ops)
> +{
> +	struct list_head *pos;
> +	struct vfio_group *group = NULL;
> +	struct vfio_device *device = NULL;
> +	unsigned int groupid;
> +	int ret = 0;
> +	bool new_group = false;
> +
> +	if (!ops)
> +		return -EINVAL;
> +
> +	if (iommu_device_group(dev, &groupid))
> +		return -ENODEV;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	list_for_each(pos, &vfio.group_list) {
> +		group = list_entry(pos, struct vfio_group, group_next);
> +		if (group->groupid == groupid)
> +			break;
> +		group = NULL;
> +	}
> +
> +	if (!group) {
> +		int minor;
> +
> +		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		group = kzalloc(sizeof(*group), GFP_KERNEL);
> +		if (!group) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		group->groupid = groupid;
> +		INIT_LIST_HEAD(&group->device_list);
> +
> +		ret = idr_get_new(&vfio.idr, group, &minor);
> +		if (ret == 0 && minor > MINORMASK) {
> +			idr_remove(&vfio.idr, minor);
> +			kfree(group);
> +			ret = -ENOSPC;
> +			goto out;
> +		}
> +
> +		group->devt = MKDEV(MAJOR(vfio.devt), minor);
> +		device_create(vfio.class, NULL, group->devt,
> +			      group, "%u", groupid);
> +
> +		group->bus = dev->bus;
> +		list_add(&group->group_next, &vfio.group_list);
> +		new_group = true;
> +	} else {
> +		if (group->bus != dev->bus) {
> +			printk(KERN_WARNING
> +			       "Error: IOMMU group ID conflict.  Group ID %u "
> +				"on both bus %s and %s\n", groupid,
> +				group->bus->name, dev->bus->name);
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +
> +		list_for_each(pos, &group->device_list) {
> +			device = list_entry(pos,
> +					    struct vfio_device, device_next);
> +			if (device->dev == dev)
> +				break;
> +			device = NULL;
> +		}
> +	}
> +
> +	if (!device) {
> +		if (__vfio_group_devs_inuse(group) ||
> +		    (group->iommu && group->iommu->refcnt)) {
> +			printk(KERN_WARNING
> +			       "Adding device %s to group %u while group is already in use!!\n",
> +			       dev_name(dev), group->groupid);
> +			/* XXX How to prevent other drivers from claiming? */
> +		}
> +
> +		device = kzalloc(sizeof(*device), GFP_KERNEL);
> +		if (!device) {
> +			/* If we just created this group, tear it down */
> +			if (new_group) {
> +				list_del(&group->group_next);
> +				device_destroy(vfio.class, group->devt);
> +				idr_remove(&vfio.idr, MINOR(group->devt));
> +				kfree(group);
> +			}
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +
> +		list_add(&device->device_next, &group->device_list);
> +		device->dev = dev;
> +		device->ops = ops;
> +		device->iommu = group->iommu; /* NULL if new */
> +		__vfio_iommu_attach_dev(group->iommu, device);
> +	}
> +out:
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(vfio_group_add_dev);
> +
> +/* Remove a device from the vfio framework */
> +void vfio_group_del_dev(struct device *dev)
> +{
> +	struct list_head *pos;
> +	struct vfio_group *group = NULL;
> +	struct vfio_device *device = NULL;
> +	unsigned int groupid;
> +
> +	if (iommu_device_group(dev, &groupid))
> +		return;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	list_for_each(pos, &vfio.group_list) {
> +		group = list_entry(pos, struct vfio_group, group_next);
> +		if (group->groupid == groupid)
> +			break;
> +		group = NULL;
> +	}
> +
> +	if (!group)
> +		goto out;
> +
> +	list_for_each(pos, &group->device_list) {
> +		device = list_entry(pos, struct vfio_device, device_next);
> +		if (device->dev == dev)
> +			break;
> +		device = NULL;
> +	}
> +
> +	if (!device)
> +		goto out;
> +
> +	BUG_ON(device->refcnt);
> +
> +	if (device->attached)
> +		__vfio_iommu_detach_dev(group->iommu, device);
> +
> +	list_del(&device->device_next);
> +	kfree(device);
> +
> +	/* If this was the only device in the group, remove the group.
> +	 * Note that we intentionally unmerge empty groups here if the
> +	 * group fd isn't opened. */
> +	if (list_empty(&group->device_list) && group->refcnt == 0) {
> +		struct vfio_iommu *iommu = group->iommu;
> +
> +		if (iommu) {
> +			__vfio_group_set_iommu(group, NULL);
> +			__vfio_try_dissolve_iommu(iommu);
> +		}
> +
> +		device_destroy(vfio.class, group->devt);
> +		idr_remove(&vfio.idr, MINOR(group->devt));
> +		list_del(&group->group_next);
> +		kfree(group);
> +	}
> +out:
> +	mutex_unlock(&vfio.lock);
> +}
> +EXPORT_SYMBOL_GPL(vfio_group_del_dev);
> +
> +/* When a device is bound to a vfio device driver (ex. vfio-pci), this
> + * entry point is used to mark the device usable (viable).  The vfio
> + * device driver associates a private device_data struct with the device
> + * here, which will later be returned for vfio_device_fops callbacks. */
> +int vfio_bind_dev(struct device *dev, void *device_data)
> +{
> +	struct vfio_device *device;
> +	int ret = -EINVAL;
> +
> +	BUG_ON(!device_data);
> +
> +	mutex_lock(&vfio.lock);
> +
> +	device = __vfio_lookup_dev(dev);
> +
> +	BUG_ON(!device);
> +
> +	ret = dev_set_drvdata(dev, device);
> +	if (!ret)
> +		device->device_data = device_data;
> +
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(vfio_bind_dev);
> +
> +/* A device is only removeable if the iommu for the group is not in use. */
> +static bool vfio_device_removeable(struct vfio_device *device)
> +{
> +	bool ret = true;
> +
> +	mutex_lock(&vfio.lock);
> +
> +	if (device->iommu && __vfio_iommu_inuse(device->iommu))
> +		ret = false;
> +
> +	mutex_unlock(&vfio.lock);
> +	return ret;
> +}
> +
> +/* Notify vfio that a device is being unbound from the vfio device driver
> + * and return the device private device_data pointer.  If the group is
> + * in use, we need to block or take other measures to make it safe for
> + * the device to be removed from the iommu. */
> +void *vfio_unbind_dev(struct device *dev)
> +{
> +	struct vfio_device *device = dev_get_drvdata(dev);
> +	void *device_data;
> +
> +	BUG_ON(!device);
> +
> +again:
> +	if (!vfio_device_removeable(device)) {
> +		/* XXX signal for all devices in group to be removed or
> +		 * resort to killing the process holding the device fds.
> +		 * For now just block waiting for releases to wake us. */
> +		wait_event(vfio.release_q, vfio_device_removeable(device));
> +	}
> +
> +	mutex_lock(&vfio.lock);
> +
> +	/* Need to re-check that the device is still removeable under lock. */
> +	if (device->iommu && __vfio_iommu_inuse(device->iommu)) {
> +		mutex_unlock(&vfio.lock);
> +		goto again;
> +	}
> +
> +	device_data = device->device_data;
> +
> +	device->device_data = NULL;
> +	dev_set_drvdata(dev, NULL);
> +
> +	mutex_unlock(&vfio.lock);
> +	return device_data;
> +}
> +EXPORT_SYMBOL_GPL(vfio_unbind_dev);
> +
> +/*
> + * Module/class support
> + */
> +static void vfio_class_release(struct kref *kref)
> +{
> +	class_destroy(vfio.class);
> +	vfio.class = NULL;
> +}
> +
> +static char *vfio_devnode(struct device *dev, mode_t *mode)
> +{
> +	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
> +}
> +
> +static int __init vfio_init(void)
> +{
> +	int ret;
> +
> +	idr_init(&vfio.idr);
> +	mutex_init(&vfio.lock);
> +	INIT_LIST_HEAD(&vfio.group_list);
> +	init_waitqueue_head(&vfio.release_q);
> +
> +	kref_init(&vfio.kref);
> +	vfio.class = class_create(THIS_MODULE, "vfio");
> +	if (IS_ERR(vfio.class)) {
> +		ret = PTR_ERR(vfio.class);
> +		goto err_class;
> +	}
> +
> +	vfio.class->devnode = vfio_devnode;
> +
> +	/* FIXME - how many minors to allocate... all of them! */
> +	ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
> +	if (ret)
> +		goto err_chrdev;
> +
> +	cdev_init(&vfio.cdev, &vfio_group_fops);
> +	ret = cdev_add(&vfio.cdev, vfio.devt, MINORMASK);
> +	if (ret)
> +		goto err_cdev;
> +
> +	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
> +
> +	return 0;
> +
> +err_cdev:
> +	unregister_chrdev_region(vfio.devt, MINORMASK);
> +err_chrdev:
> +	kref_put(&vfio.kref, vfio_class_release);
> +err_class:
> +	return ret;
> +}
> +
> +static void __exit vfio_cleanup(void)
> +{
> +	struct list_head *gpos, *gppos;
> +
> +	list_for_each_safe(gpos, gppos, &vfio.group_list) {
> +		struct vfio_group *group;
> +		struct list_head *dpos, *dppos;
> +
> +		group = list_entry(gpos, struct vfio_group, group_next);
> +
> +		list_for_each_safe(dpos, dppos, &group->device_list) {
> +			struct vfio_device *device;
> +
> +			device = list_entry(dpos,
> +					    struct vfio_device, device_next);
> +			vfio_group_del_dev(device->dev);
> +		}
> +	}
> +
> +	idr_destroy(&vfio.idr);
> +	cdev_del(&vfio.cdev);
> +	unregister_chrdev_region(vfio.devt, MINORMASK);
> +	kref_put(&vfio.kref, vfio_class_release);
> +}
> +
> +module_init(vfio_init);
> +module_exit(vfio_cleanup);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/drivers/vfio/vfio_private.h b/drivers/vfio/vfio_private.h
> new file mode 100644
> index 0000000..350ad67
> --- /dev/null
> +++ b/drivers/vfio/vfio_private.h
> @@ -0,0 +1,34 @@
> +/*
> + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> + *     Author: Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * Derived from original vfio:
> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> + * Author: Tom Lyon, pugs@cisco.com
> + */
> +
> +#include <linux/list.h>
> +#include <linux/mutex.h>
> +
> +#ifndef VFIO_PRIVATE_H
> +#define VFIO_PRIVATE_H
> +
> +struct vfio_iommu {
> +	struct iommu_domain		*domain;
> +	struct bus_type			*bus;
> +	struct mutex			dgate;
> +	struct list_head		dm_list;
> +	struct mm_struct		*mm;
> +	struct list_head		group_list;
> +	int				refcnt;
> +	bool				cache;
> +};
> +
> +extern int vfio_release_iommu(struct vfio_iommu *iommu);
> +extern void vfio_iommu_unmapall(struct vfio_iommu *iommu);
> +
> +#endif /* VFIO_PRIVATE_H */
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> new file mode 100644
> index 0000000..4269b08
> --- /dev/null
> +++ b/include/linux/vfio.h
> @@ -0,0 +1,155 @@
> +/*
> + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> + * Author: Tom Lyon, pugs@cisco.com
> + *
> + * This program is free software; you may redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; version 2 of the License.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + *
> + * Portions derived from drivers/uio/uio.c:
> + * Copyright(C) 2005, Benedikt Spranger <b.spranger@linutronix.de>
> + * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
> + * Copyright(C) 2006, Hans J. Koch <hjk@linutronix.de>
> + * Copyright(C) 2006, Greg Kroah-Hartman <greg@kroah.com>
> + *
> + * Portions derived from drivers/uio/uio_pci_generic.c:
> + * Copyright (C) 2009 Red Hat, Inc.
> + * Author: Michael S. Tsirkin <mst@redhat.com>
> + */
> +#include <linux/types.h>
> +
> +#ifndef VFIO_H
> +#define VFIO_H
> +
> +#ifdef __KERNEL__
> +
> +struct vfio_device_ops {
> +	bool			(*match)(struct device *, char *);
> +	int			(*get)(void *);
> +	void			(*put)(void *);
> +	ssize_t			(*read)(void *, char __user *,
> +					size_t, loff_t *);
> +	ssize_t			(*write)(void *, const char __user *,
> +					 size_t, loff_t *);
> +	long			(*ioctl)(void *, unsigned int, unsigned long);
> +	int			(*mmap)(void *, struct vm_area_struct *);
> +};
> +
> +extern int vfio_group_add_dev(struct device *device,
> +			      const struct vfio_device_ops *ops);
> +extern void vfio_group_del_dev(struct device *device);
> +extern int vfio_bind_dev(struct device *device, void *device_data);
> +extern void *vfio_unbind_dev(struct device *device);
> +
> +#endif /* __KERNEL__ */
> +
> +/*
> + * VFIO driver - allow mapping and use of certain devices
> + * in unprivileged user processes. (If IOMMU is present)
> + * Especially useful for Virtual Function parts of SR-IOV devices
> + */
> +
> +
> +/* Kernel & User level defines for ioctls */
> +
> +#define VFIO_GROUP_GET_FLAGS		_IOR(';', 100, __u64)
> + #define VFIO_GROUP_FLAGS_VIABLE	(1 << 0)
> + #define VFIO_GROUP_FLAGS_MM_LOCKED	(1 << 1)
> +#define VFIO_GROUP_MERGE		_IOW(';', 101, int)
> +#define VFIO_GROUP_UNMERGE		_IOW(';', 102, int)
> +#define VFIO_GROUP_GET_IOMMU_FD		_IO(';', 103)
> +#define VFIO_GROUP_GET_DEVICE_FD	_IOW(';', 104, char *)
> +
> +/*
> + * Structure for DMA mapping of user buffers
> + * vaddr, dmaaddr, and size must all be page aligned
> + */
> +struct vfio_dma_map {
> +	__u64	len;		/* length of structure */
> +	__u64	vaddr;		/* process virtual addr */
> +	__u64	dmaaddr;	/* desired and/or returned dma address */
> +	__u64	size;		/* size in bytes */
> +	__u64	flags;
> +#define	VFIO_DMA_MAP_FLAG_WRITE		(1 << 0) /* req writeable DMA mem */
> +};
> +
> +#define	VFIO_IOMMU_GET_FLAGS		_IOR(';', 105, __u64)
> + /* Does the IOMMU support mapping any IOVA to any virtual address? */
> + #define VFIO_IOMMU_FLAGS_MAP_ANY	(1 << 0)
> +#define	VFIO_IOMMU_MAP_DMA		_IOWR(';', 106, struct vfio_dma_map)
> +#define	VFIO_IOMMU_UNMAP_DMA		_IOWR(';', 107, struct vfio_dma_map)
> +
> +#define VFIO_DEVICE_GET_FLAGS		_IOR(';', 108, __u64)
> + #define VFIO_DEVICE_FLAGS_PCI		(1 << 0)
> + #define VFIO_DEVICE_FLAGS_DT		(1 << 1)
> + #define VFIO_DEVICE_FLAGS_RESET	(1 << 2)
> +#define VFIO_DEVICE_GET_NUM_REGIONS	_IOR(';', 109, int)
> +
> +struct vfio_region_info {
> +	__u32	len;		/* length of structure */
> +	__u32	index;		/* region number */
> +	__u64	size;		/* size in bytes of region */
> +	__u64	offset;		/* start offset of region */
> +	__u64	flags;
> +#define VFIO_REGION_INFO_FLAG_MMAP		(1 << 0)
> +#define VFIO_REGION_INFO_FLAG_RO		(1 << 1)
> +#define VFIO_REGION_INFO_FLAG_PHYS_VALID	(1 << 2)
> +	__u64	phys;		/* physical address of region */
> +};
> +
> +#define VFIO_DEVICE_GET_REGION_INFO	_IOWR(';', 110, struct vfio_region_info)
> +
> +#define VFIO_DEVICE_GET_NUM_IRQS	_IOR(';', 111, int)
> +
> +struct vfio_irq_info {
> +	__u32	len;		/* length of structure */
> +	__u32	index;		/* IRQ number */
> +	__u32	count;		/* number of individual IRQs */
> +	__u32	flags;
> +#define VFIO_IRQ_INFO_FLAG_LEVEL		(1 << 0)
> +};
> +
> +#define VFIO_DEVICE_GET_IRQ_INFO	_IOWR(';', 112, struct vfio_irq_info)
> +
> +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
> +#define VFIO_DEVICE_SET_IRQ_EVENTFDS	_IOW(';', 113, int)
> +
> +/* Unmask IRQ index, arg[0] = index */
> +#define VFIO_DEVICE_UNMASK_IRQ		_IOW(';', 114, int)
> +
> +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
> +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD	_IOW(';', 115, int)
> +
> +#define VFIO_DEVICE_RESET		_IO(';', 116)
> +
> +struct vfio_dtpath {
> +	__u32	len;		/* length of structure */
> +	__u32	index;
> +	__u64	flags;
> +#define VFIO_DTPATH_FLAGS_REGION	(1 << 0)
> +#define VFIO_DTPATH_FLAGS_IRQ		(1 << 1)
> +	char	*path;
> +};
> +#define VFIO_DEVICE_GET_DTPATH		_IOWR(';', 117, struct vfio_dtpath)
> +
> +struct vfio_dtindex {
> +	__u32	len;		/* length of structure */
> +	__u32	index;
> +	__u32	prop_type;
> +	__u32	prop_index;
> +	__u64	flags;
> +#define VFIO_DTINDEX_FLAGS_REGION	(1 << 0)
> +#define VFIO_DTINDEX_FLAGS_IRQ		(1 << 1)
> +};
> +#define VFIO_DEVICE_GET_DTINDEX		_IOWR(';', 118, struct vfio_dtindex)
> +
> +#endif /* VFIO_H */
>
Alex Williamson Nov. 15, 2011, 6:01 p.m. UTC | #23
On Tue, 2011-11-15 at 17:34 +1100, David Gibson wrote:
> On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
> > diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> > new file mode 100644
> > index 0000000..5866896
> > --- /dev/null
> > +++ b/Documentation/vfio.txt
> > @@ -0,0 +1,304 @@
> > +VFIO - "Virtual Function I/O"[1]
> > +-------------------------------------------------------------------------------
> > +Many modern systems now provide DMA and interrupt remapping facilities
> > +to help ensure I/O devices behave within the boundaries they've been
> > +allotted.  This includes x86 hardware with AMD-Vi and Intel VT-d as
> > +well as POWER systems with Partitionable Endpoints (PEs) and even
> > +embedded powerpc systems (technology name unknown).  The VFIO driver
> > +is an IOMMU/device agnostic framework for exposing direct device
> > +access to userspace, in a secure, IOMMU protected environment.  In
> > +other words, this allows safe, non-privileged, userspace drivers.
> 
> It's perhaps worth emphasisng that "safe" depends on the hardware
> being sufficiently well behaved.  BenH, I know, thinks there are a
> *lot* of cards that, e.g. have debug registers that allow a backdoor
> to their own config space via MMIO, which would bypass vfio's
> filtering of config space access.  And that's before we even get into
> the varying degrees of completeness in the isolation provided by
> different IOMMUs.

Fair enough.  I know Tom had emphasized "well behaved" in the original
doc.  Virtual functions are probably the best indicator of well behaved
devices.

> > +Why do we want that?  Virtual machines often make use of direct device
> > +access ("device assignment") when configured for the highest possible
> > +I/O performance.  From a device and host perspective, this simply turns
> > +the VM into a userspace driver, with the benefits of significantly
> > +reduced latency, higher bandwidth, and direct use of bare-metal device
> > +drivers[2].
> > +
> > +Some applications, particularly in the high performance computing
> > +field, also benefit from low-overhead, direct device access from
> > +userspace.  Examples include network adapters (often non-TCP/IP based)
> > +and compute accelerators.  Previous to VFIO, these drivers needed to
> 
> s/Previous/Prior/  although that may be a .us vs .au usage thing.

Same difference, AFAICT.

> > +go through the full development cycle to become proper upstream driver,
> > +be maintained out of tree, or make use of the UIO framework, which
> > +has no notion of IOMMU protection, limited interrupt support, and
> > +requires root privileges to access things like PCI configuration space.
> > +
> > +The VFIO driver framework intends to unify these, replacing both the
> > +KVM PCI specific device assignment currently used as well as provide
> > +a more secure, more featureful userspace driver environment than UIO.
> > +
> > +Groups, Devices, IOMMUs, oh my
> > +-------------------------------------------------------------------------------
> > +
> > +A fundamental component of VFIO is the notion of IOMMU groups.  IOMMUs
> > +can't always distinguish transactions from each individual device in
> > +the system.  Sometimes this is because of the IOMMU design, such as with
> > +PEs, other times it's caused by the I/O topology, for instance a
> > +PCIe-to-PCI bridge masking all devices behind it.  We call the sets of
> > +devices created by these restictions IOMMU groups (or just "groups" for
> > +this document).
> > +
> > +The IOMMU cannot distiguish transactions between the individual devices
> > +within the group, therefore the group is the basic unit of ownership for
> > +a userspace process.  Because of this, groups are also the primary
> > +interface to both devices and IOMMU domains in VFIO.
> > +
> > +The VFIO representation of groups is created as devices are added into
> > +the framework by a VFIO bus driver.  The vfio-pci module is an example
> > +of a bus driver.  This module registers devices along with a set of bus
> > +specific callbacks with the VFIO core.  These callbacks provide the
> > +interfaces later used for device access.  As each new group is created,
> > +as determined by iommu_device_group(), VFIO creates a /dev/vfio/$GROUP
> > +character device.
> 
> Ok.. so, the fact that it's called "vfio-pci" suggests that the VFIO
> bus driver is per bus type, not per bus instance.   But grouping
> constraints could be per bus instance, if you have a couple of
> different models of PCI host bridge with IOMMUs of different
> capabilities built in, for example.

Yes, vfio-pci manages devices on the pci_bus_type; per type, not per bus
instance.  IOMMUs also register drivers per bus type, not per bus
instance.  The IOMMU driver is free to impose any constraints it wants.

> > +In addition to the device enumeration and callbacks, the VFIO bus driver
> > +also provides a traditional device driver and is able to bind to devices
> > +on it's bus.  When a device is bound to the bus driver it's available to
> > +VFIO.  When all the devices within a group are bound to their bus drivers,
> > +the group becomes "viable" and a user with sufficient access to the VFIO
> > +group chardev can obtain exclusive access to the set of group devices.
> > +
> > +As documented in linux/vfio.h, several ioctls are provided on the
> > +group chardev:
> > +
> > +#define VFIO_GROUP_GET_FLAGS            _IOR(';', 100, __u64)
> > + #define VFIO_GROUP_FLAGS_VIABLE        (1 << 0)
> > + #define VFIO_GROUP_FLAGS_MM_LOCKED     (1 << 1)
> > +#define VFIO_GROUP_MERGE                _IOW(';', 101, int)
> > +#define VFIO_GROUP_UNMERGE              _IOW(';', 102, int)
> > +#define VFIO_GROUP_GET_IOMMU_FD         _IO(';', 103)
> > +#define VFIO_GROUP_GET_DEVICE_FD        _IOW(';', 104, char *)
> > +
> > +The last two ioctls return new file descriptors for accessing
> > +individual devices within the group and programming the IOMMU.  Each of
> > +these new file descriptors provide their own set of file interfaces.
> > +These ioctls will fail if any of the devices within the group are not
> > +bound to their VFIO bus driver.  Additionally, when either of these
> > +interfaces are used, the group is then bound to the struct_mm of the
> > +caller.  The GET_FLAGS ioctl can be used to view the state of the group.
> > +
> > +When either the GET_IOMMU_FD or GET_DEVICE_FD ioctls are invoked, a
> > +new IOMMU domain is created and all of the devices in the group are
> > +attached to it.  This is the only way to ensure full IOMMU isolation
> > +of the group, but potentially wastes resources and cycles if the user
> > +intends to manage multiple groups with the same set of IOMMU mappings.
> > +VFIO therefore provides a group MERGE and UNMERGE interface, which
> > +allows multiple groups to share an IOMMU domain.  Not all IOMMUs allow
> > +arbitrary groups to be merged, so the user should assume merging is
> > +opportunistic.
> 
> I do not think "opportunistic" means what you think it means..
> 
> >  A new group, with no open device or IOMMU file
> > +descriptors, can be merged into an existing, in-use, group using the
> > +MERGE ioctl.  A merged group can be unmerged using the UNMERGE ioctl
> > +once all of the device file descriptors for the group being merged
> > +"out" are closed.
> > +
> > +When groups are merged, the GET_IOMMU_FD and GET_DEVICE_FD ioctls are
> > +essentially fungible between group file descriptors (ie. if device
> > A
> 
> IDNT "fungible" MWYTIM, either.

Hmm, feel free to suggest.  Maybe we're hitting .us vs .au connotation.

> > +is in group X, and X is merged with Y, a file descriptor for A can be
> > +retrieved using GET_DEVICE_FD on Y.  Likewise, GET_IOMMU_FD returns a
> > +file descriptor referencing the same internal IOMMU object from either
> > +X or Y).  Merged groups can be dissolved either explictly with UNMERGE
> > +or automatically when ALL file descriptors for the merged group are
> > +closed (all IOMMUs, all devices, all groups).
> 
> Blech.  I'm really not liking this merge/unmerge API as it stands,
> it's horribly confusing.  At the very least, we need some better
> terminology.  We need some term for the metagroups; supergroups; iommu
> domains or-at-least-they-will-be-once-we-open-the-iommu or
> whathaveyous.
> 
> The first confusing thing about this interface is that each open group
> handle actually refers to two different things; the original group you
> opened and the metagroup it's a part of.  For the GET_IOMMU_FD and
> GET_DEVICE_FD operations, you're using the metagroup and two "merged"
> group handles are interchangeable.

Fungible, even ;)

> For other MERGE and especially
> UNMERGE operations, it matters which is the original group.

If I stick two LEGO blocks together, I need to identify the individual
block I want to remove to pull them back apart...

> The semantics of "merge" and "unmerge" under those names are really
> non-obvious.  Merge kind of has to merge two whole metagroups, but
> it's unclear if unmerge reverses one merge, or just takes out one
> (atom) group.  These operations need better names, at least.

Christian suggested a change to UNMERGE so that we do not need to
specify a group to unmerge "from".  This makes it more like a list
implementation, except there's no defined list_head.  Any member of the
list can pull in a new entry.  Calling UNMERGE on any member extracts
that member.
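
A rough userspace sketch of merge/unmerge, for illustration only (the
int-by-pointer calling convention is assumed from the _IOW definitions;
this is not code from the patch):

/* Merge group B into group A, use the shared context, then pull B back
 * out.  Under the suggested semantics, B could instead call UNMERGE on
 * itself rather than being named through A. */
static int merge_then_unmerge(int grp_a_fd, int grp_b_fd)
{
	/* B must have no open device or IOMMU fds at this point */
	if (ioctl(grp_a_fd, VFIO_GROUP_MERGE, &grp_b_fd) < 0)
		return -1;

	/* ... GET_IOMMU_FD / GET_DEVICE_FD work on either group fd ... */

	/* any device fds obtained for B's devices must be closed first */
	return ioctl(grp_a_fd, VFIO_GROUP_UNMERGE, &grp_b_fd);
}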

> Then it's unclear what order you can do various operations, and which
> order you can open and close various things.  You can kind of figure
> it out but it takes far more thinking than it should.
> 
> 
> So at the _very_ least, we need to invent new terminology and find a
> much better way of describing this API's semantics.  I still think an
> entirely different interface, where metagroups are created from
> outside with a lifetime that's not tied to an fd would be a better
> idea.

As we've discussed previously, configfs provides part of this, but has
no ioctl support.  It doesn't make sense to me to go play with groups in
configfs, but then still interact with them via a char dev.  It also
splits the ownership model and makes it harder to enforce who gets to
interact with the devices vs who gets to manipulate groups.  The current
model really isn't that complicated, imho.  As always, feel free to
suggest specific models.  If you have a specific terminology other than
MERGE, please suggest.

> Now, you specify that you can't use a group as the second argument of
> a merge if it already has an open iommu, but it's not clear from the
> doc if you can merge things into a group with an open iommu.

From above:

        A new group, with no open device or IOMMU file descriptors, can
        be merged into an existing, in-use, group using the MERGE ioctl.
                                 ^^^^^^

> Banning
> this would make life simpler, because the IOMMU's effective
> capabilities may change if you add more devices to the domain.  That's
> yet another non-obvious constraint in the interface ordering, though.

Banning this would prevent using merged groups with hotplug, which I
consider to be a primary use case.

> > +The IOMMU file descriptor provides this set of ioctls:
> > +
> > +#define VFIO_IOMMU_GET_FLAGS            _IOR(';', 105, __u64)
> > + #define VFIO_IOMMU_FLAGS_MAP_ANY       (1 << 0)
> > +#define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
> > +#define VFIO_IOMMU_UNMAP_DMA            _IOWR(';', 107, struct vfio_dma_map)
> > +
> > +The GET_FLAGS ioctl returns basic information about the IOMMU domain.
> > +We currently only support IOMMU domains that are able to map any
> > +virtual address to any IOVA.  This is indicated by the MAP_ANY
> > flag.
> 
> So.  I tend to think of an IOMMU mapping IOVAs to memory pages, rather
> than memory pages to IOVAs.  

I do too, not sure why I wrote it that way, will fix.

> The IOMMU itself, of course maps to
> physical addresses, and the meaning of "virtual address" in this
> context is not really clear.  I think you would be better off saying
> the IOMMU can map any IOVA to any memory page.  From a hardware POV
> that means any physical address, but of course for a VFIO user a page
> is specified by its process virtual address.

Will fix.

> I think we need to pin exactly what "MAP_ANY" means down better.  Now,
> VFIO is pretty much a lost cause if you can't map any normal process
> memory page into the IOMMU, so I think the only thing that is really
> covered is IOVAs.  But saying "can map any IOVA" is not clear, because
> if you can't map it, it's not a (valid) IOVA.  Better to say that
> IOVAs can be any 64-bit value, which I think is what you really mean
> here.

ok

> Of course, since POWER is a platform where this is *not* true, I'd
> prefer to have something giving the range of valid IOVAs in the core
> to start with.

Since iommu_ops does not yet have any concept of this (nudge, nudge), I
figured this would be added later.  A possible implementation would be
that such an iommu would not set MAP_ANY, would add a new flag for
MAP_RANGE, and provide a new VFIO_IOMMU_GET_RANGE_INFO ioctl to describe
it.  I'm guaranteed to get it wrong if I try to predict all your needs.
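
Just as a strawman and not a proposal -- the structure and field names
below are invented for illustration:

/* A range-restricted IOMMU would leave MAP_ANY clear and set a new flag: */
#define VFIO_IOMMU_FLAGS_MAP_RANGE	(1 << 1)

/* ...and describe its usable IOVA window via the new GET_RANGE_INFO ioctl: */
struct vfio_iommu_range_info {
	__u64	len;		/* length of structure */
	__u64	iova_start;	/* first usable IOVA */
	__u64	iova_size;	/* size of the usable IOVA window */
	__u64	flags;
};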

> > +
> > +The (UN)MAP_DMA commands make use of struct vfio_dma_map for mapping
> > +and unmapping IOVAs to process virtual addresses:
> > +
> > +struct vfio_dma_map {
> > +        __u64   len;            /* length of structure */
> 
> Thanks for adding these structure length fields.  But I think they
> should be called something other than 'len', which is likely to be
> confused with size (or some other length that's actually related to
> the operation's parameters).  Better to call it 'structlen' or
> 'argslen' or something.

Ok.  As Scott noted, I've failed to implement these in a way that
actually allows extension, but I'll work on it.
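
One common way to make a structure-length field extensible, sketched here
assuming a leading __u64 len as in struct vfio_dma_map (hypothetical
helper, not code from the patch):

/* Copy in only as much as this kernel understands; a newer, larger
 * userspace structure still works, and an older one is rejected only
 * if it's too small to hold the fields the kernel requires. */
static int vfio_copy_user_arg(void *karg, size_t known_size,
			      void __user *uarg, size_t min_size)
{
	u64 len;

	if (copy_from_user(&len, uarg, sizeof(len)))
		return -EFAULT;

	if (len < min_size)
		return -EINVAL;

	if (copy_from_user(karg, uarg, min_t(u64, len, known_size)))
		return -EFAULT;

	return 0;
}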

> > +        __u64   vaddr;          /* process virtual addr */
> > +        __u64   dmaaddr;        /* desired and/or returned dma address */
> > +        __u64   size;           /* size in bytes */
> > +        __u64   flags;
> > +#define VFIO_DMA_MAP_FLAG_WRITE         (1 << 0) /* req writeable DMA mem */
> 
> Make it independent READ and WRITE flags from the start.  Not all
> combinations will be be valid on all hardware, but that way we have
> the possibilities covered without having to use strange encodings
> later.

Ok.

> > +};
> > +
> > +Current users of VFIO use relatively static DMA mappings, not requiring
> > +high frequency turnover.  As new users are added, it's expected that the
> > +IOMMU file descriptor will evolve to support new mapping interfaces, this
> > +will be reflected in the flags and may present new ioctls and file
> > +interfaces.
> > +
> > +The device GET_FLAGS ioctl is intended to return basic device type and
> > +indicate support for optional capabilities.  Flags currently include whether
> > +the device is PCI or described by Device Tree, and whether the RESET ioctl
> > +is supported:
> > +
> > +#define VFIO_DEVICE_GET_FLAGS           _IOR(';', 108, __u64)
> > + #define VFIO_DEVICE_FLAGS_PCI          (1 << 0)
> > + #define VFIO_DEVICE_FLAGS_DT           (1 << 1)
> 
> TBH, I don't think the VFIO for DT stuff is mature enough yet to be in
> an initial infrastructure patch, though we should certainly be
> discussing it as an add-on patch.

I agree for DT, and PCI should be added with vfio-pci, not the initial
core.

> > + #define VFIO_DEVICE_FLAGS_RESET        (1 << 2)
> > +
> > +The MMIO and IOP resources used by a device are described by regions.
> > +The GET_NUM_REGIONS ioctl tells us how many regions the device supports:
> > +
> > +#define VFIO_DEVICE_GET_NUM_REGIONS     _IOR(';', 109, int)
> > +
> > +Regions are described by a struct vfio_region_info, which is retrieved by
> > +using the GET_REGION_INFO ioctl with vfio_region_info.index field set to
> > +the desired region (0 based index).  Note that devices may implement zero
> > +sized regions (vfio-pci does this to provide a 1:1 BAR to region index
> > +mapping).
> 
> So, I think you're saying that a zero-sized region is used to encode a
> NOP region, that is, to basically put a "no region here" in between
> valid region indices.  You should spell that out.

Ok.

> [Incidentally, any chance you could borrow one of RH's tech writers
> for this?  I'm afraid you seem to lack the knack for clear and easily
> read documentation]

Thanks for the encouragement :-\  It's no wonder there isn't more
content in Documentation.

> > +struct vfio_region_info {
> > +        __u32   len;            /* length of structure */
> > +        __u32   index;          /* region number */
> > +        __u64   size;           /* size in bytes of region */
> > +        __u64   offset;         /* start offset of region */
> > +        __u64   flags;
> > +#define VFIO_REGION_INFO_FLAG_MMAP              (1 << 0)
> > +#define VFIO_REGION_INFO_FLAG_RO                (1 << 1)
> 
> Again having separate read and write bits from the start will save
> strange encodings later.

Seems highly unlikely, but we have bits to waste...

> > +#define VFIO_REGION_INFO_FLAG_PHYS_VALID        (1 << 2)
> > +        __u64   phys;           /* physical address of region */
> > +};
> 
> I notice there is no field for "type" e.g. MMIO vs. PIO vs. config
> space for PCI.  If you added that having a NONE type might be a
> clearer way of encoding a non-region than just having size==0.

I thought there was some resistance to including MMIO and PIO bits in
the flags.  If that resistance has passed, I can add them, but PCI can
determine this through config space (and vfio-pci exposes config space
at a fixed index).  Having a region w/ size == 0 and the MMIO and PIO
flags unset seems a little redundant if that's the only reason for
having them.  A NONE flag
doesn't make sense to me.  Config space isn't NONE, but neither is it
MMIO nor PIO; and someone would probably be offended about even
mentioning PIO in the specification.

> > +
> > +#define VFIO_DEVICE_GET_REGION_INFO     _IOWR(';', 110, struct vfio_region_info)
> > +
> > +The offset indicates the offset into the device file descriptor which
> > +accesses the given range (for read/write/mmap/seek).  Flags indicate the
> > +available access types and validity of optional fields.  For instance
> > +the phys field may only be valid for certain devices types.
> > +
> > +Interrupts are described using a similar interface.  GET_NUM_IRQS
> > +reports the number or IRQ indexes for the device.
> > +
> > +#define VFIO_DEVICE_GET_NUM_IRQS        _IOR(';', 111, int)
> > +
> > +struct vfio_irq_info {
> > +        __u32   len;            /* length of structure */
> > +        __u32   index;          /* IRQ number */
> > +        __u32   count;          /* number of individual IRQs */
> 
> Is there a reason for allowing irqs in batches like this, rather than
> having each MSI be reflected by a separate irq_info?

Yes, bus drivers like vfio-pci can define index 1 as the MSI info
structure and index 2 as MSI-X.  There's really no need to expose 57
individual MSI interrupts and try to map them to the correct device
specific MSI type if they can only logically be enabled in two distinct
groups.  Bus drivers with individually controllable MSI vectors are free
to expose them separately.  I assume device tree paths would help
associate an index to a specific interrupt.
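
For illustration, enabling one IRQ index with the array-based ioctl might
look roughly like this from userspace (sketch only, error handling
trimmed, not code from the patch):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/eventfd.h>

/* arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
static int enable_irq_index(int device_fd, int index, int count, int *efds)
{
	int i, ret, *args;

	args = malloc(sizeof(int) * (2 + count));
	if (!args)
		return -1;

	args[0] = index;
	args[1] = count;
	for (i = 0; i < count; i++)
		args[2 + i] = efds[i] = eventfd(0, 0);

	ret = ioctl(device_fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, args);
	free(args);
	return ret;
}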

> > +        __u64   flags;
> > +#define VFIO_IRQ_INFO_FLAG_LEVEL                (1 << 0)
> > +};
> > +
> > +Again, zero count entries are allowed (vfio-pci uses a static interrupt
> > +type to index mapping).
> 
> I know what you mean, but you need a clearer way to express it.

I'll work on it.

> > +Information about each index can be retrieved using the GET_IRQ_INFO
> > +ioctl, used much like GET_REGION_INFO.
> > +
> > +#define VFIO_DEVICE_GET_IRQ_INFO        _IOWR(';', 112, struct vfio_irq_info)
> > +
> > +Individual indexes can describe single or sets of IRQs.  This provides the
> > +flexibility to describe PCI INTx, MSI, and MSI-X using a single interface.
> > +
> > +All VFIO interrupts are signaled to userspace via eventfds.  Integer arrays,
> > +as shown below, are used to pass the IRQ info index, the number of eventfds,
> > +and each eventfd to be signaled.  Using a count of 0 disables the interrupt.
> > +
> > +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
> > +#define VFIO_DEVICE_SET_IRQ_EVENTFDS    _IOW(';', 113, int)
> > +
> > +When a level triggered interrupt is signaled, the interrupt is masked
> > +on the host.  This prevents an unresponsive userspace driver from
> > +continuing to interrupt the host system.  After servicing the interrupt,
> > +UNMASK_IRQ is used to allow the interrupt to retrigger.  Note that level
> > +triggered interrupts implicitly have a count of 1 per index.
> 
> This is a silly restriction.  Even PCI devices can have up to 4 LSIs
> on a function in theory, though no-one ever does.  Embedded devices
> can and do have multiple level interrupts.

Per the PCI spec, an individual PCI function can only ever have, at
most, a single INTx line.  A multi-function *device* can have up to 4
INTx lines, but what we're exposing here is a struct device, ie. a PCI
function.

Other devices could certainly have multiple level interrupts, and if
grouping them as we do with MSI on PCI makes sense, please let me know.
I just didn't see the value in making the unmask operations handle
sub-indexes if it's not needed.
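
The resulting userspace loop for a level-triggered index is then roughly
(sketch only, not code from the patch):

#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>

static void service_level_irq(int device_fd, int irq_eventfd, int index)
{
	uint64_t count;

	/* the host masked the interrupt when it signaled the eventfd */
	if (read(irq_eventfd, &count, sizeof(count)) != sizeof(count))
		return;

	/* ... service the device here ... */

	/* allow the interrupt to retrigger */
	ioctl(device_fd, VFIO_DEVICE_UNMASK_IRQ, &index);
}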

> > +
> > +/* Unmask IRQ index, arg[0] = index */
> > +#define VFIO_DEVICE_UNMASK_IRQ          _IOW(';', 114, int)
> > +
> > +Level triggered interrupts can also be unmasked using an irqfd.  Use
> > +SET_UNMASK_IRQ_EVENTFD to set the file descriptor for this.
> > +
> > +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
> > +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD      _IOW(';', 115, int)
> > +
> > +When supported, as indicated by the device flags, reset the device.
> > +
> > +#define VFIO_DEVICE_RESET               _IO(';', 116)
> > +
> > +Device tree devices also invlude ioctls for further defining the
> > +device tree properties of the device:
> > +
> > +struct vfio_dtpath {
> > +        __u32   len;            /* length of structure */
> > +        __u32   index;
> > +        __u64   flags;
> > +#define VFIO_DTPATH_FLAGS_REGION        (1 << 0)
> > +#define VFIO_DTPATH_FLAGS_IRQ           (1 << 1)
> > +        char    *path;
> > +};
> > +#define VFIO_DEVICE_GET_DTPATH          _IOWR(';', 117, struct vfio_dtpath)
> > +
> > +struct vfio_dtindex {
> > +        __u32   len;            /* length of structure */
> > +        __u32   index;
> > +        __u32   prop_type;
> > +        __u32   prop_index;
> > +        __u64   flags;
> > +#define VFIO_DTINDEX_FLAGS_REGION       (1 << 0)
> > +#define VFIO_DTINDEX_FLAGS_IRQ          (1 << 1)
> > +};
> > +#define VFIO_DEVICE_GET_DTINDEX         _IOWR(';', 118, struct vfio_dtindex)
> > +
> > +
> > +VFIO bus driver API
> > +-------------------------------------------------------------------------------
> > +
> > +Bus drivers, such as PCI, have three jobs:
> > + 1) Add/remove devices from vfio
> > + 2) Provide vfio_device_ops for device access
> > + 3) Device binding and unbinding
> > +
> > +When initialized, the bus driver should enumerate the devices on it's
> 
> s/it's/its/

Noted.

<snip>
> > +/* Unmap DMA region */
> > +/* dgate must be held */
> > +static int __vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
> > +			    int npage, int rdwr)
> 
> Use of "read" and "write" in DMA can often be confusing, since it's
> not always clear if you're talking from the perspective of the CPU or
> the device (_writing_ data to a device will usually involve it doing
> DMA _reads_ from memory).  It's often best to express things as DMA
> direction, 'to device', and 'from device' instead.

Good point.

> > +{
> > +	int i, unlocked = 0;
> > +
> > +	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> > +		unsigned long pfn;
> > +
> > +		pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
> > +		if (pfn) {
> > +			iommu_unmap(iommu->domain, iova, 0);
> > +			unlocked += put_pfn(pfn, rdwr);
> > +		}
> > +	}
> > +	return unlocked;
> > +}
> > +
> > +static void vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
> > +			   unsigned long npage, int rdwr)
> > +{
> > +	int unlocked;
> > +
> > +	unlocked = __vfio_dma_unmap(iommu, iova, npage, rdwr);
> > +	vfio_lock_acct(-unlocked);
> 
> Have you checked that your accounting will work out if the user maps
> the same memory page to multiple IOVAs?

Hmm, it probably doesn't.  We potentially over-penalize the user process
here.

> > +}
> > +
> > +/* Unmap ALL DMA regions */
> > +void vfio_iommu_unmapall(struct vfio_iommu *iommu)
> > +{
> > +	struct list_head *pos, *pos2;
> > +	struct dma_map_page *mlp;
> > +
> > +	mutex_lock(&iommu->dgate);
> > +	list_for_each_safe(pos, pos2, &iommu->dm_list) {
> > +		mlp = list_entry(pos, struct dma_map_page, list);
> > +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> > +		list_del(&mlp->list);
> > +		kfree(mlp);
> > +	}
> > +	mutex_unlock(&iommu->dgate);
> 
> Ouch, no good at all.  Keeping track of every DMA map is no good on
> POWER or other systems where IOMMU operations are a hot path.  I think
> you'll need an iommu specific hook for this instead, which uses
> whatever data structures are natural for the IOMMU.  For example a
> 1-level pagetable, like we use on POWER will just zero every entry.

It's already been noted in the docs that current users have relatively
static mappings and a performance interface is TBD for dynamically
backing streaming DMA.  The current vfio_iommu exposes iommu_ops, POWER
will need to come up with something to expose instead.

> > +}
> > +
> > +static int vaddr_get_pfn(unsigned long vaddr, int rdwr, unsigned long *pfn)
> > +{
> > +	struct page *page[1];
> > +	struct vm_area_struct *vma;
> > +	int ret = -EFAULT;
> > +
> > +	if (get_user_pages_fast(vaddr, 1, rdwr, page) == 1) {
> > +		*pfn = page_to_pfn(page[0]);
> > +		return 0;
> > +	}
> > +
> > +	down_read(&current->mm->mmap_sem);
> > +
> > +	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> > +
> > +	if (vma && vma->vm_flags & VM_PFNMAP) {
> > +		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> > +		if (is_invalid_reserved_pfn(*pfn))
> > +			ret = 0;
> > +	}
> 
> It's kind of nasty that you take gup_fast(), already designed to grab
> pointers for multiple user pages, then just use it one page at a time,
> even for a big map.

Yep, this needs work, but shouldn't really change the API.
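
A possible shape for the batching, purely as a sketch of the direction
rather than the actual fix:

/* Pin npage pages starting at vaddr in chunks instead of one page per
 * get_user_pages_fast() call; returns how many pages were pinned. */
static long vfio_pin_pages_batched(unsigned long vaddr, long npage,
				   int write, struct page **pages)
{
	long pinned = 0;

	while (pinned < npage) {
		int chunk = min_t(long, npage - pinned, 512);
		int ret = get_user_pages_fast(vaddr + (pinned << PAGE_SHIFT),
					      chunk, write, pages + pinned);
		if (ret <= 0)
			break;
		pinned += ret;
	}

	return pinned;
}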

> > +	up_read(&current->mm->mmap_sem);
> > +
> > +	return ret;
> > +}
> > +
> > +/* Map DMA region */
> > +/* dgate must be held */
> > +static int vfio_dma_map(struct vfio_iommu *iommu, unsigned long iova,
> > +			unsigned long vaddr, int npage, int rdwr)
> 
> iova should be a dma_addr_t.  Bus address size need not match virtual
> address size, and may not fit in an unsigned long.

ok.

> > +{
> > +	unsigned long start = iova;
> > +	int i, ret, locked = 0, prot = IOMMU_READ;
> > +
> > +	/* Verify pages are not already mapped */
> > +	for (i = 0; i < npage; i++, iova += PAGE_SIZE)
> > +		if (iommu_iova_to_phys(iommu->domain, iova))
> > +			return -EBUSY;
> > +
> > +	iova = start;
> > +
> > +	if (rdwr)
> > +		prot |= IOMMU_WRITE;
> > +	if (iommu->cache)
> > +		prot |= IOMMU_CACHE;
> > +
> > +	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
> > +		unsigned long pfn = 0;
> > +
> > +		ret = vaddr_get_pfn(vaddr, rdwr, &pfn);
> > +		if (ret) {
> > +			__vfio_dma_unmap(iommu, start, i, rdwr);
> > +			return ret;
> > +		}
> > +
> > +		/* Only add actual locked pages to accounting */
> > +		if (!is_invalid_reserved_pfn(pfn))
> > +			locked++;
> > +
> > +		ret = iommu_map(iommu->domain, iova,
> > +				(phys_addr_t)pfn << PAGE_SHIFT, 0, prot);
> > +		if (ret) {
> > +			/* Back out mappings on error */
> > +			put_pfn(pfn, rdwr);
> > +			__vfio_dma_unmap(iommu, start, i, rdwr);
> > +			return ret;
> > +		}
> > +	}
> > +	vfio_lock_acct(locked);
> > +	return 0;
> > +}
> > +
> > +static inline int ranges_overlap(unsigned long start1, size_t size1,
> > +				 unsigned long start2, size_t size2)
> > +{
> > +	return !(start1 + size1 <= start2 || start2 + size2 <= start1);
> 
> Needs overflow safety.

Yep.
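
One way to do that is to avoid the additions entirely, e.g. (a sketch,
not the actual fix):

static inline int ranges_overlap(unsigned long start1, size_t size1,
				 unsigned long start2, size_t size2)
{
	if (!size1 || !size2)
		return 0;

	/* compare each size against the gap between starts; no sum can wrap */
	if (start1 <= start2)
		return size1 > start2 - start1;

	return size2 > start1 - start2;
}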

> > +}
> > +
> > +static struct dma_map_page *vfio_find_dma(struct vfio_iommu *iommu,
> > +					  dma_addr_t start, size_t size)
> > +{
> > +	struct list_head *pos;
> > +	struct dma_map_page *mlp;
> > +
> > +	list_for_each(pos, &iommu->dm_list) {
> > +		mlp = list_entry(pos, struct dma_map_page, list);
> > +		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> > +				   start, size))
> > +			return mlp;
> > +	}
> > +	return NULL;
> > +}
> 
> Again, keeping track of each dma map operation is no good for
> performance.

This is not the performance interface you're looking for.

> > +
> > +int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
> > +			    size_t size, struct dma_map_page *mlp)
> > +{
> > +	struct dma_map_page *split;
> > +	int npage_lo, npage_hi;
> > +
> > +	/* Existing dma region is completely covered, unmap all */
> > +	if (start <= mlp->daddr &&
> > +	    start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> > +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> > +		list_del(&mlp->list);
> > +		npage_lo = mlp->npage;
> > +		kfree(mlp);
> > +		return npage_lo;
> > +	}
> > +
> > +	/* Overlap low address of existing range */
> > +	if (start <= mlp->daddr) {
> > +		size_t overlap;
> > +
> > +		overlap = start + size - mlp->daddr;
> > +		npage_lo = overlap >> PAGE_SHIFT;
> > +		npage_hi = mlp->npage - npage_lo;
> > +
> > +		vfio_dma_unmap(iommu, mlp->daddr, npage_lo, mlp->rdwr);
> > +		mlp->daddr += overlap;
> > +		mlp->vaddr += overlap;
> > +		mlp->npage -= npage_lo;
> > +		return npage_lo;
> > +	}
> > +
> > +	/* Overlap high address of existing range */
> > +	if (start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> > +		size_t overlap;
> > +
> > +		overlap = mlp->daddr + NPAGE_TO_SIZE(mlp->npage) - start;
> > +		npage_hi = overlap >> PAGE_SHIFT;
> > +		npage_lo = mlp->npage - npage_hi;
> > +
> > +		vfio_dma_unmap(iommu, start, npage_hi, mlp->rdwr);
> > +		mlp->npage -= npage_hi;
> > +		return npage_hi;
> > +	}
> > +
> > +	/* Split existing */
> > +	npage_lo = (start - mlp->daddr) >> PAGE_SHIFT;
> > +	npage_hi = mlp->npage - (size >> PAGE_SHIFT) - npage_lo;
> > +
> > +	split = kzalloc(sizeof *split, GFP_KERNEL);
> > +	if (!split)
> > +		return -ENOMEM;
> > +
> > +	vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, mlp->rdwr);
> > +
> > +	mlp->npage = npage_lo;
> > +
> > +	split->npage = npage_hi;
> > +	split->daddr = start + size;
> > +	split->vaddr = mlp->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
> > +	split->rdwr = mlp->rdwr;
> > +	list_add(&split->list, &iommu->dm_list);
> > +	return size >> PAGE_SHIFT;
> > +}
> > +
> > +int vfio_dma_unmap_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> > +{
> > +	int ret = 0;
> > +	size_t npage = dmp->size >> PAGE_SHIFT;
> > +	struct list_head *pos, *n;
> > +
> > +	if (dmp->dmaaddr & ~PAGE_MASK)
> > +		return -EINVAL;
> > +	if (dmp->size & ~PAGE_MASK)
> > +		return -EINVAL;
> > +
> > +	mutex_lock(&iommu->dgate);
> > +
> > +	list_for_each_safe(pos, n, &iommu->dm_list) {
> > +		struct dma_map_page *mlp;
> > +
> > +		mlp = list_entry(pos, struct dma_map_page, list);
> > +		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> > +				   dmp->dmaaddr, dmp->size)) {
> > +			ret = vfio_remove_dma_overlap(iommu, dmp->dmaaddr,
> > +						      dmp->size, mlp);
> > +			if (ret > 0)
> > +				npage -= NPAGE_TO_SIZE(ret);
> > +			if (ret < 0 || npage == 0)
> > +				break;
> > +		}
> > +	}
> > +	mutex_unlock(&iommu->dgate);
> > +	return ret > 0 ? 0 : ret;
> > +}
> > +
> > +int vfio_dma_map_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> > +{
> > +	int npage;
> > +	struct dma_map_page *mlp, *mmlp = NULL;
> > +	dma_addr_t daddr = dmp->dmaaddr;
> > +	unsigned long locked, lock_limit, vaddr = dmp->vaddr;
> > +	size_t size = dmp->size;
> > +	int ret = 0, rdwr = dmp->flags & VFIO_DMA_MAP_FLAG_WRITE;
> > +
> > +	if (vaddr & (PAGE_SIZE-1))
> > +		return -EINVAL;
> > +	if (daddr & (PAGE_SIZE-1))
> > +		return -EINVAL;
> > +	if (size & (PAGE_SIZE-1))
> > +		return -EINVAL;
> > +
> > +	npage = size >> PAGE_SHIFT;
> > +	if (!npage)
> > +		return -EINVAL;
> > +
> > +	if (!iommu)
> > +		return -EINVAL;
> > +
> > +	mutex_lock(&iommu->dgate);
> > +
> > +	if (vfio_find_dma(iommu, daddr, size)) {
> > +		ret = -EBUSY;
> > +		goto out_lock;
> > +	}
> > +
> > +	/* account for locked pages */
> > +	locked = current->mm->locked_vm + npage;
> > +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > +	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> > +		printk(KERN_WARNING "%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> > +			__func__, rlimit(RLIMIT_MEMLOCK));
> > +		ret = -ENOMEM;
> > +		goto out_lock;
> > +	}
> > +
> > +	ret = vfio_dma_map(iommu, daddr, vaddr, npage, rdwr);
> > +	if (ret)
> > +		goto out_lock;
> > +
> > +	/* Check if we abut a region below */
> > +	if (daddr) {
> > +		mlp = vfio_find_dma(iommu, daddr - 1, 1);
> > +		if (mlp && mlp->rdwr == rdwr &&
> > +		    mlp->vaddr + NPAGE_TO_SIZE(mlp->npage) == vaddr) {
> > +
> > +			mlp->npage += npage;
> > +			daddr = mlp->daddr;
> > +			vaddr = mlp->vaddr;
> > +			npage = mlp->npage;
> > +			size = NPAGE_TO_SIZE(npage);
> > +
> > +			mmlp = mlp;
> > +		}
> > +	}
> > +
> > +	if (daddr + size) {
> > +		mlp = vfio_find_dma(iommu, daddr + size, 1);
> > +		if (mlp && mlp->rdwr == rdwr && mlp->vaddr == vaddr + size) {
> > +
> > +			mlp->npage += npage;
> > +			mlp->daddr = daddr;
> > +			mlp->vaddr = vaddr;
> > +
> > +			/* If merged above and below, remove previously
> > +			 * merged entry.  New entry covers it.  */
> > +			if (mmlp) {
> > +				list_del(&mmlp->list);
> > +				kfree(mmlp);
> > +			}
> > +			mmlp = mlp;
> > +		}
> > +	}
> > +
> > +	if (!mmlp) {
> > +		mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
> > +		if (!mlp) {
> > +			ret = -ENOMEM;
> > +			vfio_dma_unmap(iommu, daddr, npage, rdwr);
> > +			goto out_lock;
> > +		}
> > +
> > +		mlp->npage = npage;
> > +		mlp->daddr = daddr;
> > +		mlp->vaddr = vaddr;
> > +		mlp->rdwr = rdwr;
> > +		list_add(&mlp->list, &iommu->dm_list);
> > +	}
> > +
> > +out_lock:
> > +	mutex_unlock(&iommu->dgate);
> > +	return ret;
> > +}
> 
> This whole tracking infrastructure is way too complex to impose on
> every IOMMU.  We absolutely don't want to do all this when just
> updating a 1-level pagetable.

If only POWER implemented an iommu_ops so we had something on which we
could base an alternate iommu model and pluggable iommu registration...

> > +static int vfio_iommu_release(struct inode *inode, struct file *filep)
> > +{
> > +	struct vfio_iommu *iommu = filep->private_data;
> > +
> > +	vfio_release_iommu(iommu);
> > +	return 0;
> > +}
> > +
> > +static long vfio_iommu_unl_ioctl(struct file *filep,
> > +				 unsigned int cmd, unsigned long arg)
> > +{
> > +	struct vfio_iommu *iommu = filep->private_data;
> > +	int ret = -ENOSYS;
> > +
> > +        if (cmd == VFIO_IOMMU_GET_FLAGS) {
> > +                u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;
> > +
> > +                ret = put_user(flags, (u64 __user *)arg);
> 
> Um.. flags surely have to come from the IOMMU driver.

This vfio_iommu object is backed by iommu_ops, which supports this
mapping.

> > +        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
> > +		struct vfio_dma_map dm;
> > +
> > +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> > +			return -EFAULT;
> > +
> > +		ret = vfio_dma_map_dm(iommu, &dm);
> > +
> > +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> > +			ret = -EFAULT;
> > +
> > +	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
> > +		struct vfio_dma_map dm;
> > +
> > +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> > +			return -EFAULT;
> > +
> > +		ret = vfio_dma_unmap_dm(iommu, &dm);
> > +
> > +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> > +			ret = -EFAULT;
> > +	}
> > +	return ret;
> > +}
> > +
> > +#ifdef CONFIG_COMPAT
> > +static long vfio_iommu_compat_ioctl(struct file *filep,
> > +				    unsigned int cmd, unsigned long arg)
> > +{
> > +	arg = (unsigned long)compat_ptr(arg);
> > +	return vfio_iommu_unl_ioctl(filep, cmd, arg);
> 
> Um, this only works if the structures are exactly compatible between
> 32-bit and 64-bit ABIs.  I don't think that is always true.

I think all our structure sizes are independent of host width.  If I'm
missing something, let me know.

> > +}
> > +#endif	/* CONFIG_COMPAT */
> > +
> > +const struct file_operations vfio_iommu_fops = {
> > +	.owner		= THIS_MODULE,
> > +	.release	= vfio_iommu_release,
> > +	.unlocked_ioctl	= vfio_iommu_unl_ioctl,
> > +#ifdef CONFIG_COMPAT
> > +	.compat_ioctl	= vfio_iommu_compat_ioctl,
> > +#endif
> > +};
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > new file mode 100644
> > index 0000000..6169356
> > --- /dev/null
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -0,0 +1,1151 @@
> > +/*
> > + * VFIO framework
> > + *
> > + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> > + *     Author: Alex Williamson <alex.williamson@redhat.com>
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + *
> > + * Derived from original vfio:
> > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > + * Author: Tom Lyon, pugs@cisco.com
> > + */
> > +
> > +#include <linux/cdev.h>
> > +#include <linux/compat.h>
> > +#include <linux/device.h>
> > +#include <linux/file.h>
> > +#include <linux/anon_inodes.h>
> > +#include <linux/fs.h>
> > +#include <linux/idr.h>
> > +#include <linux/iommu.h>
> > +#include <linux/mm.h>
> > +#include <linux/module.h>
> > +#include <linux/slab.h>
> > +#include <linux/string.h>
> > +#include <linux/uaccess.h>
> > +#include <linux/vfio.h>
> > +#include <linux/wait.h>
> > +
> > +#include "vfio_private.h"
> > +
> > +#define DRIVER_VERSION	"0.2"
> > +#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
> > +#define DRIVER_DESC	"VFIO - User Level meta-driver"
> > +
> > +static int allow_unsafe_intrs;
> > +module_param(allow_unsafe_intrs, int, 0);
> > +MODULE_PARM_DESC(allow_unsafe_intrs,
> > +        "Allow use of IOMMUs which do not support interrupt remapping");
> 
> This should not be a global option, but part of the AMD/Intel IOMMU
> specific code.  In general it's a question of how strict the IOMMU
> driver is about isolation when it determines what the groups are, and
> only the IOMMU driver can know what the possibilities are for its
> class of hardware.

I agree this should probably be tied more closely to the iommu driver,
but again, we only have iommu_ops right now.

<snip>
> > +
> > +/* Attempt to merge the group pointed to by fd into group.  The merge-ee
> > + * group must not have an iommu or any devices open because we cannot
> > + * maintain that context across the merge.  The merge-er group can be
> > + * in use. */
> 
> Yeah, so merge-er group in use still has its problems, because it
> could affect what the IOMMU is capable of.

As seen below, we deny merging if the iommu domains are not exactly
compatible.  Our notion of what compatible means depends on what
iommu_ops exposes though.

> > +static int vfio_group_merge(struct vfio_group *group, int fd)
> > +{
> > +	struct vfio_group *new;
> > +	struct vfio_iommu *old_iommu;
> > +	struct file *file;
> > +	int ret = 0;
> > +	bool opened = false;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	file = fget(fd);
> > +	if (!file) {
> > +		ret = -EBADF;
> > +		goto out_noput;
> > +	}
> > +
> > +	/* Sanity check, is this really our fd? */
> > +	if (file->f_op != &vfio_group_fops) {
> 
> This should be a WARN_ON or BUG_ON rather than just an error return, surely.

No, I don't think so.  We're passed a file descriptor that could be for
anything.  If the user passed a file descriptor for something that's not
a vfio group, that's a user error, not an internal consistency error of
vfio.

> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	new = file->private_data;
> > +
> > +	if (!new || new == group || !new->iommu ||
> > +	    new->iommu->domain || new->bus != group->bus) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* We need to attach all the devices to each domain separately
> > +	 * in order to validate that the capabilities match for both.  */
> > +	ret = __vfio_open_iommu(new->iommu);
> > +	if (ret)
> > +		goto out;
> > +
> > +	if (!group->iommu->domain) {
> > +		ret = __vfio_open_iommu(group->iommu);
> > +		if (ret)
> > +			goto out;
> > +		opened = true;
> > +	}
> > +
> > +	/* If cache coherency doesn't match we'd potentialy need to
> > +	 * remap existing iommu mappings in the merge-er domain.
> > +	 * Poor return to bother trying to allow this currently. */
> > +	if (iommu_domain_has_cap(group->iommu->domain,
> > +				 IOMMU_CAP_CACHE_COHERENCY) !=
> > +	    iommu_domain_has_cap(new->iommu->domain,
> > +				 IOMMU_CAP_CACHE_COHERENCY)) {
> > +		__vfio_close_iommu(new->iommu);
> > +		if (opened)
> > +			__vfio_close_iommu(group->iommu);
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* Close the iommu for the merge-ee and attach all its devices
> > +	 * to the merge-er iommu. */
> > +	__vfio_close_iommu(new->iommu);
> > +
> > +	ret = __vfio_iommu_attach_group(group->iommu, new);
> > +	if (ret)
> > +		goto out;
> > +
> > +	/* set_iommu unlinks new from the iommu, so save a pointer to it */
> > +	old_iommu = new->iommu;
> > +	__vfio_group_set_iommu(new, group->iommu);
> > +	kfree(old_iommu);
> > +
> > +out:
> > +	fput(file);
> > +out_noput:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Unmerge the group pointed to by fd from group. */
> > +static int vfio_group_unmerge(struct vfio_group *group, int fd)
> > +{
> > +	struct vfio_group *new;
> > +	struct vfio_iommu *new_iommu;
> > +	struct file *file;
> > +	int ret = 0;
> > +
> > +	/* Since the merge-out group is already opened, it needs to
> > +	 * have an iommu struct associated with it. */
> > +	new_iommu = kzalloc(sizeof(*new_iommu), GFP_KERNEL);
> > +	if (!new_iommu)
> > +		return -ENOMEM;
> > +
> > +	INIT_LIST_HEAD(&new_iommu->group_list);
> > +	INIT_LIST_HEAD(&new_iommu->dm_list);
> > +	mutex_init(&new_iommu->dgate);
> > +	new_iommu->bus = group->bus;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	file = fget(fd);
> > +	if (!file) {
> > +		ret = -EBADF;
> > +		goto out_noput;
> > +	}
> > +
> > +	/* Sanity check, is this really our fd? */
> > +	if (file->f_op != &vfio_group_fops) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	new = file->private_data;
> > +	if (!new || new == group || new->iommu != group->iommu) {
> > +		ret = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* We can't merge-out a group with devices still in use. */
> > +	if (__vfio_group_devs_inuse(new)) {
> > +		ret = -EBUSY;
> > +		goto out;
> > +	}
> > +
> > +	__vfio_iommu_detach_group(group->iommu, new);
> > +	__vfio_group_set_iommu(new, new_iommu);
> > +
> > +out:
> > +	fput(file);
> > +out_noput:
> > +	if (ret)
> > +		kfree(new_iommu);
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Get a new iommu file descriptor.  This will open the iommu, setting
> > + * the current->mm ownership if it's not already set. */
> 
> I know I've had this explained to me several times before, but I've
> forgotten again.  Why do we need to wire the iommu to an mm?

We're mapping process virtual addresses into the IOMMU, so it makes
sense to restrict ourselves to a single virtual address space.  It also
enforces ownership: only a single mm is in control of the group.

> > +static int vfio_group_get_iommu_fd(struct vfio_group *group)
> > +{
> > +	int ret = 0;
> > +
> > +	mutex_lock(&vfio.lock);
> > +
> > +	if (!group->iommu->domain) {
> > +		ret = __vfio_open_iommu(group->iommu);
> > +		if (ret)
> > +			goto out;
> > +	}
> > +
> > +	ret = anon_inode_getfd("[vfio-iommu]", &vfio_iommu_fops,
> > +			       group->iommu, O_RDWR);
> > +	if (ret < 0)
> > +		goto out;
> > +
> > +	group->iommu->refcnt++;
> > +out:
> > +	mutex_unlock(&vfio.lock);
> > +	return ret;
> > +}
> > +
> > +/* Get a new device file descriptor.  This will open the iommu, setting
> > + * the current->mm ownership if it's not already set.  It's difficult to
> > + * specify the requirements for matching a user supplied buffer to a
> > + * device, so we use a vfio driver callback to test for a match.  For
> > + * PCI, dev_name(dev) is unique, but other drivers may require including
> > + * a parent device string. */
> 
> At some point we probably want an interface to enumerate the devices
> too, but that can probably wait.

That's what I decided as well.  I also haven't been able to come up with
an interface for it that doesn't make me want to vomit.

> > +static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
> > +{

Thanks,

Alex
Scott Wood Nov. 15, 2011, 8:10 p.m. UTC | #24
On 11/15/2011 12:34 AM, David Gibson wrote:
> I think we need to pin exactly what "MAP_ANY" means down better.  Now,
> VFIO is pretty much a lost cause if you can't map any normal process
> memory page into the IOMMU, so I think the only thing that is really
> covered is IOVAs.  But saying "can map any IOVA" is not clear, because
> if you can't map it, it's not a (valid) IOVA.  Better to say that
> IOVAs can be any 64-bit value, which I think is what you really mean
> here.

It also means that there are no restrictions on what the IOVA can be
within that range (other than page alignment), which isn't true on our
IOMMU.

We'll also need a way to communicate the desired geometry of the overall
IOMMU table (for this group) to the kernel, which determines what the
restrictions will be (we can't determine it automatically until we know
what all the translation requests will be, and even then it's awkward).

> On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
>> +When a level triggered interrupt is signaled, the interrupt is masked
>> +on the host.  This prevents an unresponsive userspace driver from
>> +continuing to interrupt the host system.  After servicing the interrupt,
>> +UNMASK_IRQ is used to allow the interrupt to retrigger.  Note that level
>> +triggered interrupts implicitly have a count of 1 per index.
> 
> This is a silly restriction.  Even PCI devices can have up to 4 LSIs
> on a function in theory, though no-one ever does.  Embedded devices
> can and do have multiple level interrupts.

Those interrupts would each have their own index.  This is necessary for
level-triggered interrupts since they'll need to be individually
identifiable to VFIO_DEVICE_UNMASK_IRQ -- doesn't seem worth adding
another parameter to UNMASK.

>> +#ifdef CONFIG_COMPAT
>> +static long vfio_iommu_compat_ioctl(struct file *filep,
>> +				    unsigned int cmd, unsigned long arg)
>> +{
>> +	arg = (unsigned long)compat_ptr(arg);
>> +	return vfio_iommu_unl_ioctl(filep, cmd, arg);
> 
> Um, this only works if the structures are exactly compatible between
> 32-bit and 64-bit ABIs.  I don't think that is always true.

These are new structs, we can make it true.

>> +static int allow_unsafe_intrs;
>> +module_param(allow_unsafe_intrs, int, 0);
>> +MODULE_PARM_DESC(allow_unsafe_intrs,
>> +        "Allow use of IOMMUs which do not support interrupt remapping");
> 
> This should not be a global option, but part of the AMD/Intel IOMMU
> specific code.  In general it's a question of how strict the IOMMU
> driver is about isolation when it determines what the groups are, and
> only the IOMMU driver can know what the possibilities are for its
> class of hardware.

It's also a concern that is specific to MSIs.  In any case, I'm not sure
that the ability to cause a spurious IRQ is bad enough to warrant
disabling the entire subsystem by default on certain hardware.

Probably best to just print a warning on module init if there are any
known isolation holes, and let the admin decide whom (if anyone) to let
use this.  If the hole is bad enough that it must be confirmed, it
should require at most a sysfs poke.

-Scott
Aaron Fabbri Nov. 15, 2011, 9:40 p.m. UTC | #25
On 11/15/11 12:10 PM, "Scott Wood" <scottwood@freescale.com> wrote:

> On 11/15/2011 12:34 AM, David Gibson wrote:
<snip> 
>>> +static int allow_unsafe_intrs;
>>> +module_param(allow_unsafe_intrs, int, 0);
>>> +MODULE_PARM_DESC(allow_unsafe_intrs,
>>> +        "Allow use of IOMMUs which do not support interrupt remapping");
>> 
>> This should not be a global option, but part of the AMD/Intel IOMMU
>> specific code.  In general it's a question of how strict the IOMMU
>> driver is about isolation when it determines what the groups are, and
>> only the IOMMU driver can know what the possibilities are for its
>> class of hardware.
> 
> It's also a concern that is specific to MSIs.  In any case, I'm not sure
> that the ability to cause a spurious IRQ is bad enough to warrant
> disabling the entire subsystem by default on certain hardware.

I think the issue is more that the ability to create fake MSI interrupts can
lead to bigger exploits.

Originally we didn't have this parameter.  It was added to reflect the
fact that MSIs triggered by guests are dangerous without the isolation
that interrupt remapping provides.

That is, it *should* be inconvenient to run without interrupt remapping HW
support.

-Aaron

> Probably best to just print a warning on module init if there are any
> known isolation holes, and let the admin decide whom (if anyone) to let
> use this.  If the hole is bad enough that it must be confirmed, it
> should require at most a sysfs poke.
> 
> -Scott
>
Scott Wood Nov. 15, 2011, 10:29 p.m. UTC | #26
On 11/15/2011 03:40 PM, Aaron Fabbri wrote:
> 
> 
> 
> On 11/15/11 12:10 PM, "Scott Wood" <scottwood@freescale.com> wrote:
> 
>> On 11/15/2011 12:34 AM, David Gibson wrote:
> <snip> 
>>>> +static int allow_unsafe_intrs;
>>>> +module_param(allow_unsafe_intrs, int, 0);
>>>> +MODULE_PARM_DESC(allow_unsafe_intrs,
>>>> +        "Allow use of IOMMUs which do not support interrupt remapping");
>>>
>>> This should not be a global option, but part of the AMD/Intel IOMMU
>>> specific code.  In general it's a question of how strict the IOMMU
>>> driver is about isolation when it determines what the groups are, and
>>> only the IOMMU driver can know what the possibilities are for its
>>> class of hardware.
>>
>> It's also a concern that is specific to MSIs.  In any case, I'm not sure
>> that the ability to cause a spurious IRQ is bad enough to warrant
>> disabling the entire subsystem by default on certain hardware.
> 
> I think the issue is more that the ability to create fake MSI interrupts can
> lead to bigger exploits.
> 
> Originally we didn't have this parameter.  It was added to reflect the
> fact that MSIs triggered by guests are dangerous without the isolation
> that interrupt remapping provides.
> 
> That is, it *should* be inconvenient to run without interrupt remapping HW
> support.

A sysfs knob is sufficient inconvenience.  It should only affect whether
you can use MSIs, and the relevant issue shouldn't be "has interrupt
remapping" but "is there a hole".

Some systems might address the issue in ways other than IOMMU-level MSI
translation.  Our interrupt controller provides enough separate 4K pages
for MSI interrupt delivery for each PCIe IOMMU group to get its own (we
currently only have 3, one per root complex) -- no special IOMMU feature
required.

It doesn't help that the semantics of IOMMU_CAP_INTR_REMAP are
undefined.  I shouldn't have to know how x86 IOMMUs work when
implementing a driver for different hardware, just to know what the
generic code is expecting.

As David suggests, if you want to do this it should be the x86 IOMMU
driver that has a knob that controls how it forms groups in the absence
of this support.

-Scott
Konrad Rzeszutek Wilk Nov. 16, 2011, 4:52 p.m. UTC | #27
On Fri, Nov 11, 2011 at 03:10:56PM -0700, Alex Williamson wrote:
> 
> Thanks Konrad!  Comments inline.
> 
> On Fri, 2011-11-11 at 12:51 -0500, Konrad Rzeszutek Wilk wrote:
> > On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
> > > VFIO provides a secure, IOMMU based interface for user space
> > > drivers, including device assignment to virtual machines.
> > > This provides the base management of IOMMU groups, devices,
> > > and IOMMU objects.  See Documentation/vfio.txt included in
> > > this patch for user and kernel API description.
> > > 
> > > Note, this implements the new API discussed at KVM Forum
> > > 2011, as represented by the drvier version 0.2.  It's hoped
> > > that this provides a modular enough interface to support PCI
> > > and non-PCI userspace drivers across various architectures
> > > and IOMMU implementations.
> > > 
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > ---
> > > 
> > > Fingers crossed, this is the last RFC for VFIO, but we need
> > > the iommu group support before this can go upstream
> > > (http://lkml.indiana.edu/hypermail/linux/kernel/1110.2/02303.html),
> > > hoping this helps push that along.
> > > 
> > > Since the last posting, this version completely modularizes
> > > the device backends and better defines the APIs between the
> > > core VFIO code and the device backends.  I expect that we
> > > might also adopt a modular IOMMU interface as iommu_ops learns
> > > about different types of hardware.  Also many, many cleanups.
> > > Check the complete git history for details:
> > > 
> > > git://github.com/awilliam/linux-vfio.git vfio-ng
> > > 
> > > (matching qemu tree: git://github.com/awilliam/qemu-vfio.git)
> > > 
> > > This version, along with the supporting VFIO PCI backend can
> > > be found here:
> > > 
> > > git://github.com/awilliam/linux-vfio.git vfio-next-20111103
> > > 
> > > I've held off on implementing a kernel->user signaling
> > > mechanism for now since the previous netlink version produced
> > > too many gag reflexes.  It's easy enough to set a bit in the
> > > group flags too indicate such support in the future, so I
> > > think we can move ahead without it.
> > > 
> > > Appreciate any feedback or suggestions.  Thanks,
> > > 
> > > Alex
> > > 
> > >  Documentation/ioctl/ioctl-number.txt |    1 
> > >  Documentation/vfio.txt               |  304 +++++++++
> > >  MAINTAINERS                          |    8 
> > >  drivers/Kconfig                      |    2 
> > >  drivers/Makefile                     |    1 
> > >  drivers/vfio/Kconfig                 |    8 
> > >  drivers/vfio/Makefile                |    3 
> > >  drivers/vfio/vfio_iommu.c            |  530 ++++++++++++++++
> > >  drivers/vfio/vfio_main.c             | 1151 ++++++++++++++++++++++++++++++++++
> > >  drivers/vfio/vfio_private.h          |   34 +
> > >  include/linux/vfio.h                 |  155 +++++
> > >  11 files changed, 2197 insertions(+), 0 deletions(-)
> > >  create mode 100644 Documentation/vfio.txt
> > >  create mode 100644 drivers/vfio/Kconfig
> > >  create mode 100644 drivers/vfio/Makefile
> > >  create mode 100644 drivers/vfio/vfio_iommu.c
> > >  create mode 100644 drivers/vfio/vfio_main.c
> > >  create mode 100644 drivers/vfio/vfio_private.h
> > >  create mode 100644 include/linux/vfio.h
> > > 
> > > diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
> > > index 54078ed..59d01e4 100644
> > > --- a/Documentation/ioctl/ioctl-number.txt
> > > +++ b/Documentation/ioctl/ioctl-number.txt
> > > @@ -88,6 +88,7 @@ Code  Seq#(hex)	Include File		Comments
> > >  		and kernel/power/user.c
> > >  '8'	all				SNP8023 advanced NIC card
> > >  					<mailto:mcr@solidum.com>
> > > +';'	64-76	linux/vfio.h
> > >  '@'	00-0F	linux/radeonfb.h	conflict!
> > >  '@'	00-0F	drivers/video/aty/aty128fb.c	conflict!
> > >  'A'	00-1F	linux/apm_bios.h	conflict!
> > > diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> > > new file mode 100644
> > > index 0000000..5866896
> > > --- /dev/null
> > > +++ b/Documentation/vfio.txt
> > > @@ -0,0 +1,304 @@
> > > +VFIO - "Virtual Function I/O"[1]
> > > +-------------------------------------------------------------------------------
> > > +Many modern system now provide DMA and interrupt remapping facilities
> > > +to help ensure I/O devices behave within the boundaries they've been
> > > +allotted.  This includes x86 hardware with AMD-Vi and Intel VT-d as
> > > +well as POWER systems with Partitionable Endpoints (PEs) and even
> > > +embedded powerpc systems (technology name unknown).  The VFIO driver
> > > +is an IOMMU/device agnostic framework for exposing direct device
> > > +access to userspace, in a secure, IOMMU protected environment.  In
> > > +other words, this allows safe, non-privileged, userspace drivers.
> > > +
> > > +Why do we want that?  Virtual machines often make use of direct device
> > > +access ("device assignment") when configured for the highest possible
> > > +I/O performance.  From a device and host perspective, this simply turns
> > > +the VM into a userspace driver, with the benefits of significantly
> > > +reduced latency, higher bandwidth, and direct use of bare-metal device
> > > +drivers[2].
> > 
> > Are there any constraints of running a 32-bit userspace with
> > a 64-bit kernel and with 32-bit user space drivers?
> 
> Shouldn't be.  I'll need to do some testing on that, but it was working
> on the previous generation of vfio.

<nods> ok
.. snip..

> > > +#define VFIO_IOMMU_GET_FLAGS            _IOR(';', 105, __u64)
> > > + #define VFIO_IOMMU_FLAGS_MAP_ANY       (1 << 0)
> > > +#define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
> > > +#define VFIO_IOMMU_UNMAP_DMA            _IOWR(';', 107, struct vfio_dma_map)
> > 
> > Coherency support is not going to be addressed right? What about sync?
> > Say you need to sync CPU to Device address?
> 
> Do we need to expose that to userspace or should the underlying
> iommu_ops take care of it?

That I am not sure of. I know that the kernel drivers (especially network ones)
are riddled with:

pci_dma_sync_single_for_cpu(tp->pdev, dma_addr, len, PCI_DMA_FROMDEVICE);
skb_copy_from_linear_data(skb, copy_skb->data, len); 
pci_dma_sync_single_for_device(tp->pdev, dma_addr, len, PCI_DMA_FROMDEVICE);


But I think that has come from the fact that the devices are 32-bit
so they could not do DMA above 4GB. Hence the bounce buffer usage and
the proliferation of pci_dma_sync.. calls to copy the contents to a
bounce buffer if necessary.

But IOMMUs seem to deal with devices that can map the full gamut of memory,
so they are not constrained to 32-bit or 36-bit addressing; rather,
they do the mapping in hardware if necessary.

So I think I just answered the question - which is: No.
.. snip..
> > > +        __u64   vaddr;          /* process virtual addr */
> > > +        __u64   dmaaddr;        /* desired and/or returned dma address */
> > > +        __u64   size;           /* size in bytes */
> > > +        __u64   flags;
> > > +#define VFIO_DMA_MAP_FLAG_WRITE         (1 << 0) /* req writeable DMA mem */
> > > +};
> > > +
> > > +Current users of VFIO use relatively static DMA mappings, not requiring
> > > +high frequency turnover.  As new users are added, it's expected that the
> > 
> > Is there a limit to how many DMA mappings can be created?
> 
> Not that I'm aware of for the current AMD-Vi/VT-d implementations.  I
> suppose iommu_ops would return -ENOSPC if it hit a limit.  I added the

Not -ENOMEM? Either way, might want to mention that in this nice
document.

> VFIO_IOMMU_FLAGS_MAP_ANY flag above to try to identify that kind of
> restriction.
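
For illustration, a minimal sketch of how a userspace driver might establish
one of these static mappings (the fd name, buffer, IOVA value and error
handling are assumptions; only the struct fields, flag and ioctl number come
from the patch):

	struct vfio_dma_map map = {
		.len     = sizeof(map),                 /* length of structure */
		.vaddr   = (__u64)(unsigned long)buf,   /* page-aligned process buffer */
		.dmaaddr = 0x100000,                    /* desired IOVA, made up here */
		.size    = buf_size,
		.flags   = VFIO_DMA_MAP_FLAG_WRITE,
	};

	if (ioctl(iommu_fd, VFIO_IOMMU_MAP_DMA, &map) < 0)
		perror("VFIO_IOMMU_MAP_DMA");   /* -ENOSPC vs -ENOMEM on exhaustion is the open question above */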

.. snip..

> > > +The GET_NUM_REGIONS ioctl tells us how many regions the device supports:
> > > +
> > > +#define VFIO_DEVICE_GET_NUM_REGIONS     _IOR(';', 109, int)
> > 
> > Don't want __u32?
> 
> It could be, not sure if it buys us anything maybe even restricts us.
> We likely don't need 2^32 regions (famous last words?), so we could
> later define <0 to something?

OK.
> 
> > > +
> > > +Regions are described by a struct vfio_region_info, which is retrieved by
> > > +using the GET_REGION_INFO ioctl with vfio_region_info.index field set to
> > > +the desired region (0 based index).  Note that devices may implement zero
> > > +sized regions (vfio-pci does this to provide a 1:1 BAR to region index
> > > +mapping).
> > 
> > Huh?
> 
> PCI has the following static mapping:
> 
> enum {
>         VFIO_PCI_BAR0_REGION_INDEX,
>         VFIO_PCI_BAR1_REGION_INDEX,
>         VFIO_PCI_BAR2_REGION_INDEX,
>         VFIO_PCI_BAR3_REGION_INDEX,
>         VFIO_PCI_BAR4_REGION_INDEX,
>         VFIO_PCI_BAR5_REGION_INDEX,
>         VFIO_PCI_ROM_REGION_INDEX,
>         VFIO_PCI_CONFIG_REGION_INDEX,
>         VFIO_PCI_NUM_REGIONS
> };
> 
> So 8 regions are always reported regardless of whether the device
> implements all the BARs and the ROM.  Then we have a fixed bar:index
> mapping so we don't have to create a region_info field to describe the
> bar number for the index.

OK. Is that a problem if the real device actually has a zero-sized BAR?
Or is a zero-sized BAR in the PCI spec equal to "disabled, not in use"?  Just
wondering whether (-1ULL) should be used instead (which seems to be the case
in QEMU's code).
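
As a purely illustrative sketch of how userspace would cope with those
zero-sized entries, using the GET_NUM_REGIONS/GET_REGION_INFO ioctls from the
document (the fd name and the exact calling convention for the count are
assumptions):

	int i, num_regions = 0;

	ioctl(device_fd, VFIO_DEVICE_GET_NUM_REGIONS, &num_regions);

	for (i = 0; i < num_regions; i++) {
		struct vfio_region_info info = { .len = sizeof(info), .index = i };

		ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info);
		if (!info.size)
			continue;	/* e.g. a BAR the device does not implement */
		/* otherwise access the region through info.offset on device_fd */
	}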

> 
> > > +
> > > +struct vfio_region_info {
> > > +        __u32   len;            /* length of structure */
> > > +        __u32   index;          /* region number */
> > > +        __u64   size;           /* size in bytes of region */
> > > +        __u64   offset;         /* start offset of region */
> > > +        __u64   flags;
> > > +#define VFIO_REGION_INFO_FLAG_MMAP              (1 << 0)
> > > +#define VFIO_REGION_INFO_FLAG_RO                (1 << 1)
> > > +#define VFIO_REGION_INFO_FLAG_PHYS_VALID        (1 << 2)
> > 
> > What is FLAG_MMAP? Does it mean: 1) it can be mmaped, or 2) it is mmaped?
> 
> Supports mmap

> 
> > FLAG_RO is pretty obvious - presumarily this is for firmware regions and such.
> > And PHYS_VALID is if the region is disabled for some reasons? If so
> > would the name FLAG_DISABLED be better?
> 
> No, POWER guys have some need to report the host physical address of the
> region, so the flag indicates whether the below field is present and
> valid.  I'll clarify these in the docs.

Thanks.
.. snip..
> > > +struct vfio_irq_info {
> > > +        __u32   len;            /* length of structure */
> > > +        __u32   index;          /* IRQ number */
> > > +        __u32   count;          /* number of individual IRQs */
> > > +        __u64   flags;
> > > +#define VFIO_IRQ_INFO_FLAG_LEVEL                (1 << 0)
> > > +};
> > > +
> > > +Again, zero count entries are allowed (vfio-pci uses a static interrupt
> > > +type to index mapping).
> > 
> > I am not really sure what that means.
> 
> This is so PCI can expose:
> 
> enum {
>         VFIO_PCI_INTX_IRQ_INDEX,
>         VFIO_PCI_MSI_IRQ_INDEX,
>         VFIO_PCI_MSIX_IRQ_INDEX,
>         VFIO_PCI_NUM_IRQS
> };
> 
> So like regions it always exposes 3 IRQ indexes where count=0 if the
> device doesn't actually support that type of interrupt.  I just want to
> spell out that bus drivers have this kind of flexibility.

I think you should change the comment that says 'IRQ number', as the
first thing that comes to mind is 'GSI' or an MSI/MSI-X vector.
Perhaps '/* index to be used with return value from GET_NUM_IRQS ioctl.
Order of structures can be unsorted. */'

> 
> > > +
> > > +Information about each index can be retrieved using the GET_IRQ_INFO
> > > +ioctl, used much like GET_REGION_INFO.
> > > +
> > > +#define VFIO_DEVICE_GET_IRQ_INFO        _IOWR(';', 112, struct vfio_irq_info)
> > > +
> > > +Individual indexes can describe single or sets of IRQs.  This provides the
> > > +flexibility to describe PCI INTx, MSI, and MSI-X using a single interface.
> > > +
> > > +All VFIO interrupts are signaled to userspace via eventfds.  Integer arrays,
> > > +as shown below, are used to pass the IRQ info index, the number of eventfds,
> > > +and each eventfd to be signaled.  Using a count of 0 disables the interrupt.
> > > +
> > > +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
> > 
> > Are eventfds u64 or u32?
> 
> int, they're just file descriptors
> 
> > Why not just define a structure?
> > struct vfio_irq_eventfds {
> > 	__u32	index;
> > 	__u32	count;
> > 	__u64	eventfds[0]
> > };
> 
> We could do that if preferred.  Hmm, are we then going to need
> size/flags?

Sure.

> 
> > How do you get an eventfd to feed in here?
> 
> eventfd(2), in qemu event_notifier_init() -> event_notifier_get_fd()
> 
> > > +#define VFIO_DEVICE_SET_IRQ_EVENTFDS    _IOW(';', 113, int)
> > 
> > u32?
> 
> Not here, it's an fd, so should be an int.
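
To make the integer-array convention concrete, a rough sketch of enabling a
few MSI vectors from userspace (the index value comes from the vfio-pci enum
quoted earlier in the thread; the fd name and vector count are assumptions):

	int i, fds[6];		/* room for index, count, and 4 MSI eventfds */

	fds[0] = VFIO_PCI_MSI_IRQ_INDEX;	/* arg[0] = index */
	fds[1] = 4;				/* arg[1] = count */
	for (i = 0; i < 4; i++)
		fds[2 + i] = eventfd(0, 0);	/* arg[2-n] = eventfds to be signaled */

	ioctl(device_fd, VFIO_DEVICE_SET_IRQ_EVENTFDS, fds);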
> 
> > > +
> > > +When a level triggered interrupt is signaled, the interrupt is masked
> > > +on the host.  This prevents an unresponsive userspace driver from
> > > +continuing to interrupt the host system.  After servicing the interrupt,
> > > +UNMASK_IRQ is used to allow the interrupt to retrigger.  Note that level
> > > +triggered interrupts implicitly have a count of 1 per index.
> > 
> > So they are enabled automatically? Meaning you don't even hav to do
> > SET_IRQ_EVENTFDS b/c the count is set to 1?
> 
> I suppose that should be "no more than 1 per index" (ie. PCI would
> report a count of 0 for VFIO_PCI_INTX_IRQ_INDEX if the device doesn't
> support INTx).  I think you might be confusing VFIO_DEVICE_GET_IRQ_INFO
> which tells how many are available with VFIO_DEVICE_SET_IRQ_EVENTFDS
> which does the enabling/disabling.  All interrupts are disabled by
> default because userspace needs to give us a way to signal them via
> eventfds.  It will be device dependent whether multiple index can be
> enabled simultaneously.  Hmm, is that another flag on the irq_info
> struct or do we expect drivers to implicitly have that kind of
> knowledge?

Right, that was what I was wondering. Not sure how the PowerPC
world works with this.

> 
> > > +
> > > +/* Unmask IRQ index, arg[0] = index */
> > > +#define VFIO_DEVICE_UNMASK_IRQ          _IOW(';', 114, int)
> > 
> > So this is for MSI as well? So if I've an index = 1, with count = 4,
> > and doing unmaks IRQ will chip enable all the MSI event at once?
> 
> No, this is only for re-enabling level triggered interrupts as discussed
> above.  Edge triggered interrupts like MSI don't need an unmask... we
> may want to do something to accelerate the MSI-X table access for
> masking specific interrupts, but I figured that would need to be PCI
> aware since those are PCI features, and would therefore be some future
> extension of the PCI bus driver and exposed via VFIO_DEVICE_GET_FLAGS.

OK.
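
A sketch of the resulting userspace flow for a level-triggered (INTx)
interrupt, illustrative only; just the mask-on-signal/UNMASK_IRQ behaviour is
from the document, the fd names are assumptions:

	uint64_t cnt;
	int index = VFIO_PCI_INTX_IRQ_INDEX;

	read(intx_eventfd, &cnt, sizeof(cnt));	/* interrupt fired; host has masked it */
	/* ... service the device ... */
	ioctl(device_fd, VFIO_DEVICE_UNMASK_IRQ, &index);	/* allow it to retrigger */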
> 
> > I guess there is not much point in enabling/disabling selective MSI
> > IRQs..
> 
> Some older OSes are said to make extensive use of masking for MSI, so we
> probably want this at some point.  I'm assuming future PCI extension for
> now.
> 
> > > +
> > > +Level triggered interrupts can also be unmasked using an irqfd.  Use
> > 
> > irqfd or eventfd?
> 
> irqfd is an eventfd in reverse.  eventfd = kernel signals userspace via
> an fd, irqfd = userspace signals kernel via an fd.

Ah neat.
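
For reference, the irqfd variant would presumably be set up once, roughly as
below (a sketch only; the two-element array convention is assumed to mirror
SET_IRQ_EVENTFDS):

	int unmask[2];

	unmask[0] = VFIO_PCI_INTX_IRQ_INDEX;	/* arg[0] = index */
	unmask[1] = eventfd(0, 0);		/* arg[1] = eventfd the user signals to unmask */

	ioctl(device_fd, VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD, unmask);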

> 
> > > +SET_UNMASK_IRQ_EVENTFD to set the file descriptor for this.
> > 
> > So only level triggered? Hmm, how do I know whether the device is
> > level or edge? Or is that edge (MSI) can also be unmaked using the
> > eventfs
> 
> Yes, only for level.  Isn't a device going to know what type of
> interrupt it uses?  MSI masking is PCI specific, not handled by this.

I certainly hope it knows, but you know buggy drivers do exist.

What would be the return value if somebody tried to unmask an edge one?
Should that be documented here? -ENOSPEC?

> 
> > > +
> > > +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
> > > +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD      _IOW(';', 115, int)
> > > +
> > > +When supported, as indicated by the device flags, reset the device.
> > > +
> > > +#define VFIO_DEVICE_RESET               _IO(';', 116)
> > 
> > Does it disable the 'count'? Err, does it disable the IRQ on the
> > device after this and one should call VFIO_DEVICE_SET_IRQ_EVENTFDS
> > to set new eventfds? Or does it re-use the eventfds and the device
> > is enabled after this?
> 
> It doesn't affect the interrupt programming.  Should it?

I would hope not, but I am trying to think of ways one could screw this up.
Perhaps just saying that - "No need to call VFIO_DEVICE_SET_IRQ_EVENTFDS
as the kernel (and the device) will retain the interrupt.".
.. snip..
> > I am not really sure what this section purpose is? Could this be part
> > of the header file or the code? It does not look to be part of the
> > ioctl API?
> 
> We've passed into the "VFIO bus driver API" section of the document, to
> explain the interaction between vfio-core and vfio bus drivers.

Perhaps a different file?
.. large snip ..
> > > +
> > > +	mutex_lock(&iommu->dgate);
> > > +	list_for_each_safe(pos, pos2, &iommu->dm_list) {
> > > +		mlp = list_entry(pos, struct dma_map_page, list);
> > > +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> > 
> > Uh, so if it did not get put_page() we would try to still delete it?
> > Couldn't that lead to corruption as the 'mlp' is returned to the poll?
> > 
> > Ah wait, the put_page is on the DMA page, so it is OK to
> > delete the tracking structure. It will be just a leaked page.
> 
> Assume you're referencing this chunk:
> 
> vfio_dma_unmap
>   __vfio_dma_unmap
>     ...
>         pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
>         if (pfn) {
>                 iommu_unmap(iommu->domain, iova, 0);
>                 unlocked += put_pfn(pfn, rdwr);
>         }
> 
> So we skip things that aren't mapped in the iommu, but anything not
> mapped should have already been put (failed vfio_dma_map).  If it is
> mapped, we put it if we originally got it via get_user_pages_fast.
> unlocked would only not get incremented here if it was an mmap'd page
> (such as the mmap of an mmio space of another vfio device), via the code
> in vaddr_get_pfn (stolen from KVM).

Yup. Sounds right.
.. snip..
> > > +module_param(allow_unsafe_intrs, int, 0);
> > 
> > S_IRUGO ?
> 
> I actually intended that to be S_IRUGO | S_IWUSR just like the kvm
> parameter so it can be toggled runtime.

OK.
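
For reference, with that change the declaration would presumably end up
reading (mirroring the kvm module parameter Alex mentions):

	static int allow_unsafe_intrs;
	module_param(allow_unsafe_intrs, int, S_IRUGO | S_IWUSR);
	MODULE_PARM_DESC(allow_unsafe_intrs,
	        "Allow use of IOMMUs which do not support interrupt remapping");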
> 
> > > +MODULE_PARM_DESC(allow_unsafe_intrs,
> > > +        "Allow use of IOMMUs which do not support interrupt remapping");
> > > +
> > > +static struct vfio {
> > > +	dev_t			devt;
> > > +	struct cdev		cdev;
> > > +	struct list_head	group_list;
> > > +	struct mutex		lock;
> > > +	struct kref		kref;
> > > +	struct class		*class;
> > > +	struct idr		idr;
> > > +	wait_queue_head_t	release_q;
> > > +} vfio;
> > 
> > You probably want to move this below the 'vfio_group'
> > as vfio contains the vfio_group.
> 
> Only via the group_list.  Are you suggesting for readability or to avoid
> forward declarations (which we don't need between these two with current
> ordering).

Just for readability.

> 
> > > +
> > > +static const struct file_operations vfio_group_fops;
> > > +extern const struct file_operations vfio_iommu_fops;
> > > +
> > > +struct vfio_group {
> > > +	dev_t			devt;
> > > +	unsigned int		groupid;
> > > +	struct bus_type		*bus;
> > > +	struct vfio_iommu	*iommu;
> > > +	struct list_head	device_list;
> > > +	struct list_head	iommu_next;
> > > +	struct list_head	group_next;
> > > +	int			refcnt;
> > > +};
> > > +
> > > +struct vfio_device {
> > > +	struct device			*dev;
> > > +	const struct vfio_device_ops	*ops;
> > > +	struct vfio_iommu		*iommu;
> > > +	struct vfio_group		*group;
> > > +	struct list_head		device_next;
> > > +	bool				attached;
> > > +	int				refcnt;
> > > +	void				*device_data;
> > > +};
> > 
> > And perhaps move this above vfio_group. As vfio_group
> > contains a list of these structures?
> 
> These are inter-linked, so chicken and egg.  The current ordering is
> more function based than definition based.  struct vfio is the highest
> level object, groups are next, iommus and devices are next, but we need
> to share iommus with the other file, so that lands in the header.

Ah, OK.
> 
> > > +
> > > +/*
> > > + * Helper functions called under vfio.lock
> > > + */
> > > +
> > > +/* Return true if any devices within a group are opened */
> > > +static bool __vfio_group_devs_inuse(struct vfio_group *group)
> > > +{
> > > +	struct list_head *pos;
> > > +
> > > +	list_for_each(pos, &group->device_list) {
> > > +		struct vfio_device *device;
> > > +
> > > +		device = list_entry(pos, struct vfio_device, device_next);
> > > +		if (device->refcnt)
> > > +			return true;
> > > +	}
> > > +	return false;
> > > +}
> > > +
> > > +/* Return true if any of the groups attached to an iommu are opened.
> > > + * We can only tear apart merged groups when nothing is left open. */
> > > +static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu)
> > > +{
> > > +	struct list_head *pos;
> > > +
> > > +	list_for_each(pos, &iommu->group_list) {
> > > +		struct vfio_group *group;
> > > +
> > > +		group = list_entry(pos, struct vfio_group, iommu_next);
> > > +		if (group->refcnt)
> > > +			return true;
> > > +	}
> > > +	return false;
> > > +}
> > > +
> > > +/* An iommu is "in use" if it has a file descriptor open or if any of
> > > + * the groups assigned to the iommu have devices open. */
> > > +static bool __vfio_iommu_inuse(struct vfio_iommu *iommu)
> > > +{
> > > +	struct list_head *pos;
> > > +
> > > +	if (iommu->refcnt)
> > > +		return true;
> > > +
> > > +	list_for_each(pos, &iommu->group_list) {
> > > +		struct vfio_group *group;
> > > +
> > > +		group = list_entry(pos, struct vfio_group, iommu_next);
> > > +
> > > +		if (__vfio_group_devs_inuse(group))
> > > +			return true;
> > > +	}
> > > +	return false;
> > > +}
> > > +
> > > +static void __vfio_group_set_iommu(struct vfio_group *group,
> > > +				   struct vfio_iommu *iommu)
> > > +{
> > > +	struct list_head *pos;
> > > +
> > > +	if (group->iommu)
> > > +		list_del(&group->iommu_next);
> > > +	if (iommu)
> > > +		list_add(&group->iommu_next, &iommu->group_list);
> > > +
> > > +	group->iommu = iommu;
> > > +
> > > +	list_for_each(pos, &group->device_list) {
> > > +		struct vfio_device *device;
> > > +
> > > +		device = list_entry(pos, struct vfio_device, device_next);
> > > +		device->iommu = iommu;
> > > +	}
> > > +}
> > > +
> > > +static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,
> > > +				    struct vfio_device *device)
> > > +{
> > > +	BUG_ON(!iommu->domain && device->attached);
> > 
> > Whoa. Heavy hammer there.
> > 
> > Perhaps WARN_ON as you do check it later on.
> 
> I think it's warranted, internal consistency is broken if we have a
> device that thinks it's attached to an iommu domain that doesn't exist.
> It should, of course, never happen and this isn't a performance path.
> 
> > > +
> > > +	if (!iommu->domain || !device->attached)
> > > +		return;

Well, the deal is that you BUG_ON earlier, but you check for it here.
The BUG_ON will stop execution, so the 'if ..' check is actually
not needed.
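
(If the BUG_ON were demoted to a WARN_ON, the later check would again be
meaningful; purely as an illustrative sketch of that variant, built from the
code quoted below:)

	static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,
					    struct vfio_device *device)
	{
		/* warn on inconsistent state rather than halting the kernel */
		WARN_ON(!iommu->domain && device->attached);

		if (!iommu->domain || !device->attached)
			return;

		iommu_detach_device(iommu->domain, device->dev);
		device->attached = false;
	}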


> > > +
> > > +	iommu_detach_device(iommu->domain, device->dev);
> > > +	device->attached = false;
> > > +}
> > > +
> > > +static void __vfio_iommu_detach_group(struct vfio_iommu *iommu,
> > > +				      struct vfio_group *group)
> > > +{
> > > +	struct list_head *pos;
> > > +
> > > +	list_for_each(pos, &group->device_list) {
> > > +		struct vfio_device *device;
> > > +
> > > +		device = list_entry(pos, struct vfio_device, device_next);
> > > +		__vfio_iommu_detach_dev(iommu, device);
> > > +	}
> > > +}
> > > +
> > > +static int __vfio_iommu_attach_dev(struct vfio_iommu *iommu,
> > > +				   struct vfio_device *device)
> > > +{
> > > +	int ret;
> > > +
> > > +	BUG_ON(device->attached);
> > 
> > How about:
> > 
> > WARN_ON(device->attached, "The engineer who wrote the user-space device driver is trying to register
> > the device again! Tell him/her to stop please.\n");
> 
> I would almost demote this one to a WARN_ON, but userspace isn't in
> control of attaching and detaching devices from the iommu.  That's a
> side effect of getting the iommu or device file descriptor.  So again,
> this is an internal consistency check and it should never happen,
> regardless of userspace.
> 

Ok, then you might want to expand it to

BUG_ON(!device  || device->attached);

In case something has gone horribly wrong.


.. snip..
> > > +		group->devt = MKDEV(MAJOR(vfio.devt), minor);
> > > +		device_create(vfio.class, NULL, group->devt,
> > > +			      group, "%u", groupid);
> > > +
> > > +		group->bus = dev->bus;
> > 
> > 
> > Oh, so that is how the IOMMU iommu_ops get copied! You might
> > want to mention that - I was not sure where the 'handoff' is
> > was done to insert a device so that it can do iommu_ops properly.
> > 
> > Ok, so the time when a device is detected whether it can do
> > IOMMU is when we try to open it - as that is when iommu_domain_alloc
> > is called which can return NULL if the iommu_ops is not set.
> > 
> > So what about devices that don't have an iommu_ops? Say they
> > are using SWIOTLB? (like the AMD-Vi sometimes does if the
> > device is not on its list).
> > 
> > Can we use iommu_present?
> 
> I'm not sure I'm following your revelation ;)  Take a look at the

I am trying to figure out who sets iommu_ops on the devices.

> pointer to iommu_device_group I pasted above, or these:
> 
> https://github.com/awilliam/linux-vfio/commit/37dd08c90d149caaed7779d4f38850a8f7ed0fa5
> https://github.com/awilliam/linux-vfio/commit/63ca8543533d8130db23d7949133e548c3891c97
> https://github.com/awilliam/linux-vfio/commit/8d7d70eb8e714fbf8710848a06f8cab0c741631e
> 
> That call includes an iommu_present() check, so if there's no iommu or
> the iommu can't provide a groupid, the device is skipped over from vfio
> (can't be used).
> 
> So the ordering is:
> 
>  - bus driver registers device
>    - if it has an iommu group, add it to the vfio device/group tracking
> 
>  - group gets opened
>    - user gets iommu or device fd results in iommu_domain_alloc
> 
> Devices without iommu_ops don't get to play in the vfio world.

Right, and I think which devices get iommu_ops is determined via
bus_set_iommu.

(Thinking in the long term about what would be required to make this work
with Xen, it sounds like I will need to implement a Xen IOMMU driver.)
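
For reference, that registration is the per-bus-type call an IOMMU driver
makes, roughly of the form below (the ops name here is just an example taken
from the existing x86 drivers; a Xen IOMMU driver would register its own):

	bus_set_iommu(&pci_bus_type, &amd_iommu_ops);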
 

.. snip..
> > 
> > So where is the vfio-pci? Is that a seperate posting?
> 
> You can find it in the tree pointed to in the patch description:
> 
> https://github.com/awilliam/linux-vfio/commit/534725d327e2b7791a229ce72d2ae8a62ee0a4e5

Thanks.

> 
> I was hoping to get some consensus around the new core before spending
> too much time polishing up the bus driver.  Thanks for the review, it's
> very much appreciated!

Sure thing.
> 
> Alex
>
Scott Wood Nov. 16, 2011, 5:47 p.m. UTC | #28
On 11/11/2011 04:10 PM, Alex Williamson wrote:
> 
> Thanks Konrad!  Comments inline.
> 
> On Fri, 2011-11-11 at 12:51 -0500, Konrad Rzeszutek Wilk wrote:
>> On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
>>> +When supported, as indicated by the device flags, reset the device.
>>> +
>>> +#define VFIO_DEVICE_RESET               _IO(';', 116)
>>
>> Does it disable the 'count'? Err, does it disable the IRQ on the
>> device after this and one should call VFIO_DEVICE_SET_IRQ_EVENTFDS
>> to set new eventfds? Or does it re-use the eventfds and the device
>> is enabled after this?
> 
> It doesn't affect the interrupt programming.  Should it?

It should probably clear any currently pending interrupts, as if the
unmask IOCTL were called.

>>> +device tree properties of the device:
>>> +
>>> +struct vfio_dtpath {
>>> +        __u32   len;            /* length of structure */
>>> +        __u32   index;
>>
>> 0 based I presume?
> 
> Everything else is, I would assume so/

Yes, it should be zero-based -- this matches how such indices are done
in the kernel device tree APIs.

>>> +        __u64   flags;
>>> +#define VFIO_DTPATH_FLAGS_REGION        (1 << 0)
>>
>> What is region in this context?? Or would this make much more sense
>> if I knew what Device Tree actually is.
> 
> Powerpc guys, any comments?  This was their suggestion.  These are
> effectively the first device specific extension, available when
> VFIO_DEVICE_FLAGS_DT is set.

An assigned device may consist of an entire subtree of the device tree,
and both register banks and interrupts can come from any node in the
tree.  Region versus IRQ here indicates the context in which to
interpret index, in order to retrieve the path of the node that supplied
this particular region or IRQ.

>>> +};
>>> +#define VFIO_DEVICE_GET_DTPATH          _IOWR(';', 117, struct vfio_dtpath)
>>> +
>>> +struct vfio_dtindex {
>>> +        __u32   len;            /* length of structure */
>>> +        __u32   index;
>>> +        __u32   prop_type;
>>
>> Is that an enum type? Is this definied somewhere?
>>> +        __u32   prop_index;
>>
>> What is the purpose of this field?
> 
> Need input from powerpc folks here

To identify what resource (register bank or IRQ) this is, we need
both the path to the node and the index into the reg or interrupts
property within the node.

We also need to distinguish reg from ranges, and interrupts from
interrupt-map.  As you suggested elsewhere in the thread, the device
tree API should probably be left out for now, and added later along with
the device tree "bus" driver.

>>> +static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,
>>> +				    struct vfio_device *device)
>>> +{
>>> +	BUG_ON(!iommu->domain && device->attached);
>>
>> Whoa. Heavy hammer there.
>>
>> Perhaps WARN_ON as you do check it later on.
> 
> I think it's warranted, internal consistency is broken if we have a
> device that thinks it's attached to an iommu domain that doesn't exist.
> It should, of course, never happen and this isn't a performance path.
> 
[snip]
>>> +static int __vfio_iommu_attach_dev(struct vfio_iommu *iommu,
>>> +				   struct vfio_device *device)
>>> +{
>>> +	int ret;
>>> +
>>> +	BUG_ON(device->attached);
>>
>> How about:
>>
>> WARN_ON(device->attached, "The engineer who wrote the user-space device driver is trying to register
>> the device again! Tell him/her to stop please.\n");
> 
> I would almost demote this one to a WARN_ON, but userspace isn't in
> control of attaching and detaching devices from the iommu.  That's a
> side effect of getting the iommu or device file descriptor.  So again,
> this is an internal consistency check and it should never happen,
> regardless of userspace.

The rule isn't to use BUG for internal consistency checks and WARN for
stuff userspace can trigger, but rather to use BUG if you cannot
reasonably continue, WARN for "significant issues that need prompt
attention" that are reasonably recoverable.  Most instances of WARN are
internal consistency checks.

From include/asm-generic/bug.h:
> If you're tempted to BUG(), think again:  is completely giving up
> really the *only* solution?  There are usually better options, where
> users don't need to reboot ASAP and can mostly shut down cleanly.

-Scott
Alex Williamson Nov. 16, 2011, 11:34 p.m. UTC | #29
On Tue, 2011-11-15 at 16:29 -0600, Scott Wood wrote:
> On 11/15/2011 03:40 PM, Aaron Fabbri wrote:
> > 
> > 
> > 
> > On 11/15/11 12:10 PM, "Scott Wood" <scottwood@freescale.com> wrote:
> > 
> >> On 11/15/2011 12:34 AM, David Gibson wrote:
> > <snip> 
> >>>> +static int allow_unsafe_intrs;
> >>>> +module_param(allow_unsafe_intrs, int, 0);
> >>>> +MODULE_PARM_DESC(allow_unsafe_intrs,
> >>>> +        "Allow use of IOMMUs which do not support interrupt remapping");
> >>>
> >>> This should not be a global option, but part of the AMD/Intel IOMMU
> >>> specific code.  In general it's a question of how strict the IOMMU
> >>> driver is about isolation when it determines what the groups are, and
> >>> only the IOMMU driver can know what the possibilities are for its
> >>> class of hardware.
> >>
> >> It's also a concern that is specific to MSIs.  In any case, I'm not sure
> >> that the ability to cause a spurious IRQ is bad enough to warrant
> >> disabling the entire subsystem by default on certain hardware.
> > 
> > I think the issue is more that the ability to create fake MSI interrupts can
> > lead to bigger exploits.
> > 
> > Originally we didn't have this parameter. It was added it to reflect the
> > fact that MSI's triggered by guests are dangerous without the isolation that
> > interrupt remapping provides.
> > 
> > That is, it *should* be inconvenient to run without interrupt mapping HW
> > support.
> 
> A sysfs knob is sufficient inconvenience.  It should only affect whether
> you can use MSIs, and the relevant issue shouldn't be "has interrupt
> remapping" but "is there a hole".
> 
> Some systems might address the issue in ways other than IOMMU-level MSI
> translation.  Our interrupt controller provides enough separate 4K pages
> for MSI interrupt delivery for each PCIe IOMMU group to get its own (we
> currently only have 3, one per root complex) -- no special IOMMU feature
> required.
> 
> It doesn't help that the semantics of IOMMU_CAP_INTR_REMAP are
> undefined.  I shouldn't have to know how x86 IOMMUs work when
> implementing a driver for different hardware, just to know what the
> generic code is expecting.
> 
> As David suggests, if you want to do this it should be the x86 IOMMU
> driver that has a knob that controls how it forms groups in the absence
> of this support.

That is a possibility, we could push it down to the iommu driver which
could simply lump everything into a single groupid when interrupt
remapping is not supported.  Or more directly, when there is an exposure
that devices can trigger random MSIs in the host.  Then we wouldn't need
an option to override this in vfio, you'd just be stuck not being able
to use any devices if you can't bind everything to vfio.  That also
eliminates the possibility of flipping it on dynamically since we can't
handle groupids changing.  Then we'd need an iommu=group_unsafe_msi flag
to enable it.  Ok?  Thanks,

Alex
David Gibson Nov. 17, 2011, 12:02 a.m. UTC | #30
On Tue, Nov 15, 2011 at 11:01:28AM -0700, Alex Williamson wrote:
> On Tue, 2011-11-15 at 17:34 +1100, David Gibson wrote:
> > On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
> > > diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> > > new file mode 100644
> > > index 0000000..5866896
> > > --- /dev/null
> > > +++ b/Documentation/vfio.txt
> > > @@ -0,0 +1,304 @@
> > > +VFIO - "Virtual Function I/O"[1]
> > > +-------------------------------------------------------------------------------
> > > +Many modern system now provide DMA and interrupt remapping facilities
> > > +to help ensure I/O devices behave within the boundaries they've been
> > > +allotted.  This includes x86 hardware with AMD-Vi and Intel VT-d as
> > > +well as POWER systems with Partitionable Endpoints (PEs) and even
> > > +embedded powerpc systems (technology name unknown).  The VFIO driver
> > > +is an IOMMU/device agnostic framework for exposing direct device
> > > +access to userspace, in a secure, IOMMU protected environment.  In
> > > +other words, this allows safe, non-privileged, userspace drivers.
> > 
> > It's perhaps worth emphasisng that "safe" depends on the hardware
> > being sufficiently well behaved.  BenH, I know, thinks there are a
> > *lot* of cards that, e.g. have debug registers that allow a backdoor
> > to their own config space via MMIO, which would bypass vfio's
> > filtering of config space access.  And that's before we even get into
> > the varying degrees of completeness in the isolation provided by
> > different IOMMUs.
> 
> Fair enough.  I know Tom had emphasized "well behaved" in the original
> doc.  Virtual functions are probably the best indicator of well behaved.
> 
> > > +Why do we want that?  Virtual machines often make use of direct device
> > > +access ("device assignment") when configured for the highest possible
> > > +I/O performance.  From a device and host perspective, this simply turns
> > > +the VM into a userspace driver, with the benefits of significantly
> > > +reduced latency, higher bandwidth, and direct use of bare-metal device
> > > +drivers[2].
> > > +
> > > +Some applications, particularly in the high performance computing
> > > +field, also benefit from low-overhead, direct device access from
> > > +userspace.  Examples include network adapters (often non-TCP/IP based)
> > > +and compute accelerators.  Previous to VFIO, these drivers needed to
> > 
> > s/Previous/Prior/  although that may be a .us vs .au usage thing.
> 
> Same difference, AFAICT.
> 
> > > +go through the full development cycle to become proper upstream driver,
> > > +be maintained out of tree, or make use of the UIO framework, which
> > > +has no notion of IOMMU protection, limited interrupt support, and
> > > +requires root privileges to access things like PCI configuration space.
> > > +
> > > +The VFIO driver framework intends to unify these, replacing both the
> > > +KVM PCI specific device assignment currently used as well as provide
> > > +a more secure, more featureful userspace driver environment than UIO.
> > > +
> > > +Groups, Devices, IOMMUs, oh my
> > > +-------------------------------------------------------------------------------
> > > +
> > > +A fundamental component of VFIO is the notion of IOMMU groups.  IOMMUs
> > > +can't always distinguish transactions from each individual device in
> > > +the system.  Sometimes this is because of the IOMMU design, such as with
> > > +PEs, other times it's caused by the I/O topology, for instance a
> > > +PCIe-to-PCI bridge masking all devices behind it.  We call the sets of
> > > +devices created by these restictions IOMMU groups (or just "groups" for
> > > +this document).
> > > +
> > > +The IOMMU cannot distiguish transactions between the individual devices
> > > +within the group, therefore the group is the basic unit of ownership for
> > > +a userspace process.  Because of this, groups are also the primary
> > > +interface to both devices and IOMMU domains in VFIO.
> > > +
> > > +The VFIO representation of groups is created as devices are added into
> > > +the framework by a VFIO bus driver.  The vfio-pci module is an example
> > > +of a bus driver.  This module registers devices along with a set of bus
> > > +specific callbacks with the VFIO core.  These callbacks provide the
> > > +interfaces later used for device access.  As each new group is created,
> > > +as determined by iommu_device_group(), VFIO creates a /dev/vfio/$GROUP
> > > +character device.
> > 
> > Ok.. so, the fact that it's called "vfio-pci" suggests that the VFIO
> > bus driver is per bus type, not per bus instance.   But grouping
> > constraints could be per bus instance, if you have a couple of
> > different models of PCI host bridge with IOMMUs of different
> > capabilities built in, for example.
> 
> Yes, vfio-pci manages devices on the pci_bus_type; per type, not per bus
> instance.

Ok, how can that work?  vfio-pci is responsible for generating the
groupings, yes?  For which it needs to know the iommu/host bridge's
isolation capabilities, which vary depending on the type of host
bridge.

>  IOMMUs also register drivers per bus type, not per bus
> instance.  The IOMMU driver is free to impose any constraints it wants.
> 
> > > +In addition to the device enumeration and callbacks, the VFIO bus driver
> > > +also provides a traditional device driver and is able to bind to devices
> > > +on it's bus.  When a device is bound to the bus driver it's available to
> > > +VFIO.  When all the devices within a group are bound to their bus drivers,
> > > +the group becomes "viable" and a user with sufficient access to the VFIO
> > > +group chardev can obtain exclusive access to the set of group devices.
> > > +
> > > +As documented in linux/vfio.h, several ioctls are provided on the
> > > +group chardev:
> > > +
> > > +#define VFIO_GROUP_GET_FLAGS            _IOR(';', 100, __u64)
> > > + #define VFIO_GROUP_FLAGS_VIABLE        (1 << 0)
> > > + #define VFIO_GROUP_FLAGS_MM_LOCKED     (1 << 1)
> > > +#define VFIO_GROUP_MERGE                _IOW(';', 101, int)
> > > +#define VFIO_GROUP_UNMERGE              _IOW(';', 102, int)
> > > +#define VFIO_GROUP_GET_IOMMU_FD         _IO(';', 103)
> > > +#define VFIO_GROUP_GET_DEVICE_FD        _IOW(';', 104, char *)
> > > +
> > > +The last two ioctls return new file descriptors for accessing
> > > +individual devices within the group and programming the IOMMU.  Each of
> > > +these new file descriptors provide their own set of file interfaces.
> > > +These ioctls will fail if any of the devices within the group are not
> > > +bound to their VFIO bus driver.  Additionally, when either of these
> > > +interfaces are used, the group is then bound to the struct_mm of the
> > > +caller.  The GET_FLAGS ioctl can be used to view the state of the group.
> > > +
> > > +When either the GET_IOMMU_FD or GET_DEVICE_FD ioctls are invoked, a
> > > +new IOMMU domain is created and all of the devices in the group are
> > > +attached to it.  This is the only way to ensure full IOMMU isolation
> > > +of the group, but potentially wastes resources and cycles if the user
> > > +intends to manage multiple groups with the same set of IOMMU mappings.
> > > +VFIO therefore provides a group MERGE and UNMERGE interface, which
> > > +allows multiple groups to share an IOMMU domain.  Not all IOMMUs allow
> > > +arbitrary groups to be merged, so the user should assume merging is
> > > +opportunistic.
> > 
> > I do not think "opportunistic" means what you think it means..
> > 
> > >  A new group, with no open device or IOMMU file
> > > +descriptors, can be merged into an existing, in-use, group using the
> > > +MERGE ioctl.  A merged group can be unmerged using the UNMERGE ioctl
> > > +once all of the device file descriptors for the group being merged
> > > +"out" are closed.
> > > +
> > > +When groups are merged, the GET_IOMMU_FD and GET_DEVICE_FD ioctls are
> > > +essentially fungible between group file descriptors (ie. if device A
> > 
> > IDNT "fungible" MWYTIM, either.
> 
> Hmm, feel free to suggest.  Maybe we're hitting .us vs .au connotation.

In any case, I don't think it's a word whose meaning is unambiguous
enough to use here.

> > > +is in group X, and X is merged with Y, a file descriptor for A can be
> > > +retrieved using GET_DEVICE_FD on Y.  Likewise, GET_IOMMU_FD returns a
> > > +file descriptor referencing the same internal IOMMU object from either
> > > +X or Y).  Merged groups can be dissolved either explictly with UNMERGE
> > > +or automatically when ALL file descriptors for the merged group are
> > > +closed (all IOMMUs, all devices, all groups).
> > 
> > Blech.  I'm really not liking this merge/unmerge API as it stands,
> > it's horribly confusing.  At the very least, we need some better
> > terminology.  We need some term for the metagroups; supergroups; iommu
> > domains or-at-least-they-will-be-once-we-open-the-iommu or
> > whathaveyous.
> > 
> > The first confusing thing about this interface is that each open group
> > handle actually refers to two different things; the original group you
> > opened and the metagroup it's a part of.  For the GET_IOMMU_FD and
> > GET_DEVICE_FD operations, you're using the metagroup and two "merged"
> > group handles are interchangeable.
> 
> Fungible, even ;)
> 
> > For other MERGE and especially
> > UNMERGE operations, it matters which is the original group.
> 
> If I stick two LEGO blocks together, I need to identify the individual
> block I want to remove to pull them back apart...

Yeah, I'm starting to get my head around the model, but the current
description of it doesn't help very much.  In particular the terms
"merge" and "unmerge" lead one to the wrong mental model, I think.

> > The semantics of "merge" and "unmerge" under those names are really
> > non-obvious.  Merge kind of has to merge two whole metagroups, but
> > it's unclear if unmerge reverses one merge, or just takes out one
> > (atom) group.  These operations need better names, at least.
> 
> Christian suggested a change to UNMERGE that we do not need to
> specify a group to unmerge "from".  This makes it more like a list
> implementation except there's no defined list_head.  Any member of the
> list can pull in a new entry.  Calling UNMERGE on any member extracts
> that member.

I think that's a good idea, but "unmerge" is not a good word for it.

> > Then it's unclear what order you can do various operations, and which
> > order you can open and close various things.  You can kind of figure
> > it out but it takes far more thinking than it should.
> > 
> > 
> > So at the _very_ least, we need to invent new terminology and find a
> > much better way of describing this API's semantics.  I still think an
> > entirely different interface, where metagroups are created from
> > outside with a lifetime that's not tied to an fd would be a better
> > idea.
> 
> As we've discussed previously, configfs provides part of this, but has
> no ioctl support.  It doesn't make sense to me to go play with groups in
> configfs, but then still interact with them via a char dev.

Why not?  You configure, say, loopback devices with losetup, then use
them as a block device.  Similar with nbd.  You can configure serial
devices with setserial, then use them as a char dev.

>  It also
> splits the ownership model 

I'm not even sure what that means.

> and makes it harder to enforce who gets to
> interact with the devices vs who gets to manipulate groups.

How so.

>  The current
> model really isn't that complicated, imho.  As always, feel free to
> suggest specific models.  If you have a specific terminology other than
> MERGE, please suggest.
> 
> > Now, you specify that you can't use a group as the second argument of
> > a merge if it already has an open iommu, but it's not clear from the
> > doc if you can merge things into a group with an open iommu.
> 
> From above:
> 
>         A new group, with no open device or IOMMU file descriptors, can
>         be merged into an existing, in-use, group using the MERGE ioctl.
>                                  ^^^^^^
> 
> > Banning
> > this would make life simpler, because the IOMMU's effective
> > capabilities may change if you add more devices to the domain.  That's
> > yet another non-obvious constraint in the interface ordering, though.
> 
> Banning this would prevent using merged groups with hotplug, which I
> consider to be a primary use case.

Yeah, fair enough, based on your later comments w.r.t. only combining
feature compatible groups.
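
Stepping back, the basic (un-merged) flow through the group chardev, as
described earlier in the document, would look roughly like this from
userspace (the group number, device name string and error handling are
assumptions; the ioctls and the VIABLE flag are from the patch):

	__u64 flags = 0;
	int group_fd, iommu_fd, device_fd;

	group_fd = open("/dev/vfio/26", O_RDWR);	/* "26" is a made-up $GROUP */

	ioctl(group_fd, VFIO_GROUP_GET_FLAGS, &flags);
	if (!(flags & VFIO_GROUP_FLAGS_VIABLE))
		return -1;	/* some device in the group not bound to its vfio bus driver */

	iommu_fd  = ioctl(group_fd, VFIO_GROUP_GET_IOMMU_FD);
	device_fd = ioctl(group_fd, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");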

> > > +The IOMMU file descriptor provides this set of ioctls:
> > > +
> > > +#define VFIO_IOMMU_GET_FLAGS            _IOR(';', 105, __u64)
> > > + #define VFIO_IOMMU_FLAGS_MAP_ANY       (1 << 0)
> > > +#define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
> > > +#define VFIO_IOMMU_UNMAP_DMA            _IOWR(';', 107, struct vfio_dma_map)
> > > +
> > > +The GET_FLAGS ioctl returns basic information about the IOMMU domain.
> > > +We currently only support IOMMU domains that are able to map any
> > > +virtual address to any IOVA.  This is indicated by the MAP_ANY flag.
> > 
> > So.  I tend to think of an IOMMU mapping IOVAs to memory pages, rather
> > than memory pages to IOVAs.  
> 
> I do too, not sure why I wrote it that way, will fix.
> 
> > The IOMMU itself, of course maps to
> > physical addresses, and the meaning of "virtual address" in this
> > context is not really clear.  I think you would be better off saying
> > the IOMMU can map any IOVA to any memory page.  From a hardware POV
> > that means any physical address, but of course for a VFIO user a page
> > is specified by its process virtual address.
> 
> Will fix.
> 
> > I think we need to pin exactly what "MAP_ANY" means down better.  Now,
> > VFIO is pretty much a lost cause if you can't map any normal process
> > memory page into the IOMMU, so I think the only thing that is really
> > covered is IOVAs.  But saying "can map any IOVA" is not clear, because
> > if you can't map it, it's not a (valid) IOVA.  Better to say that
> > IOVAs can be any 64-bit value, which I think is what you really mean
> > here.
> 
> ok
> 
> > Of course, since POWER is a platform where this is *not* true, I'd
> > prefer to have something giving the range of valid IOVAs in the core
> > to start with.
> 
> Since iommu_ops does not yet have any concept of this (nudge, nudge), I
> figured this would be added later.  A possible implementation would be
> that such an iommu would not set MAP_ANY, would add a new flag for
> MAP_RANGE, and provide a new VFIO_IOMMU_GET_RANGE_INFO ioctl to describe
> it.  I'm guaranteed to get it wrong if I try to predict all your needs.

Hrm.  "ANY" just really bothers me because "any iova" is not as clear
a concept as it first appears.  For starters it's actually "any page
aligned" at the very least.  But then it's only any 64-bit address for
busses which have full 64-bit addressing (and I do wonder if there are
any north bridges out there that forgot to implement some of the upper
PCI address bits properly, given that 64-bit CPUs rarely actually
implement more than 40-something physical address bits in practice).

I'd prefer to see at least something to advertise min and max IOVA and
IOVA alignment.  That's enough to cover x86 and POWER, including
possible variants with an IOMMU page size different to the system page
size (note that POWER kernels can have 64k pages as a config option,
which means a TCE page size different to the system page size is quite
common).

Obviously there could be more complex constraints that we would need
to advertise with option bits.
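
Something as simple as the struct below (completely hypothetical, only to
make the min/max/alignment suggestion concrete; none of these names exist in
the patch) would cover the simple cases:

	/* hypothetical reply to a VFIO_IOMMU_GET_RANGE_INFO ioctl */
	struct vfio_iommu_range_info {
	        __u64   len;            /* length of structure */
	        __u64   iova_min;
	        __u64   iova_max;
	        __u64   iova_align;     /* IOMMU page size; may differ from the CPU page size */
	        __u64   flags;
	};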

> > > +
> > > +The (UN)MAP_DMA commands make use of struct vfio_dma_map for mapping
> > > +and unmapping IOVAs to process virtual addresses:
> > > +
> > > +struct vfio_dma_map {
> > > +        __u64   len;            /* length of structure */
> > 
> > Thanks for adding these structure length fields.  But I think they
> > should be called something other than 'len', which is likely to be
> > confused with size (or some other length that's actually related to
> > the operation's parameters).  Better to call it 'structlen' or
> > 'argslen' or something.
> 
> Ok.  As Scott noted, I've failed to implement these in a way that
> actually allows extension, but I'll work on it.

Right.  I had failed to realise quite how the encoding of structure
size into the ioctl worked.  With that in place, arguably we don't
really need the size in the structure itself, because we can still
have multiple sized versions of the ioctl.  Still, whichever.

> 
> > > +        __u64   vaddr;          /* process virtual addr */
> > > +        __u64   dmaaddr;        /* desired and/or returned dma address */
> > > +        __u64   size;           /* size in bytes */
> > > +        __u64   flags;
> > > +#define VFIO_DMA_MAP_FLAG_WRITE         (1 << 0) /* req writeable DMA mem */
> > 
> > Make it independent READ and WRITE flags from the start.  Not all
> > combinations will be be valid on all hardware, but that way we have
> > the possibilities covered without having to use strange encodings
> > later.
> 
> Ok.
> 
> > > +};
> > > +
> > > +Current users of VFIO use relatively static DMA mappings, not requiring
> > > +high frequency turnover.  As new users are added, it's expected that the
> > > +IOMMU file descriptor will evolve to support new mapping interfaces, this
> > > +will be reflected in the flags and may present new ioctls and file
> > > +interfaces.
> > > +
> > > +The device GET_FLAGS ioctl is intended to return basic device type and
> > > +indicate support for optional capabilities.  Flags currently include whether
> > > +the device is PCI or described by Device Tree, and whether the RESET ioctl
> > > +is supported:
> > > +
> > > +#define VFIO_DEVICE_GET_FLAGS           _IOR(';', 108, __u64)
> > > + #define VFIO_DEVICE_FLAGS_PCI          (1 << 0)
> > > + #define VFIO_DEVICE_FLAGS_DT           (1 << 1)
> > 
> > TBH, I don't think the VFIO for DT stuff is mature enough yet to be in
> > an initial infrastructure patch, though we should certainly be
> > discussing it as an add-on patch.
> 
> I agree for DT, and PCI should be added with vfio-pci, not the initial
> core.
> 
> > > + #define VFIO_DEVICE_FLAGS_RESET        (1 << 2)
> > > +
> > > +The MMIO and IOP resources used by a device are described by regions.
> > > +The GET_NUM_REGIONS ioctl tells us how many regions the device supports:
> > > +
> > > +#define VFIO_DEVICE_GET_NUM_REGIONS     _IOR(';', 109, int)
> > > +
> > > +Regions are described by a struct vfio_region_info, which is retrieved by
> > > +using the GET_REGION_INFO ioctl with vfio_region_info.index field set to
> > > +the desired region (0 based index).  Note that devices may implement zero
> > > +sized regions (vfio-pci does this to provide a 1:1 BAR to region index
> > > +mapping).
> > 
> > So, I think you're saying that a zero-sized region is used to encode a
> > NOP region, that is, to basically put a "no region here" in between
> > valid region indices.  You should spell that out.
> 
> Ok.
> 
> > [Incidentally, any chance you could borrow one of RH's tech writers
> > for this?  I'm afraid you seem to lack the knack for clear and easily
> > read documentation]
> 
> Thanks for the encouragement :-\  It's no wonder there isn't more
> content in Documentation.

Sigh.  Alas, yes.

> > > +struct vfio_region_info {
> > > +        __u32   len;            /* length of structure */
> > > +        __u32   index;          /* region number */
> > > +        __u64   size;           /* size in bytes of region */
> > > +        __u64   offset;         /* start offset of region */
> > > +        __u64   flags;
> > > +#define VFIO_REGION_INFO_FLAG_MMAP              (1 << 0)
> > > +#define VFIO_REGION_INFO_FLAG_RO                (1 << 1)
> > 
> > Again having separate read and write bits from the start will save
> > strange encodings later.
> 
> Seems highly unlikely, but we have bits to waste...
> 
> > > +#define VFIO_REGION_INFO_FLAG_PHYS_VALID        (1 << 2)
> > > +        __u64   phys;           /* physical address of region */
> > > +};
> > 
> > I notice there is no field for "type" e.g. MMIO vs. PIO vs. config
> > space for PCI.  If you added that having a NONE type might be a
> > clearer way of encoding a non-region than just having size==0.
> 
> I thought there was some resistance to including MMIO and PIO bits in
> the flags.  If that's passed, I can add it, but PCI can determine this
> through config space (and vfio-pci exposes config space at a fixed
> index).  Having a regions w/ size == 0, MMIO and PIO flags unset seems a
> little redundant if that's the only reason for having them.  A NONE flag
> doesn't make sense to me.  Config space isn't NONE, but neither is it
> MMIO nor PIO; and someone would probably be offended about even
> mentioning PIO in the specification.

No, my concept was that NONE would be used for the indexes where there
is no valid BAR.  I'll buy your argument on why not to include the PCI
(or whatever) address space type here.

What I'm just a bit concerned by is whether we could have a case (not
for PCI) of a real resource that still has size 0 - e.g. maybe some
sort of doorbell that can't be read or written, but can be triggered
some other way.  I guess that's probably unlikely though.

> 
> > > +
> > > +#define VFIO_DEVICE_GET_REGION_INFO     _IOWR(';', 110, struct vfio_region_info)
> > > +
> > > +The offset indicates the offset into the device file descriptor which
> > > +accesses the given range (for read/write/mmap/seek).  Flags indicate the
> > > +available access types and validity of optional fields.  For instance
> > > +the phys field may only be valid for certain devices types.
> > > +
> > > +Interrupts are described using a similar interface.  GET_NUM_IRQS
> > > +reports the number or IRQ indexes for the device.
> > > +
> > > +#define VFIO_DEVICE_GET_NUM_IRQS        _IOR(';', 111, int)
> > > +
> > > +struct vfio_irq_info {
> > > +        __u32   len;            /* length of structure */
> > > +        __u32   index;          /* IRQ number */
> > > +        __u32   count;          /* number of individual IRQs */
> > 
> > Is there a reason for allowing irqs in batches like this, rather than
> > having each MSI be reflected by a separate irq_info?
> 
> Yes, bus drivers like vfio-pci can define index 1 as the MSI info
> structure and index 2 as MSI-X.  There's really no need to expose 57
> individual MSI interrupts and try to map them to the correct device
> specific MSI type if they can only logically be enabled in two distinct
> groups.  Bus drivers with individually controllable MSI vectors are free
> to expose them separately.  I assume device tree paths would help
> associate an index to a specific interrupt.

Ok, fair enough.

> > > +        __u64   flags;
> > > +#define VFIO_IRQ_INFO_FLAG_LEVEL                (1 << 0)
> > > +};
> > > +
> > > +Again, zero count entries are allowed (vfio-pci uses a static interrupt
> > > +type to index mapping).
> > 
> > I know what you mean, but you need a clearer way to express it.
> 
> I'll work on it.
> 
> > > +Information about each index can be retrieved using the GET_IRQ_INFO
> > > +ioctl, used much like GET_REGION_INFO.
> > > +
> > > +#define VFIO_DEVICE_GET_IRQ_INFO        _IOWR(';', 112, struct vfio_irq_info)
> > > +
> > > +Individual indexes can describe single or sets of IRQs.  This provides the
> > > +flexibility to describe PCI INTx, MSI, and MSI-X using a single interface.
> > > +
> > > +All VFIO interrupts are signaled to userspace via eventfds.  Integer arrays,
> > > +as shown below, are used to pass the IRQ info index, the number of eventfds,
> > > +and each eventfd to be signaled.  Using a count of 0 disables the interrupt.
> > > +
> > > +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
> > > +#define VFIO_DEVICE_SET_IRQ_EVENTFDS    _IOW(';', 113, int)
> > > +
> > > +When a level triggered interrupt is signaled, the interrupt is masked
> > > +on the host.  This prevents an unresponsive userspace driver from
> > > +continuing to interrupt the host system.  After servicing the interrupt,
> > > +UNMASK_IRQ is used to allow the interrupt to retrigger.  Note that level
> > > +triggered interrupts implicitly have a count of 1 per index.
> > 
> > This is a silly restriction.  Even PCI devices can have up to 4 LSIs
> > on a function in theory, though no-one ever does.  Embedded devices
> > can and do have multiple level interrupts.
> 
> Per the PCI spec, an individual PCI function can only ever have, at
> most, a single INTx line.  A multi-function *device* can have up to 4
> INTx lines, but what we're exposing here is a struct device, ie. a PCI
> function.

Ah, my mistake.

> Other devices could certainly have multiple level interrupts, and if
> grouping them as we do with MSI on PCI makes sense, please let me know.
> I just didn't see the value in making the unmask operations handle
> sub-indexes if it's not needed.

I don't know of anything off hand.  But I can't see any consideration
that would make it unlikely either.  I generally don't trust anything
*not* to exist in embedded space.

> > > +
> > > +/* Unmask IRQ index, arg[0] = index */
> > > +#define VFIO_DEVICE_UNMASK_IRQ          _IOW(';', 114, int)
> > > +
> > > +Level triggered interrupts can also be unmasked using an irqfd.  Use
> > > +SET_UNMASK_IRQ_EVENTFD to set the file descriptor for this.
> > > +
> > > +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
> > > +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD      _IOW(';', 115, int)
> > > +
> > > +When supported, as indicated by the device flags, reset the device.
> > > +
> > > +#define VFIO_DEVICE_RESET               _IO(';', 116)
> > > +
> > > +Device tree devices also invlude ioctls for further defining the
> > > +device tree properties of the device:
> > > +
> > > +struct vfio_dtpath {
> > > +        __u32   len;            /* length of structure */
> > > +        __u32   index;
> > > +        __u64   flags;
> > > +#define VFIO_DTPATH_FLAGS_REGION        (1 << 0)
> > > +#define VFIO_DTPATH_FLAGS_IRQ           (1 << 1)
> > > +        char    *path;
> > > +};
> > > +#define VFIO_DEVICE_GET_DTPATH          _IOWR(';', 117, struct vfio_dtpath)
> > > +
> > > +struct vfio_dtindex {
> > > +        __u32   len;            /* length of structure */
> > > +        __u32   index;
> > > +        __u32   prop_type;
> > > +        __u32   prop_index;
> > > +        __u64   flags;
> > > +#define VFIO_DTINDEX_FLAGS_REGION       (1 << 0)
> > > +#define VFIO_DTINDEX_FLAGS_IRQ          (1 << 1)
> > > +};
> > > +#define VFIO_DEVICE_GET_DTINDEX         _IOWR(';', 118, struct vfio_dtindex)
> > > +
> > > +
> > > +VFIO bus driver API
> > > +-------------------------------------------------------------------------------
> > > +
> > > +Bus drivers, such as PCI, have three jobs:
> > > + 1) Add/remove devices from vfio
> > > + 2) Provide vfio_device_ops for device access
> > > + 3) Device binding and unbinding
> > > +
> > > +When initialized, the bus driver should enumerate the devices on it's
> > 
> > s/it's/its/
> 
> Noted.
> 
> <snip>
> > > +/* Unmap DMA region */
> > > +/* dgate must be held */
> > > +static int __vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
> > > +			    int npage, int rdwr)
> > 
> > Use of "read" and "write" in DMA can often be confusing, since it's
> > not always clear if you're talking from the perspective of the CPU or
> > the device (_writing_ data to a device will usually involve it doing
> > DMA _reads_ from memory).  It's often best to express things as DMA
> > direction, 'to device', and 'from device' instead.
> 
> Good point.

This, of course, potentially affects many areas of the code and doco.
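
(The existing DMA API naming being referred to is, from memory, the kernel's
enum dma_data_direction, which names directions from the device's point of
view:)

	enum dma_data_direction {
		DMA_BIDIRECTIONAL = 0,
		DMA_TO_DEVICE = 1,
		DMA_FROM_DEVICE = 2,
		DMA_NONE = 3,
	};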

> > > +{
> > > +	int i, unlocked = 0;
> > > +
> > > +	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> > > +		unsigned long pfn;
> > > +
> > > +		pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
> > > +		if (pfn) {
> > > +			iommu_unmap(iommu->domain, iova, 0);
> > > +			unlocked += put_pfn(pfn, rdwr);
> > > +		}
> > > +	}
> > > +	return unlocked;
> > > +}
> > > +
> > > +static void vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
> > > +			   unsigned long npage, int rdwr)
> > > +{
> > > +	int unlocked;
> > > +
> > > +	unlocked = __vfio_dma_unmap(iommu, iova, npage, rdwr);
> > > +	vfio_lock_acct(-unlocked);
> > 
> > Have you checked that your accounting will work out if the user maps
> > the same memory page to multiple IOVAs?
> 
> Hmm, it probably doesn't.  We potentially over-penalize the user process
> here.

Ok.

> > > +}
> > > +
> > > +/* Unmap ALL DMA regions */
> > > +void vfio_iommu_unmapall(struct vfio_iommu *iommu)
> > > +{
> > > +	struct list_head *pos, *pos2;
> > > +	struct dma_map_page *mlp;
> > > +
> > > +	mutex_lock(&iommu->dgate);
> > > +	list_for_each_safe(pos, pos2, &iommu->dm_list) {
> > > +		mlp = list_entry(pos, struct dma_map_page, list);
> > > +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> > > +		list_del(&mlp->list);
> > > +		kfree(mlp);
> > > +	}
> > > +	mutex_unlock(&iommu->dgate);
> > 
> > Ouch, no good at all.  Keeping track of every DMA map is no good on
> > POWER or other systems where IOMMU operations are a hot path.  I think
> > you'll need an iommu specific hook for this instead, which uses
> > whatever data structures are natural for the IOMMU.  For example a
> > 1-level pagetable, like we use on POWER will just zero every entry.
> 
> It's already been noted in the docs that current users have relatively
> static mappings and a performance interface is TBD for dynamically
> backing streaming DMA.  The current vfio_iommu exposes iommu_ops, POWER
> will need to come up with something to expose instead.

Right, but I'm not just talking about the current map/unmap calls
themselves.  This infrastructure for tracking it looks like it's
intended to be generic for all mapping methods.  If not, I can't see
the reason for it, because I don't think the current interface
requires such tracking inherently.

> > > +}
> > > +
> > > +static int vaddr_get_pfn(unsigned long vaddr, int rdwr, unsigned long *pfn)
> > > +{
> > > +	struct page *page[1];
> > > +	struct vm_area_struct *vma;
> > > +	int ret = -EFAULT;
> > > +
> > > +	if (get_user_pages_fast(vaddr, 1, rdwr, page) == 1) {
> > > +		*pfn = page_to_pfn(page[0]);
> > > +		return 0;
> > > +	}
> > > +
> > > +	down_read(&current->mm->mmap_sem);
> > > +
> > > +	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
> > > +
> > > +	if (vma && vma->vm_flags & VM_PFNMAP) {
> > > +		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> > > +		if (is_invalid_reserved_pfn(*pfn))
> > > +			ret = 0;
> > > +	}
> > 
> > It's kind of nasty that you take gup_fast(), already designed to grab
> > pointers for multiple user pages, then just use it one page at a time,
> > even for a big map.
> 
> Yep, this needs work, but shouldn't really change the API.

Yes, this could be a later optimization.
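
Something along these lines is what I'd expect eventually (only a
sketch; error handling and the VM_PFNMAP fallback are left out), so the
pin path calls get_user_pages_fast() per chunk rather than per page:

static int vaddr_get_pfns(unsigned long vaddr, int npage, int rdwr,
			  struct page **pages)
{
	int got = 0;

	while (got < npage) {
		int ret = get_user_pages_fast(vaddr +
					      ((unsigned long)got << PAGE_SHIFT),
					      npage - got, rdwr, pages + got);
		if (ret <= 0)
			return got ? got : ret;
		got += ret;
	}
	return got;
}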

> > > +	up_read(&current->mm->mmap_sem);
> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +/* Map DMA region */
> > > +/* dgate must be held */
> > > +static int vfio_dma_map(struct vfio_iommu *iommu, unsigned long iova,
> > > +			unsigned long vaddr, int npage, int rdwr)
> > 
> > iova should be a dma_addr_t.  Bus address size need not match virtual
> > address size, and may not fit in an unsigned long.
> 
> ok.

Again, the same consideration applies in many places, of course.
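
E.g., a sketch of what the adjusted signatures might look like:

static int vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
			unsigned long vaddr, int npage, int rdwr);
static int __vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
			    int npage, int rdwr);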

> > > +{
> > > +	unsigned long start = iova;
> > > +	int i, ret, locked = 0, prot = IOMMU_READ;
> > > +
> > > +	/* Verify pages are not already mapped */
> > > +	for (i = 0; i < npage; i++, iova += PAGE_SIZE)
> > > +		if (iommu_iova_to_phys(iommu->domain, iova))
> > > +			return -EBUSY;
> > > +
> > > +	iova = start;
> > > +
> > > +	if (rdwr)
> > > +		prot |= IOMMU_WRITE;
> > > +	if (iommu->cache)
> > > +		prot |= IOMMU_CACHE;
> > > +
> > > +	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
> > > +		unsigned long pfn = 0;
> > > +
> > > +		ret = vaddr_get_pfn(vaddr, rdwr, &pfn);
> > > +		if (ret) {
> > > +			__vfio_dma_unmap(iommu, start, i, rdwr);
> > > +			return ret;
> > > +		}
> > > +
> > > +		/* Only add actual locked pages to accounting */
> > > +		if (!is_invalid_reserved_pfn(pfn))
> > > +			locked++;
> > > +
> > > +		ret = iommu_map(iommu->domain, iova,
> > > +				(phys_addr_t)pfn << PAGE_SHIFT, 0, prot);
> > > +		if (ret) {
> > > +			/* Back out mappings on error */
> > > +			put_pfn(pfn, rdwr);
> > > +			__vfio_dma_unmap(iommu, start, i, rdwr);
> > > +			return ret;
> > > +		}
> > > +	}
> > > +	vfio_lock_acct(locked);
> > > +	return 0;
> > > +}
> > > +
> > > +static inline int ranges_overlap(unsigned long start1, size_t size1,
> > > +				 unsigned long start2, size_t size2)
> > > +{
> > > +	return !(start1 + size1 <= start2 || start2 + size2 <= start1);
> > 
> > Needs overflow safety.
> 
> Yep.
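
Something like this would avoid the wrap (a sketch only; it treats
zero-sized ranges as non-overlapping):

static inline int ranges_overlap(unsigned long start1, size_t size1,
				 unsigned long start2, size_t size2)
{
	if (!size1 || !size2)
		return 0;

	/* subtractions can't wrap the way start + size can */
	return start1 <= start2 ? (start2 - start1 < size1)
				: (start1 - start2 < size2);
}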
> 
> > > +}
> > > +
> > > +static struct dma_map_page *vfio_find_dma(struct vfio_iommu *iommu,
> > > +					  dma_addr_t start, size_t size)
> > > +{
> > > +	struct list_head *pos;
> > > +	struct dma_map_page *mlp;
> > > +
> > > +	list_for_each(pos, &iommu->dm_list) {
> > > +		mlp = list_entry(pos, struct dma_map_page, list);
> > > +		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> > > +				   start, size))
> > > +			return mlp;
> > > +	}
> > > +	return NULL;
> > > +}
> > 
> > Again, keeping track of each dma map operation is no good for
> > performance.
> 
> This is not the performance interface you're looking for.
> 
> > > +
> > > +int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
> > > +			    size_t size, struct dma_map_page *mlp)
> > > +{
> > > +	struct dma_map_page *split;
> > > +	int npage_lo, npage_hi;
> > > +
> > > +	/* Existing dma region is completely covered, unmap all */
> > > +	if (start <= mlp->daddr &&
> > > +	    start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> > > +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> > > +		list_del(&mlp->list);
> > > +		npage_lo = mlp->npage;
> > > +		kfree(mlp);
> > > +		return npage_lo;
> > > +	}
> > > +
> > > +	/* Overlap low address of existing range */
> > > +	if (start <= mlp->daddr) {
> > > +		size_t overlap;
> > > +
> > > +		overlap = start + size - mlp->daddr;
> > > +		npage_lo = overlap >> PAGE_SHIFT;
> > > +		npage_hi = mlp->npage - npage_lo;
> > > +
> > > +		vfio_dma_unmap(iommu, mlp->daddr, npage_lo, mlp->rdwr);
> > > +		mlp->daddr += overlap;
> > > +		mlp->vaddr += overlap;
> > > +		mlp->npage -= npage_lo;
> > > +		return npage_lo;
> > > +	}
> > > +
> > > +	/* Overlap high address of existing range */
> > > +	if (start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
> > > +		size_t overlap;
> > > +
> > > +		overlap = mlp->daddr + NPAGE_TO_SIZE(mlp->npage) - start;
> > > +		npage_hi = overlap >> PAGE_SHIFT;
> > > +		npage_lo = mlp->npage - npage_hi;
> > > +
> > > +		vfio_dma_unmap(iommu, start, npage_hi, mlp->rdwr);
> > > +		mlp->npage -= npage_hi;
> > > +		return npage_hi;
> > > +	}
> > > +
> > > +	/* Split existing */
> > > +	npage_lo = (start - mlp->daddr) >> PAGE_SHIFT;
> > > +	npage_hi = mlp->npage - (size >> PAGE_SHIFT) - npage_lo;
> > > +
> > > +	split = kzalloc(sizeof *split, GFP_KERNEL);
> > > +	if (!split)
> > > +		return -ENOMEM;
> > > +
> > > +	vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, mlp->rdwr);
> > > +
> > > +	mlp->npage = npage_lo;
> > > +
> > > +	split->npage = npage_hi;
> > > +	split->daddr = start + size;
> > > +	split->vaddr = mlp->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
> > > +	split->rdwr = mlp->rdwr;
> > > +	list_add(&split->list, &iommu->dm_list);
> > > +	return size >> PAGE_SHIFT;
> > > +}
> > > +
> > > +int vfio_dma_unmap_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> > > +{
> > > +	int ret = 0;
> > > +	size_t npage = dmp->size >> PAGE_SHIFT;
> > > +	struct list_head *pos, *n;
> > > +
> > > +	if (dmp->dmaaddr & ~PAGE_MASK)
> > > +		return -EINVAL;
> > > +	if (dmp->size & ~PAGE_MASK)
> > > +		return -EINVAL;
> > > +
> > > +	mutex_lock(&iommu->dgate);
> > > +
> > > +	list_for_each_safe(pos, n, &iommu->dm_list) {
> > > +		struct dma_map_page *mlp;
> > > +
> > > +		mlp = list_entry(pos, struct dma_map_page, list);
> > > +		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
> > > +				   dmp->dmaaddr, dmp->size)) {
> > > +			ret = vfio_remove_dma_overlap(iommu, dmp->dmaaddr,
> > > +						      dmp->size, mlp);
> > > +			if (ret > 0)
> > > +				npage -= NPAGE_TO_SIZE(ret);
> > > +			if (ret < 0 || npage == 0)
> > > +				break;
> > > +		}
> > > +	}
> > > +	mutex_unlock(&iommu->dgate);
> > > +	return ret > 0 ? 0 : ret;
> > > +}
> > > +
> > > +int vfio_dma_map_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
> > > +{
> > > +	int npage;
> > > +	struct dma_map_page *mlp, *mmlp = NULL;
> > > +	dma_addr_t daddr = dmp->dmaaddr;
> > > +	unsigned long locked, lock_limit, vaddr = dmp->vaddr;
> > > +	size_t size = dmp->size;
> > > +	int ret = 0, rdwr = dmp->flags & VFIO_DMA_MAP_FLAG_WRITE;
> > > +
> > > +	if (vaddr & (PAGE_SIZE-1))
> > > +		return -EINVAL;
> > > +	if (daddr & (PAGE_SIZE-1))
> > > +		return -EINVAL;
> > > +	if (size & (PAGE_SIZE-1))
> > > +		return -EINVAL;
> > > +
> > > +	npage = size >> PAGE_SHIFT;
> > > +	if (!npage)
> > > +		return -EINVAL;
> > > +
> > > +	if (!iommu)
> > > +		return -EINVAL;
> > > +
> > > +	mutex_lock(&iommu->dgate);
> > > +
> > > +	if (vfio_find_dma(iommu, daddr, size)) {
> > > +		ret = -EBUSY;
> > > +		goto out_lock;
> > > +	}
> > > +
> > > +	/* account for locked pages */
> > > +	locked = current->mm->locked_vm + npage;
> > > +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > > +	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> > > +		printk(KERN_WARNING "%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> > > +			__func__, rlimit(RLIMIT_MEMLOCK));
> > > +		ret = -ENOMEM;
> > > +		goto out_lock;
> > > +	}
> > > +
> > > +	ret = vfio_dma_map(iommu, daddr, vaddr, npage, rdwr);
> > > +	if (ret)
> > > +		goto out_lock;
> > > +
> > > +	/* Check if we abut a region below */
> > > +	if (daddr) {
> > > +		mlp = vfio_find_dma(iommu, daddr - 1, 1);
> > > +		if (mlp && mlp->rdwr == rdwr &&
> > > +		    mlp->vaddr + NPAGE_TO_SIZE(mlp->npage) == vaddr) {
> > > +
> > > +			mlp->npage += npage;
> > > +			daddr = mlp->daddr;
> > > +			vaddr = mlp->vaddr;
> > > +			npage = mlp->npage;
> > > +			size = NPAGE_TO_SIZE(npage);
> > > +
> > > +			mmlp = mlp;
> > > +		}
> > > +	}
> > > +
> > > +	if (daddr + size) {
> > > +		mlp = vfio_find_dma(iommu, daddr + size, 1);
> > > +		if (mlp && mlp->rdwr == rdwr && mlp->vaddr == vaddr + size) {
> > > +
> > > +			mlp->npage += npage;
> > > +			mlp->daddr = daddr;
> > > +			mlp->vaddr = vaddr;
> > > +
> > > +			/* If merged above and below, remove previously
> > > +			 * merged entry.  New entry covers it.  */
> > > +			if (mmlp) {
> > > +				list_del(&mmlp->list);
> > > +				kfree(mmlp);
> > > +			}
> > > +			mmlp = mlp;
> > > +		}
> > > +	}
> > > +
> > > +	if (!mmlp) {
> > > +		mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
> > > +		if (!mlp) {
> > > +			ret = -ENOMEM;
> > > +			vfio_dma_unmap(iommu, daddr, npage, rdwr);
> > > +			goto out_lock;
> > > +		}
> > > +
> > > +		mlp->npage = npage;
> > > +		mlp->daddr = daddr;
> > > +		mlp->vaddr = vaddr;
> > > +		mlp->rdwr = rdwr;
> > > +		list_add(&mlp->list, &iommu->dm_list);
> > > +	}
> > > +
> > > +out_lock:
> > > +	mutex_unlock(&iommu->dgate);
> > > +	return ret;
> > > +}
> > 
> > This whole tracking infrastructure is way too complex to impose on
> > every IOMMU.  We absolutely don't want to do all this when just
> > updating a 1-level pagetable.
> 
> If only POWER implemented an iommu_ops so we had something on which we
> could base an alternate iommu model and pluggable iommu registration...

Yeah, yeah.  I'm having to find gaps of time between fighting various
fires to work on vfio-ish infrastructure stuff.

> > > +static int vfio_iommu_release(struct inode *inode, struct file *filep)
> > > +{
> > > +	struct vfio_iommu *iommu = filep->private_data;
> > > +
> > > +	vfio_release_iommu(iommu);
> > > +	return 0;
> > > +}
> > > +
> > > +static long vfio_iommu_unl_ioctl(struct file *filep,
> > > +				 unsigned int cmd, unsigned long arg)
> > > +{
> > > +	struct vfio_iommu *iommu = filep->private_data;
> > > +	int ret = -ENOSYS;
> > > +
> > > +        if (cmd == VFIO_IOMMU_GET_FLAGS) {
> > > +                u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;
> > > +
> > > +                ret = put_user(flags, (u64 __user *)arg);
> > 
> > Um.. flags surely have to come from the IOMMU driver.
> 
> This vfio_iommu object is backed by iommu_ops, which supports this
> mapping.
> 
> > > +        } else if (cmd == VFIO_IOMMU_MAP_DMA) {
> > > +		struct vfio_dma_map dm;
> > > +
> > > +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> > > +			return -EFAULT;
> > > +
> > > +		ret = vfio_dma_map_dm(iommu, &dm);
> > > +
> > > +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> > > +			ret = -EFAULT;
> > > +
> > > +	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
> > > +		struct vfio_dma_map dm;
> > > +
> > > +		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
> > > +			return -EFAULT;
> > > +
> > > +		ret = vfio_dma_unmap_dm(iommu, &dm);
> > > +
> > > +		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
> > > +			ret = -EFAULT;
> > > +	}
> > > +	return ret;
> > > +}
> > > +
> > > +#ifdef CONFIG_COMPAT
> > > +static long vfio_iommu_compat_ioctl(struct file *filep,
> > > +				    unsigned int cmd, unsigned long arg)
> > > +{
> > > +	arg = (unsigned long)compat_ptr(arg);
> > > +	return vfio_iommu_unl_ioctl(filep, cmd, arg);
> > 
> > Um, this only works if the structures are exactly compatible between
> > 32-bit and 64-bit ABIs.  I don't think that is always true.
> 
> I think all our structure sizes are independent of host width.  If I'm
> missing something, let me know.

Ah, for structures, that might be true.  I was seeing the bunch of
ioctl()s that take ints.

> > > +}
> > > +#endif	/* CONFIG_COMPAT */
> > > +
> > > +const struct file_operations vfio_iommu_fops = {
> > > +	.owner		= THIS_MODULE,
> > > +	.release	= vfio_iommu_release,
> > > +	.unlocked_ioctl	= vfio_iommu_unl_ioctl,
> > > +#ifdef CONFIG_COMPAT
> > > +	.compat_ioctl	= vfio_iommu_compat_ioctl,
> > > +#endif
> > > +};
> > > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > > new file mode 100644
> > > index 0000000..6169356
> > > --- /dev/null
> > > +++ b/drivers/vfio/vfio_main.c
> > > @@ -0,0 +1,1151 @@
> > > +/*
> > > + * VFIO framework
> > > + *
> > > + * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
> > > + *     Author: Alex Williamson <alex.williamson@redhat.com>
> > > + *
> > > + * This program is free software; you can redistribute it and/or modify
> > > + * it under the terms of the GNU General Public License version 2 as
> > > + * published by the Free Software Foundation.
> > > + *
> > > + * Derived from original vfio:
> > > + * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
> > > + * Author: Tom Lyon, pugs@cisco.com
> > > + */
> > > +
> > > +#include <linux/cdev.h>
> > > +#include <linux/compat.h>
> > > +#include <linux/device.h>
> > > +#include <linux/file.h>
> > > +#include <linux/anon_inodes.h>
> > > +#include <linux/fs.h>
> > > +#include <linux/idr.h>
> > > +#include <linux/iommu.h>
> > > +#include <linux/mm.h>
> > > +#include <linux/module.h>
> > > +#include <linux/slab.h>
> > > +#include <linux/string.h>
> > > +#include <linux/uaccess.h>
> > > +#include <linux/vfio.h>
> > > +#include <linux/wait.h>
> > > +
> > > +#include "vfio_private.h"
> > > +
> > > +#define DRIVER_VERSION	"0.2"
> > > +#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
> > > +#define DRIVER_DESC	"VFIO - User Level meta-driver"
> > > +
> > > +static int allow_unsafe_intrs;
> > > +module_param(allow_unsafe_intrs, int, 0);
> > > +MODULE_PARM_DESC(allow_unsafe_intrs,
> > > +        "Allow use of IOMMUs which do not support interrupt remapping");
> > 
> > This should not be a global option, but part of the AMD/Intel IOMMU
> > specific code.  In general it's a question of how strict the IOMMU
> > driver is about isolation when it determines what the groups are, and
> > only the IOMMU driver can know what the possibilities are for its
> > class of hardware.
> 
> I agree this should probably be tied more closely to the iommu driver,
> but again, we only have iommu_ops right now.
> 
> <snip>
> > > +
> > > +/* Attempt to merge the group pointed to by fd into group.  The merge-ee
> > > + * group must not have an iommu or any devices open because we cannot
> > > + * maintain that context across the merge.  The merge-er group can be
> > > + * in use. */
> > 
> > Yeah, so merge-er group in use still has its problems, because it
> > could affect what the IOMMU is capable of.
> 
> As seen below, we deny merging if the iommu domains are not exactly
> compatible.  Our notion of what compatible means depends on what
> iommu_ops exposes though.

Ok.

> > > +static int vfio_group_merge(struct vfio_group *group, int fd)
> > > +{
> > > +	struct vfio_group *new;
> > > +	struct vfio_iommu *old_iommu;
> > > +	struct file *file;
> > > +	int ret = 0;
> > > +	bool opened = false;
> > > +
> > > +	mutex_lock(&vfio.lock);
> > > +
> > > +	file = fget(fd);
> > > +	if (!file) {
> > > +		ret = -EBADF;
> > > +		goto out_noput;
> > > +	}
> > > +
> > > +	/* Sanity check, is this really our fd? */
> > > +	if (file->f_op != &vfio_group_fops) {
> > 
> > This should be a WARN_ON or BUG_ON rather than just an error return, surely.
> 
> No, I don't think so.  We're passed a file descriptor that could be for
> anything.  If the user passed a file descriptor for something that's not
> a vfio group, that's a user error, not an internal consistency error of
> vfio.

Sorry, I was mixing up which of the fd arguments was which.

> > > +		ret = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +
> > > +	new = file->private_data;
> > > +
> > > +	if (!new || new == group || !new->iommu ||
> > > +	    new->iommu->domain || new->bus != group->bus) {
> > > +		ret = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +
> > > +	/* We need to attach all the devices to each domain separately
> > > +	 * in order to validate that the capabilities match for both.  */
> > > +	ret = __vfio_open_iommu(new->iommu);
> > > +	if (ret)
> > > +		goto out;
> > > +
> > > +	if (!group->iommu->domain) {
> > > +		ret = __vfio_open_iommu(group->iommu);
> > > +		if (ret)
> > > +			goto out;
> > > +		opened = true;
> > > +	}
> > > +
> > > +	/* If cache coherency doesn't match we'd potentialy need to
> > > +	 * remap existing iommu mappings in the merge-er domain.
> > > +	 * Poor return to bother trying to allow this currently. */
> > > +	if (iommu_domain_has_cap(group->iommu->domain,
> > > +				 IOMMU_CAP_CACHE_COHERENCY) !=
> > > +	    iommu_domain_has_cap(new->iommu->domain,
> > > +				 IOMMU_CAP_CACHE_COHERENCY)) {
> > > +		__vfio_close_iommu(new->iommu);
> > > +		if (opened)
> > > +			__vfio_close_iommu(group->iommu);
> > > +		ret = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +
> > > +	/* Close the iommu for the merge-ee and attach all its devices
> > > +	 * to the merge-er iommu. */
> > > +	__vfio_close_iommu(new->iommu);
> > > +
> > > +	ret = __vfio_iommu_attach_group(group->iommu, new);
> > > +	if (ret)
> > > +		goto out;
> > > +
> > > +	/* set_iommu unlinks new from the iommu, so save a pointer to it */
> > > +	old_iommu = new->iommu;
> > > +	__vfio_group_set_iommu(new, group->iommu);
> > > +	kfree(old_iommu);
> > > +
> > > +out:
> > > +	fput(file);
> > > +out_noput:
> > > +	mutex_unlock(&vfio.lock);
> > > +	return ret;
> > > +}
> > > +
> > > +/* Unmerge the group pointed to by fd from group. */
> > > +static int vfio_group_unmerge(struct vfio_group *group, int fd)
> > > +{
> > > +	struct vfio_group *new;
> > > +	struct vfio_iommu *new_iommu;
> > > +	struct file *file;
> > > +	int ret = 0;
> > > +
> > > +	/* Since the merge-out group is already opened, it needs to
> > > +	 * have an iommu struct associated with it. */
> > > +	new_iommu = kzalloc(sizeof(*new_iommu), GFP_KERNEL);
> > > +	if (!new_iommu)
> > > +		return -ENOMEM;
> > > +
> > > +	INIT_LIST_HEAD(&new_iommu->group_list);
> > > +	INIT_LIST_HEAD(&new_iommu->dm_list);
> > > +	mutex_init(&new_iommu->dgate);
> > > +	new_iommu->bus = group->bus;
> > > +
> > > +	mutex_lock(&vfio.lock);
> > > +
> > > +	file = fget(fd);
> > > +	if (!file) {
> > > +		ret = -EBADF;
> > > +		goto out_noput;
> > > +	}
> > > +
> > > +	/* Sanity check, is this really our fd? */
> > > +	if (file->f_op != &vfio_group_fops) {
> > > +		ret = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +
> > > +	new = file->private_data;
> > > +	if (!new || new == group || new->iommu != group->iommu) {
> > > +		ret = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +
> > > +	/* We can't merge-out a group with devices still in use. */
> > > +	if (__vfio_group_devs_inuse(new)) {
> > > +		ret = -EBUSY;
> > > +		goto out;
> > > +	}
> > > +
> > > +	__vfio_iommu_detach_group(group->iommu, new);
> > > +	__vfio_group_set_iommu(new, new_iommu);
> > > +
> > > +out:
> > > +	fput(file);
> > > +out_noput:
> > > +	if (ret)
> > > +		kfree(new_iommu);
> > > +	mutex_unlock(&vfio.lock);
> > > +	return ret;
> > > +}
> > > +
> > > +/* Get a new iommu file descriptor.  This will open the iommu, setting
> > > + * the current->mm ownership if it's not already set. */
> > 
> > I know I've had this explained to me several times before, but I've
> > forgotten again.  Why do we need to wire the iommu to an mm?
> 
> We're mapping process virtual addresses into the IOMMU, so it makes
> sense to restrict ourselves to a single virtual address space.  It also
> enforces the ownership, that only a single mm is in control of the
> group.

Neither of those seems conclusive to me, but I remember that I saw a
strong reason earlier, even if I can't remember it now.

> > > +static int vfio_group_get_iommu_fd(struct vfio_group *group)
> > > +{
> > > +	int ret = 0;
> > > +
> > > +	mutex_lock(&vfio.lock);
> > > +
> > > +	if (!group->iommu->domain) {
> > > +		ret = __vfio_open_iommu(group->iommu);
> > > +		if (ret)
> > > +			goto out;
> > > +	}
> > > +
> > > +	ret = anon_inode_getfd("[vfio-iommu]", &vfio_iommu_fops,
> > > +			       group->iommu, O_RDWR);
> > > +	if (ret < 0)
> > > +		goto out;
> > > +
> > > +	group->iommu->refcnt++;
> > > +out:
> > > +	mutex_unlock(&vfio.lock);
> > > +	return ret;
> > > +}
> > > +
> > > +/* Get a new device file descriptor.  This will open the iommu, setting
> > > + * the current->mm ownership if it's not already set.  It's difficult to
> > > + * specify the requirements for matching a user supplied buffer to a
> > > + * device, so we use a vfio driver callback to test for a match.  For
> > > + * PCI, dev_name(dev) is unique, but other drivers may require including
> > > + * a parent device string. */
> > 
> > At some point we probably want an interface to enumerate the devices
> > too, but that can probably wait.
> 
> That's what I decided as well.  I also haven't been able to come up with
> an interface for it that doesn't make me want to vomit.

Ok.
Alex Williamson Nov. 17, 2011, 8:22 p.m. UTC | #31
On Wed, 2011-11-16 at 11:52 -0500, Konrad Rzeszutek Wilk wrote:
> On Fri, Nov 11, 2011 at 03:10:56PM -0700, Alex Williamson wrote:
<snip>
> > > > +
> > > > +Regions are described by a struct vfio_region_info, which is retrieved by
> > > > +using the GET_REGION_INFO ioctl with vfio_region_info.index field set to
> > > > +the desired region (0 based index).  Note that devices may implement zero
> > > > +sized regions (vfio-pci does this to provide a 1:1 BAR to region index
> > > > +mapping).
> > > 
> > > Huh?
> > 
> > PCI has the following static mapping:
> > 
> > enum {
> >         VFIO_PCI_BAR0_REGION_INDEX,
> >         VFIO_PCI_BAR1_REGION_INDEX,
> >         VFIO_PCI_BAR2_REGION_INDEX,
> >         VFIO_PCI_BAR3_REGION_INDEX,
> >         VFIO_PCI_BAR4_REGION_INDEX,
> >         VFIO_PCI_BAR5_REGION_INDEX,
> >         VFIO_PCI_ROM_REGION_INDEX,
> >         VFIO_PCI_CONFIG_REGION_INDEX,
> >         VFIO_PCI_NUM_REGIONS
> > };
> > 
> > So 8 regions are always reported regardless of whether the device
> > implements all the BARs and the ROM.  Then we have a fixed bar:index
> > mapping so we don't have to create a region_info field to describe the
> > bar number for the index.
> 
> OK. Is that a problem if the real device actually has a zero sized BAR?
> Or is zero sized BAR in PCI spec equal to "disabled, not in use" ? Just
> wondering whether (-1ULL) should be used instead? (Which seems the case
> in QEMU code).

Yes, PCI spec defines that unimplemented BARs are hardwired to zero, so
the sizing operation returns zero for the size.
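
So the expected userspace pattern is just to walk the fixed indexes and
skip anything that sizes to zero.  Roughly (a sketch; it assumes the
vfio_region_info layout from the proposed include/linux/vfio.h and an
already opened device fd):

struct vfio_region_info info;
int i;

for (i = 0; i < VFIO_PCI_NUM_REGIONS; i++) {
	memset(&info, 0, sizeof(info));
	info.len = sizeof(info);
	info.index = i;

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info))
		continue;

	if (!info.size)
		continue;	/* unimplemented BAR or absent ROM */

	/* index i maps 1:1 to BAR/ROM/config space as in the enum above */
}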

<snip>
> > > > +struct vfio_irq_info {
> > > > +        __u32   len;            /* length of structure */
> > > > +        __u32   index;          /* IRQ number */
> > > > +        __u32   count;          /* number of individual IRQs */
> > > > +        __u64   flags;
> > > > +#define VFIO_IRQ_INFO_FLAG_LEVEL                (1 << 0)
> > > > +};
> > > > +
> > > > +Again, zero count entries are allowed (vfio-pci uses a static interrupt
> > > > +type to index mapping).
> > > 
> > > I am not really sure what that means.
> > 
> > This is so PCI can expose:
> > 
> > enum {
> >         VFIO_PCI_INTX_IRQ_INDEX,
> >         VFIO_PCI_MSI_IRQ_INDEX,
> >         VFIO_PCI_MSIX_IRQ_INDEX,
> >         VFIO_PCI_NUM_IRQS
> > };
> > 
> > So like regions it always exposes 3 IRQ indexes where count=0 if the
> > device doesn't actually support that type of interrupt.  I just want to
> > spell out that bus drivers have this kind of flexibility.
> 
> I think you should change the comment that  says 'IRQ number', as the
> first thing that comes in my mind is 'GSI' or MSI/MSI-x vector.
> Perhaps '/* index to be used with return value from GET_NUM_IRQS ioctl.
> Order of structures can be unsorted. */

Ah, yes.  I see the confusion.  They can't really be unsorted though,
the user needs some point of reference.  For PCI they will be strictly
ordered.  For Device Tree, I assume there will be a path referencing the
index.  I'll update the doc to clarify.
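
In other words, usage looks something like this (a sketch; device_fd is
assumed to be an open vfio device fd and the struct is the one quoted
above):

struct vfio_irq_info irq = { .len = sizeof(irq) };
int i;

for (i = 0; i < VFIO_PCI_NUM_IRQS; i++) {
	irq.index = i;	/* INTx/MSI/MSI-X index, not a GSI or vector */

	if (ioctl(device_fd, VFIO_DEVICE_GET_IRQ_INFO, &irq))
		continue;

	if (!irq.count)
		continue;	/* interrupt type not implemented by device */

	/* irq.count interrupts can be signaled via this index */
}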

<snip>
> > > > +
> > > > +When a level triggered interrupt is signaled, the interrupt is masked
> > > > +on the host.  This prevents an unresponsive userspace driver from
> > > > +continuing to interrupt the host system.  After servicing the interrupt,
> > > > +UNMASK_IRQ is used to allow the interrupt to retrigger.  Note that level
> > > > +triggered interrupts implicitly have a count of 1 per index.
> > > 
> > > So they are enabled automatically? Meaning you don't even hav to do
> > > SET_IRQ_EVENTFDS b/c the count is set to 1?
> > 
> > I suppose that should be "no more than 1 per index" (ie. PCI would
> > report a count of 0 for VFIO_PCI_INTX_IRQ_INDEX if the device doesn't
> > support INTx).  I think you might be confusing VFIO_DEVICE_GET_IRQ_INFO
> > which tells how many are available with VFIO_DEVICE_SET_IRQ_EVENTFDS
> > which does the enabling/disabling.  All interrupts are disabled by
> > default because userspace needs to give us a way to signal them via
> > eventfds.  It will be device dependent whether multiple index can be
> > enabled simultaneously.  Hmm, is that another flag on the irq_info
> > struct or do we expect drivers to implicitly have that kind of
> > knowledge?
> 
> Right, that was what I was wondering. Not sure how the PowerPC
> world works with this.

On second thought, I think an exclusive flag isn't appropriate.  VFIO is
not meant to abstract the device to the level that a user could write a
generic "vfio driver".  The user will always need to understand the type
of device, VFIO just provides the conduit to make use of it.  There's
too much left undefined with a simplistic exclusive flag.

<snip>
> > > > +SET_UNMASK_IRQ_EVENTFD to set the file descriptor for this.
> > > 
> > > So only level triggered? Hmm, how do I know whether the device is
> > > level or edge? Or is that edge (MSI) can also be unmaked using the
> > > eventfs
> > 
> > Yes, only for level.  Isn't a device going to know what type of
> > interrupt it uses?  MSI masking is PCI specific, not handled by this.
> 
> I certainly hope it knows, but you know buggy drivers do exist.
> 
> What would be the return value if somebody tried to unmask an edge one?
> Should that be documented here? -ENOSPEC?

I would assume EINVAL or EFAULT since the user is providing an invalid
argument/bad address.

> > > > +
> > > > +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
> > > > +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD      _IOW(';', 115, int)
> > > > +
> > > > +When supported, as indicated by the device flags, reset the device.
> > > > +
> > > > +#define VFIO_DEVICE_RESET               _IO(';', 116)
> > > 
> > > Does it disable the 'count'? Err, does it disable the IRQ on the
> > > device after this and one should call VFIO_DEVICE_SET_IRQ_EVENTFDS
> > > to set new eventfds? Or does it re-use the eventfds and the device
> > > is enabled after this?
> > 
> > It doesn't affect the interrupt programming.  Should it?
> 
> I would hope not, but I am trying to think of ways one could screw this up.
> Perhaps just saying that - "No need to call VFIO_DEVICE_SET_IRQ_EVENTFDS
> as the kernel (and the device) will retain the interrupt.".

Ok, I added some words around this in the doc.

> .. snip..
> > > I am not really sure what this section purpose is? Could this be part
> > > of the header file or the code? It does not look to be part of the
> > > ioctl API?
> > 
> > We've passed into the "VFIO bus driver API" section of the document, to
> > explain the interaction between vfio-core and vfio bus drivers.
> 
> Perhaps a different file?

The entire file is ~300 lines.  Seems excessive to split.

<snip>
> > > > +static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,
> > > > +				    struct vfio_device *device)
> > > > +{
> > > > +	BUG_ON(!iommu->domain && device->attached);
> > > 
> > > Whoa. Heavy hammer there.
> > > 
> > > Perhaps WARN_ON as you do check it later on.
> > 
> > I think it's warranted, internal consistency is broken if we have a
> > device that thinks it's attached to an iommu domain that doesn't exist.
> > It should, of course, never happen and this isn't a performance path.
> > 
> > > > +
> > > > +	if (!iommu->domain || !device->attached)
> > > > +		return;
> 
> Well, the deal is that you BUG_ON earlier, but you check for it here.
> But the BUG_ON will stop execution , so the check 'if ..' is actually
> not needed.

The BUG_ON is a subtly different check:

domain | attached
-------+---------
   0   |   0     Nothing to do
   0   |   1     <--- BUG_ON, we're broken
   1   |   0     Nothing to do
   1   |   1     Do stuff

Writing out the truth table, I see now I could just make this:
   if (!attached) {return;}
since the BUG_ON takes care of the other case.

The reason for the laziness of allowing this to simply return is that if
we hit an error attaching an individual device within a group, we just
push the whole group back through __vfio_iommu_detach_group(), so some
devices may have never been attached.
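
So the detach path would shrink to something like (sketch):

static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,
				    struct vfio_device *device)
{
	/* attached without a domain means internal state is broken */
	BUG_ON(!iommu->domain && device->attached);

	/* covers both "nothing to do" rows of the table above */
	if (!device->attached)
		return;

	iommu_detach_device(iommu->domain, device->dev);
	device->attached = false;
}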

> > > > +
> > > > +	iommu_detach_device(iommu->domain, device->dev);
> > > > +	device->attached = false;
> > > > +}
> > > > +
> > > > +static void __vfio_iommu_detach_group(struct vfio_iommu *iommu,
> > > > +				      struct vfio_group *group)
> > > > +{
> > > > +	struct list_head *pos;
> > > > +
> > > > +	list_for_each(pos, &group->device_list) {
> > > > +		struct vfio_device *device;
> > > > +
> > > > +		device = list_entry(pos, struct vfio_device, device_next);
> > > > +		__vfio_iommu_detach_dev(iommu, device);
> > > > +	}
> > > > +}
> > > > +
> > > > +static int __vfio_iommu_attach_dev(struct vfio_iommu *iommu,
> > > > +				   struct vfio_device *device)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	BUG_ON(device->attached);
> > > 
> > > How about:
> > > 
> > > WARN_ON(device->attached, "The engineer who wrote the user-space device driver is trying to register
> > > the device again! Tell him/her to stop please.\n");
> > 
> > I would almost demote this one to a WARN_ON, but userspace isn't in
> > control of attaching and detaching devices from the iommu.  That's a
> > side effect of getting the iommu or device file descriptor.  So again,
> > this is an internal consistency check and it should never happen,
> > regardless of userspace.
> > 
> 
> Ok, then you might want to expand it to
> 
> BUG_ON(!device  || device->attached);
> 
> In case something has gone horribly wrong.

Impressive, that exceeds even my paranoia ;)  For that we would have had
to walk the group->device_list and come up with a NULL device pointer.
I think we can assume that won't happen.  I've also got this though:

        if (!iommu || !iommu->domain)
                return -EINVAL;

Which is effectively just being lazy without a good excuse like above.
That could probably be folded into the BUG_ON.
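
ie. fold it down to something like:

	BUG_ON(!iommu || !iommu->domain || device->attached);

(or the WARN_ON-plus-error-return equivalent if BUG_ON turns out to be
too heavy a hammer here).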
> 
> .. snip..
> > > > +		group->devt = MKDEV(MAJOR(vfio.devt), minor);
> > > > +		device_create(vfio.class, NULL, group->devt,
> > > > +			      group, "%u", groupid);
> > > > +
> > > > +		group->bus = dev->bus;
> > > 
> > > 
> > > Oh, so that is how the IOMMU iommu_ops get copied! You might
> > > want to mention that - I was not sure where the 'handoff' is
> > > was done to insert a device so that it can do iommu_ops properly.
> > > 
> > > Ok, so the time when a device is detected whether it can do
> > > IOMMU is when we try to open it - as that is when iommu_domain_alloc
> > > is called which can return NULL if the iommu_ops is not set.
> > > 
> > > So what about devices that don't have an iommu_ops? Say they
> > > are using SWIOTLB? (like the AMD-Vi sometimes does if the
> > > device is not on its list).
> > > 
> > > Can we use iommu_present?
> > 
> > I'm not sure I'm following your revelation ;)  Take a look at the
> 
> I am trying to figure out who sets the iommu_ops call on the devices.

The iommu driver registers ops with bus_set_iommu, so then we just need
to pass the bus pointer and iommu_ops figures out the rest.  If there's
no iommu_ops for a device or the iommu_ops doesn't implement the
device_group callback, it gets skipped by vfio and therefore won't be
usable by this interface.
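
For reference, the registration side in an iommu driver looks roughly
like this (a sketch; the my_* names are placeholders, not any existing
driver):

static struct iommu_ops my_iommu_ops = {
	.domain_init	= my_domain_init,
	.attach_dev	= my_attach_dev,
	/* ... map/unmap/iova_to_phys ... */
	.device_group	= my_device_group,	/* what vfio keys off of */
};

static int __init my_iommu_init(void)
{
	bus_set_iommu(&pci_bus_type, &my_iommu_ops);
	return 0;
}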

> > pointer to iommu_device_group I pasted above, or these:
> > 
> > https://github.com/awilliam/linux-vfio/commit/37dd08c90d149caaed7779d4f38850a8f7ed0fa5
> > https://github.com/awilliam/linux-vfio/commit/63ca8543533d8130db23d7949133e548c3891c97
> > https://github.com/awilliam/linux-vfio/commit/8d7d70eb8e714fbf8710848a06f8cab0c741631e
> > 
> > That call includes an iommu_present() check, so if there's no iommu or
> > the iommu can't provide a groupid, the device is skipped over from vfio
> > (can't be used).
> > 
> > So the ordering is:
> > 
> >  - bus driver registers device
> >    - if it has an iommu group, add it to the vfio device/group tracking
> > 
> >  - group gets opened
> >    - user gets iommu or device fd results in iommu_domain_alloc
> > 
> > Devices without iommu_ops don't get to play in the vfio world.
> 
> Right, and I think the answer of which devices get iommu_ops is done via
> bus_set_iommu.

Exactly.

> (Thinking in long-term of what would be required to make this work
> with Xen and it sounds like I will need to implement a Xen IOMMU driver)

Yeah, that would make sense.  Thanks!

Alex
Alex Williamson Nov. 17, 2011, 8:52 p.m. UTC | #32
On Wed, 2011-11-16 at 11:47 -0600, Scott Wood wrote:
> On 11/11/2011 04:10 PM, Alex Williamson wrote:
> > 
> > Thanks Konrad!  Comments inline.
> > 
> > On Fri, 2011-11-11 at 12:51 -0500, Konrad Rzeszutek Wilk wrote:
> >> On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
> >>> +When supported, as indicated by the device flags, reset the device.
> >>> +
> >>> +#define VFIO_DEVICE_RESET               _IO(';', 116)
> >>
> >> Does it disable the 'count'? Err, does it disable the IRQ on the
> >> device after this and one should call VFIO_DEVICE_SET_IRQ_EVENTFDS
> >> to set new eventfds? Or does it re-use the eventfds and the device
> >> is enabled after this?
> > 
> > It doesn't affect the interrupt programming.  Should it?
> 
> It should probably clear any currently pending interrupts, as if the
> unmask IOCTL were called.

Sounds reasonable.

> >>> +device tree properties of the device:
> >>> +
> >>> +struct vfio_dtpath {
> >>> +        __u32   len;            /* length of structure */
> >>> +        __u32   index;
> >>
> >> 0 based I presume?
> > 
> > Everything else is, I would assume so/
> 
> Yes, it should be zero-based -- this matches how such indices are done
> in the kernel device tree APIs.
> 
> >>> +        __u64   flags;
> >>> +#define VFIO_DTPATH_FLAGS_REGION        (1 << 0)
> >>
> >> What is region in this context?? Or would this make much more sense
> >> if I knew what Device Tree actually is.
> > 
> > Powerpc guys, any comments?  This was their suggestion.  These are
> > effectively the first device specific extension, available when
> > VFIO_DEVICE_FLAGS_DT is set.
> 
> An assigned device may consist of an entire subtree of the device tree,
> and both register banks and interrupts can come from any node in the
> tree.  Region versus IRQ here indicates the context in which to
> interpret index, in order to retrieve the path of the node that supplied
> this particular region or IRQ.

Ok.  Thanks for the clarification.  We'll wait for the vfio-dt bus
driver before actually including this.

> >>> +};
> >>> +#define VFIO_DEVICE_GET_DTPATH          _IOWR(';', 117, struct vfio_dtpath)
> >>> +
> >>> +struct vfio_dtindex {
> >>> +        __u32   len;            /* length of structure */
> >>> +        __u32   index;
> >>> +        __u32   prop_type;
> >>
> >> Is that an enum type? Is this definied somewhere?
> >>> +        __u32   prop_index;
> >>
> >> What is the purpose of this field?
> > 
> > Need input from powerpc folks here
> 
> To identify what this resource (register bank or IRQ) this is, we need
> both the path to the node and the index into the reg or interrupts
> property within the node.
> 
> We also need to distinguish reg from ranges, and interrupts from
> interrupt-map.  As you suggested elsewhere in the thread, the device
> tree API should probably be left out for now, and added later along with
> the device tree "bus" driver.

Yep, I'll do that.

> >>> +static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,
> >>> +				    struct vfio_device *device)
> >>> +{
> >>> +	BUG_ON(!iommu->domain && device->attached);
> >>
> >> Whoa. Heavy hammer there.
> >>
> >> Perhaps WARN_ON as you do check it later on.
> > 
> > I think it's warranted, internal consistency is broken if we have a
> > device that thinks it's attached to an iommu domain that doesn't exist.
> > It should, of course, never happen and this isn't a performance path.
> > 
> [snip]
> >>> +static int __vfio_iommu_attach_dev(struct vfio_iommu *iommu,
> >>> +				   struct vfio_device *device)
> >>> +{
> >>> +	int ret;
> >>> +
> >>> +	BUG_ON(device->attached);
> >>
> >> How about:
> >>
> >> WARN_ON(device->attached, "The engineer who wrote the user-space device driver is trying to register
> >> the device again! Tell him/her to stop please.\n");
> > 
> > I would almost demote this one to a WARN_ON, but userspace isn't in
> > control of attaching and detaching devices from the iommu.  That's a
> > side effect of getting the iommu or device file descriptor.  So again,
> > this is an internal consistency check and it should never happen,
> > regardless of userspace.
> 
> The rule isn't to use BUG for internal consistency checks and WARN for
> stuff userspace can trigger, but rather to use BUG if you cannot
> reasonably continue, WARN for "significant issues that need prompt
> attention" that are reasonably recoverable.  Most instances of WARN are
> internal consistency checks.

That makes sense.

> From include/asm-generic/bug.h:
> > If you're tempted to BUG(), think again:  is completely giving up
> > really the *only* solution?  There are usually better options, where
> > users don't need to reboot ASAP and can mostly shut down cleanly.

Ok, I'll make a cleanup pass of demoting BUG_ONs to WARN_ONs.  Thanks,

Alex
Scott Wood Nov. 17, 2011, 8:56 p.m. UTC | #33
On Thu, Nov 17, 2011 at 01:22:17PM -0700, Alex Williamson wrote:
> On Wed, 2011-11-16 at 11:52 -0500, Konrad Rzeszutek Wilk wrote:
> > On Fri, Nov 11, 2011 at 03:10:56PM -0700, Alex Williamson wrote:
> > What would be the return value if somebody tried to unmask an edge one?
> > Should that be documented here? -ENOSPEC?
> 
> I would assume EINVAL or EFAULT since the user is providing an invalid
> argument/bad address.

EINVAL.  EFAULT is normally only used when the user passes a bad
virtual memory address to the kernel.  This isn't an address at all, it's
an index that points to an object for which this operation does not make
sense.

-Scott
Alex Williamson Nov. 18, 2011, 8:32 p.m. UTC | #34
On Thu, 2011-11-17 at 11:02 +1100, David Gibson wrote:
> On Tue, Nov 15, 2011 at 11:01:28AM -0700, Alex Williamson wrote:
> > On Tue, 2011-11-15 at 17:34 +1100, David Gibson wrote:
> > > On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
<snip>
> > > > +Groups, Devices, IOMMUs, oh my
> > > > +-------------------------------------------------------------------------------
> > > > +
> > > > +A fundamental component of VFIO is the notion of IOMMU groups.  IOMMUs
> > > > +can't always distinguish transactions from each individual device in
> > > > +the system.  Sometimes this is because of the IOMMU design, such as with
> > > > +PEs, other times it's caused by the I/O topology, for instance a
> > > > +PCIe-to-PCI bridge masking all devices behind it.  We call the sets of
> > > > +devices created by these restrictions IOMMU groups (or just "groups" for
> > > > +this document).
> > > > +
> > > > +The IOMMU cannot distinguish transactions between the individual devices
> > > > +within the group, therefore the group is the basic unit of ownership for
> > > > +a userspace process.  Because of this, groups are also the primary
> > > > +interface to both devices and IOMMU domains in VFIO.
> > > > +
> > > > +The VFIO representation of groups is created as devices are added into
> > > > +the framework by a VFIO bus driver.  The vfio-pci module is an example
> > > > +of a bus driver.  This module registers devices along with a set of bus
> > > > +specific callbacks with the VFIO core.  These callbacks provide the
> > > > +interfaces later used for device access.  As each new group is created,
> > > > +as determined by iommu_device_group(), VFIO creates a /dev/vfio/$GROUP
> > > > +character device.
> > > 
> > > Ok.. so, the fact that it's called "vfio-pci" suggests that the VFIO
> > > bus driver is per bus type, not per bus instance.   But grouping
> > > constraints could be per bus instance, if you have a couple of
> > > different models of PCI host bridge with IOMMUs of different
> > > capabilities built in, for example.
> > 
> > Yes, vfio-pci manages devices on the pci_bus_type; per type, not per bus
> > instance.
> 
> Ok, how can that work.  vfio-pci is responsible for generating the
> groupings, yes?  For which it needs to know the iommu/host bridge's
> isolation capabilities, which vary depending on the type of host
> bridge.

No, grouping is done at the iommu driver level.  vfio gets groupings via
iommu_device_group(), which uses the iommu_ops for the bus_type of the
requested device.  I'll attempt to clarify where groups come from in the
documentation.
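
Roughly, when a bus driver hands vfio a device, vfio does something
like this (a sketch, assuming the proposed iommu_device_group()
interface):

unsigned int groupid;

if (iommu_device_group(dev, &groupid))
	return 0;	/* no iommu_ops or no device_group: skip the device */

/* dev joins (or creates) the vfio group for groupid on dev->bus */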

> >  IOMMUs also register drivers per bus type, not per bus
> > instance.  The IOMMU driver is free to impose any constraints it wants.
> > 
> > > > +In addition to the device enumeration and callbacks, the VFIO bus driver
> > > > +also provides a traditional device driver and is able to bind to devices
> > > > +on it's bus.  When a device is bound to the bus driver it's available to
> > > > +VFIO.  When all the devices within a group are bound to their bus drivers,
> > > > +the group becomes "viable" and a user with sufficient access to the VFIO
> > > > +group chardev can obtain exclusive access to the set of group devices.
> > > > +
> > > > +As documented in linux/vfio.h, several ioctls are provided on the
> > > > +group chardev:
> > > > +
> > > > +#define VFIO_GROUP_GET_FLAGS            _IOR(';', 100, __u64)
> > > > + #define VFIO_GROUP_FLAGS_VIABLE        (1 << 0)
> > > > + #define VFIO_GROUP_FLAGS_MM_LOCKED     (1 << 1)
> > > > +#define VFIO_GROUP_MERGE                _IOW(';', 101, int)
> > > > +#define VFIO_GROUP_UNMERGE              _IOW(';', 102, int)
> > > > +#define VFIO_GROUP_GET_IOMMU_FD         _IO(';', 103)
> > > > +#define VFIO_GROUP_GET_DEVICE_FD        _IOW(';', 104, char *)
> > > > +
> > > > +The last two ioctls return new file descriptors for accessing
> > > > +individual devices within the group and programming the IOMMU.  Each of
> > > > +these new file descriptors provide their own set of file interfaces.
> > > > +These ioctls will fail if any of the devices within the group are not
> > > > +bound to their VFIO bus driver.  Additionally, when either of these
> > > > +interfaces are used, the group is then bound to the struct_mm of the
> > > > +caller.  The GET_FLAGS ioctl can be used to view the state of the group.
> > > > +
> > > > +When either the GET_IOMMU_FD or GET_DEVICE_FD ioctls are invoked, a
> > > > +new IOMMU domain is created and all of the devices in the group are
> > > > +attached to it.  This is the only way to ensure full IOMMU isolation
> > > > +of the group, but potentially wastes resources and cycles if the user
> > > > +intends to manage multiple groups with the same set of IOMMU mappings.
> > > > +VFIO therefore provides a group MERGE and UNMERGE interface, which
> > > > +allows multiple groups to share an IOMMU domain.  Not all IOMMUs allow
> > > > +arbitrary groups to be merged, so the user should assume merging is
> > > > +opportunistic.
> > > 
> > > I do not think "opportunistic" means what you think it means..
> > > 
> > > >  A new group, with no open device or IOMMU file
> > > > +descriptors, can be merged into an existing, in-use, group using the
> > > > +MERGE ioctl.  A merged group can be unmerged using the UNMERGE ioctl
> > > > +once all of the device file descriptors for the group being merged
> > > > +"out" are closed.
> > > > +
> > > > +When groups are merged, the GET_IOMMU_FD and GET_DEVICE_FD ioctls are
> > > > +essentially fungible between group file descriptors (ie. if device A
> > > 
> > > IDNT "fungible" MWYTIM, either.
> > 
> > Hmm, feel free to suggest.  Maybe we're hitting .us vs .au connotation.
> 
> In any case, I don't think it's a word whose meaning is unambiguous
> enough to use here.
> 
> > > > +is in group X, and X is merged with Y, a file descriptor for A can be
> > > > +retrieved using GET_DEVICE_FD on Y.  Likewise, GET_IOMMU_FD returns a
> > > > +file descriptor referencing the same internal IOMMU object from either
> > > > +X or Y).  Merged groups can be dissolved either explictly with UNMERGE
> > > > +or automatically when ALL file descriptors for the merged group are
> > > > +closed (all IOMMUs, all devices, all groups).
> > > 
> > > Blech.  I'm really not liking this merge/unmerge API as it stands,
> > > it's horribly confusing.  At the very least, we need some better
> > > terminology.  We need some term for the metagroups; supergroups; iommu
> > > domains or-at-least-they-will-be-once-we-open-the-iommu or
> > > whathaveyous.
> > > 
> > > The first confusing thing about this interface is that each open group
> > > handle actually refers to two different things; the original group you
> > > opened and the metagroup it's a part of.  For the GET_IOMMU_FD and
> > > GET_DEVICE_FD operations, you're using the metagroup and two "merged"
> > > group handles are interchangeable.
> > 
> > Fungible, even ;)
> > 
> > > For other MERGE and especially
> > > UNMERGE operations, it matters which is the original group.
> > 
> > If I stick two LEGO blocks together, I need to identify the individual
> > block I want to remove to pull them back apart...
> 
> Yeah, I'm starting to get my head around the model, but the current
> description of it doesn't help very much.  In particular the terms
> "merge" and "unmerge" lead one to the wrong mental model, I think.
> 
> > > The semantics of "merge" and "unmerge" under those names are really
> > > non-obvious.  Merge kind of has to merge two whole metagroups, but
> > > it's unclear if unmerge reverses one merge, or just takes out one
> > > (atom) group.  These operations need better names, at least.
> > 
> > Christian suggested a change to UNMERGE that we do not need to
> > specify a group to unmerge "from".  This makes it more like a list
> > implementation except there's no defined list_head.  Any member of the
> > list can pull in a new entry.  Calling UNMERGE on any member extracts
> > that member.
> 
> I think that's a good idea, but "unmerge" is not a good word for it.

I can't think of anything better; if you can, please suggest.

> > > Then it's unclear what order you can do various operations, and which
> > > order you can open and close various things.  You can kind of figure
> > > it out but it takes far more thinking than it should.
> > > 
> > > 
> > > So at the _very_ least, we need to invent new terminology and find a
> > > much better way of describing this API's semantics.  I still think an
> > > entirely different interface, where metagroups are created from
> > > outside with a lifetime that's not tied to an fd would be a better
> > > idea.
> > 
> > As we've discussed previously, configfs provides part of this, but has
> > no ioctl support.  It doesn't make sense to me to go play with groups in
> > configfs, but then still interact with them via a char dev.
> 
> Why not?  You configure, say, loopback devices with losetup, then use
> them as a block device.  Similar with nbd.  You can configure serial
> devices with setserial, then use them as a char dev.
> 
> >  It also
> > splits the ownership model 
> 
> I'm not even sure what that means.
> 
> > and makes it harder to enforce who gets to
> > interact with the devices vs who gets to manipulate groups.
> 
> How so.

Let's map out what a configfs interface would look like; maybe I'll
convince myself it's on the table.  We'd probably start with

/config/vfio/$bus_type.name/

That would probably be pre-populated with a bunch of $groupid files,
matching /dev/vfio/$bus_type.name/$groupid char dev files (assuming
configfs can pre-populate files).  To make a user defined group, we
might then do:

mkdir /config/vfio/$bus_type.name/my_group

That would generate a /dev/vfio/$bus_type.name/my_group char dev.  To
add groups to the new my_group "super group", we'd need to do something
like:

ln -s /config/vfio/$bus_type.name/$groupidA /config/vfio/$bus_type.name/my_group/nic_group

I might then add a second group as:

ln -s /config/vfio/$bus_type.name/$groupidB /config/vfio/$bus_type.name/my_group/hba_group

Either link could fail if the target group is not viable, the group is
already in use, or the second link could fail if the iommu domains were
incompatible.

Do these links cause /dev/vfio/$bus_type.name/{$groupidA,$groupidB} to
disappear?  If not, do we allow them to be opened?  Linking would also
have to fail if we later tried to link one of these groupids to a
different super group.

Now we want to give my_group to a user, so we have to go back to /dev
and

chown $user /dev/vfio/$bus_type.name/my_group

At this point my_group would have the existing set of group ioctls sans
{UN}MERGE, of course.

So $user can use the super group, but not manipulate its members.  Do
we then allow:

chown $user /config/vfio/$bus_type.name/my_group

If so, what does it imply about the user then doing:

ln -s /config/vfio/$bus_type.name/$groupidC /config/vfio/$bus_type.name/my_group/stolen_group

Would we instead need to chown the configfs groups as well as the super
group?

chown $user /config/vfio/$bus_type.name/my_group
chown $user /config/vfio/$bus_type.name/$groupidA
chown $user /config/vfio/$bus_type.name/$groupidB

ie:

# chown $user:$user /config/vfio/$bus_type.name/$groupC
$ ln -s /config/vfio/$bus_type.name/$groupidC /config/vfio/$bus_type.name/my_group/given_group

(linking has to look at the permissions of the target as well as the
link name)

Now that we've introduced ownership of configfs entries, what
does that imply about the char dev entries?  For instance, can $userA
own /dev/vfio/$bus_type.name/$groupidA, but $userB own the configfs
file?  We also have another security consideration that an exploit on
the host might allow a 3rd party to insert a device into a group.

This is where I start to get lost in the complexity versus simply giving
the user permissions for the char dev and allowing them to stick groups
together so long as they have permissions for the group.

We also add an entire filesystem to the interface that already spans
sysfs, dev, eventfds and potentially netlink.

If terminology is the complaint against the {UN}MERGE ioctl interface,
I'm still not sold that configfs is the answer.  /me goes to the
thesaurus... amalgamate? blend? combine? cement? unite? join?

> >  The current
> > model really isn't that complicated, imho.  As always, feel free to
> > suggest specific models.  If you have a specific terminology other than
> > MERGE, please suggest.
> > 
> > > Now, you specify that you can't use a group as the second argument of
> > > a merge if it already has an open iommu, but it's not clear from the
> > > doc if you can merge things into a group with an open iommu.
> > 
> > From above:
> > 
> >         A new group, with no open device or IOMMU file descriptors, can
> >         be merged into an existing, in-use, group using the MERGE ioctl.
> >                                  ^^^^^^
> > 
> > > Banning
> > > this would make life simpler, because the IOMMU's effective
> > > capabilities may change if you add more devices to the domain.  That's
> > > yet another non-obvious constraint in the interface ordering, though.
> > 
> > Banning this would prevent using merged groups with hotplug, which I
> > consider to be a primary use case.
> 
> Yeah, fair enough, based on your later comments w.r.t. only combining
> feature compatible groups.
> 
> > > > +The IOMMU file descriptor provides this set of ioctls:
> > > > +
> > > > +#define VFIO_IOMMU_GET_FLAGS            _IOR(';', 105, __u64)
> > > > + #define VFIO_IOMMU_FLAGS_MAP_ANY       (1 << 0)
> > > > +#define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
> > > > +#define VFIO_IOMMU_UNMAP_DMA            _IOWR(';', 107, struct vfio_dma_map)
> > > > +
> > > > +The GET_FLAGS ioctl returns basic information about the IOMMU domain.
> > > > +We currently only support IOMMU domains that are able to map any
> > > > +virtual address to any IOVA.  This is indicated by the MAP_ANY
> > > > flag.
> > > 
> > > So.  I tend to think of an IOMMU mapping IOVAs to memory pages, rather
> > > than memory pages to IOVAs.  
> > 
> > I do too, not sure why I wrote it that way, will fix.
> > 
> > > The IOMMU itself, of course maps to
> > > physical addresses, and the meaning of "virtual address" in this
> > > context is not really clear.  I think you would be better off saying
> > > the IOMMU can map any IOVA to any memory page.  From a hardware POV
> > > that means any physical address, but of course for a VFIO user a page
> > > is specified by its process virtual address.
> > 
> > Will fix.
> > 
> > > I think we need to pin exactly what "MAP_ANY" means down better.  Now,
> > > VFIO is pretty much a lost cause if you can't map any normal process
> > > memory page into the IOMMU, so I think the only thing that is really
> > > covered is IOVAs.  But saying "can map any IOVA" is not clear, because
> > > if you can't map it, it's not a (valid) IOVA.  Better to say that
> > > IOVAs can be any 64-bit value, which I think is what you really mean
> > > here.
> > 
> > ok
> > 
> > > Of course, since POWER is a platform where this is *not* true, I'd
> > > prefer to have something giving the range of valid IOVAs in the core
> > > to start with.
> > 
> > Since iommu_ops does not yet have any concept of this (nudge, nudge), I
> > figured this would be added later.  A possible implementation would be
> > that such an iommu would not set MAP_ANY, would add a new flag for
> > MAP_RANGE, and provide a new VFIO_IOMMU_GET_RANGE_INFO ioctl to describe
> > it.  I'm guaranteed to get it wrong if I try to predict all your needs.
> 
> Hrm.  "ANY" just really bothers me because "any iova" is not as clear
> a concept as it first appears.  For starters it's actually "any page
> aligned" at the very least.  But then it's only any 64-bit address for
> busses which have full 64-bit addressing (and I do wonder if there are
> any north bridges out there that forgot to implement some of the upper
> PCI address bits properly, given that 64-bit CPUs rarely actually
> implement more than 40-something physical address bits in practice).
> 
> I'd prefer to see at least something to advertise min and max IOVA and
> IOVA alignment.  That's enough to cover x86 and POWER, including
> possible variants with an IOMMU page size different to the system page
> size (note that POWER kernels can have 64k pages as a config option,
> which means a TCE page size different to the system page size is quite
> common).
> 
> Obviously there could be more complex constraints that we would need
> to advertise with option bits.

x86 has limitations as well.  I don't think most x86 IOMMUs support a
full 64bit IOVA space, so point taken.

struct vfio_iommu_info {
	__u64	len;	/* or structlen/arglen */
	__u64	flags;	/* replaces VFIO_IOMMU_GET_FLAGS, none defined yet */
	__u64	iova_max;
	__u64	iova_min;
	__u64	granularity;
};
	
#define VFIO_IOMMU_GET_INFO              _IOR(';', xxx, struct vfio_iommu_info)

Is granularity the minimum granularity, typically PAGE_SIZE barring
special configurations noted above, or is it a bitmap of supported
granularities?  Ex. If we support 4k normal pages and 2M large pages, we
might set bits 12 and 21.
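
For illustration only (nothing below is in the patch; the helper is
purely hypothetical and assumes <stdio.h>/<stdint.h>), the bitmap
reading would let userspace walk the supported sizes like this:

static void print_supported_sizes(uint64_t granularity)
{
	int shift;

	for (shift = 0; shift < 64; shift++)
		if (granularity & (1ULL << shift))
			printf("supported mapping size: %llu bytes\n",
			       1ULL << shift);
}

whereas the minimum reading is a single alignment that every IOVA and
mapping size must be a multiple of.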

> > > > +
> > > > +The (UN)MAP_DMA commands make use of struct vfio_dma_map for mapping
> > > > +and unmapping IOVAs to process virtual addresses:
> > > > +
> > > > +struct vfio_dma_map {
> > > > +        __u64   len;            /* length of structure */
> > > 
> > > Thanks for adding these structure length fields.  But I think they
> > > should be called something other than 'len', which is likely to be
> > > confused with size (or some other length that's actually related to
> > > the operation's parameters).  Better to call it 'structlen' or
> > > 'argslen' or something.
> > 
> > Ok.  As Scott noted, I've failed to implement these in a way that
> > actually allows extension, but I'll work on it.
> 
> Right.  I had failed to realise quite how the encoding of structure
> size into the ioctl worked.  With that in place, arguably we don't
> really need the size in the structure itself, because we can still
> have multiple sized versions of the ioctl.  Still, whichever.

Hmm, that might be cleaner than eliminating the size with just using
_IO().  So we might have something like:

#define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
#define VFIO_IOMMU_MAP_DMA_V2           _IOWR(';', 106, struct vfio_dma_map_v2)

For which the driver might do:

case VFIO_IOMMU_MAP_DMA:
case VFIO_IOMMU_MAP_DMA_V2:
{
	struct vfio_dma_map map;

	/* We don't care about the extra v2 bits */
	if (copy_from_user(&map, (void __user *)arg, sizeof map))
		return -EFAULT;
...

That presumes v2 is compatible other than extra fields.  Any objections
(this gets rid of length from all ioctl passed structs)?

> > > > +        __u64   vaddr;          /* process virtual addr */
> > > > +        __u64   dmaaddr;        /* desired and/or returned dma address */
> > > > +        __u64   size;           /* size in bytes */
> > > > +        __u64   flags;
> > > > +#define VFIO_DMA_MAP_FLAG_WRITE         (1 << 0) /* req writeable DMA mem */
> > > 
> > > Make it independent READ and WRITE flags from the start.  Not all
> > > combinations will be be valid on all hardware, but that way we have
> > > the possibilities covered without having to use strange encodings
> > > later.
> > 
> > Ok.
> > 
> > > > +};
> > > > +
> > > > +Current users of VFIO use relatively static DMA mappings, not requiring
> > > > +high frequency turnover.  As new users are added, it's expected that the
> > > > +IOMMU file descriptor will evolve to support new mapping interfaces, this
> > > > +will be reflected in the flags and may present new ioctls and file
> > > > +interfaces.
> > > > +
> > > > +The device GET_FLAGS ioctl is intended to return basic device type and
> > > > +indicate support for optional capabilities.  Flags currently include whether
> > > > +the device is PCI or described by Device Tree, and whether the RESET ioctl
> > > > +is supported:
> > > > +
> > > > +#define VFIO_DEVICE_GET_FLAGS           _IOR(';', 108, __u64)
> > > > + #define VFIO_DEVICE_FLAGS_PCI          (1 << 0)
> > > > + #define VFIO_DEVICE_FLAGS_DT           (1 << 1)
> > > 
> > > TBH, I don't think the VFIO for DT stuff is mature enough yet to be in
> > > an initial infrastructure patch, though we should certainly be
> > > discussing it as an add-on patch.
> > 
> > I agree for DT, and PCI should be added with vfio-pci, not the initial
> > core.
> > 
> > > > + #define VFIO_DEVICE_FLAGS_RESET        (1 << 2)
> > > > +
> > > > +The MMIO and IOP resources used by a device are described by regions.
> > > > +The GET_NUM_REGIONS ioctl tells us how many regions the device supports:
> > > > +
> > > > +#define VFIO_DEVICE_GET_NUM_REGIONS     _IOR(';', 109, int)
> > > > +
> > > > +Regions are described by a struct vfio_region_info, which is retrieved by
> > > > +using the GET_REGION_INFO ioctl with vfio_region_info.index field set to
> > > > +the desired region (0 based index).  Note that devices may implement zero
> > > > +sized regions (vfio-pci does this to provide a 1:1 BAR to region index
> > > > +mapping).
> > > 
> > > So, I think you're saying that a zero-sized region is used to encode a
> > > NOP region, that is, to basically put a "no region here" in between
> > > valid region indices.  You should spell that out.
> > 
> > Ok.
> > 
> > > [Incidentally, any chance you could borrow one of RH's tech writers
> > > for this?  I'm afraid you seem to lack the knack for clear and easily
> > > read documentation]
> > 
> > Thanks for the encouragement :-\  It's no wonder there isn't more
> > content in Documentation.
> 
> Sigh.  Alas, yes.
> 
> > > > +struct vfio_region_info {
> > > > +        __u32   len;            /* length of structure */
> > > > +        __u32   index;          /* region number */
> > > > +        __u64   size;           /* size in bytes of region */
> > > > +        __u64   offset;         /* start offset of region */
> > > > +        __u64   flags;
> > > > +#define VFIO_REGION_INFO_FLAG_MMAP              (1 << 0)
> > > > +#define VFIO_REGION_INFO_FLAG_RO                (1 << 1)
> > > 
> > > Again having separate read and write bits from the start will save
> > > strange encodings later.
> > 
> > Seems highly unlikely, but we have bits to waste...
> > 
> > > > +#define VFIO_REGION_INFO_FLAG_PHYS_VALID        (1 << 2)
> > > > +        __u64   phys;           /* physical address of region */
> > > > +};
> > > 
> > > I notice there is no field for "type" e.g. MMIO vs. PIO vs. config
> > > space for PCI.  If you added that having a NONE type might be a
> > > clearer way of encoding a non-region than just having size==0.
> > 
> > I thought there was some resistance to including MMIO and PIO bits in
> > the flags.  If that's passed, I can add it, but PCI can determine this
> > through config space (and vfio-pci exposes config space at a fixed
> > index).  Having a region w/ size == 0, MMIO and PIO flags unset seems a
> > little redundant if that's the only reason for having them.  A NONE flag
> > doesn't make sense to me.  Config space isn't NONE, but neither is it
> > MMIO nor PIO; and someone would probably be offended about even
> > mentioning PIO in the specification.
> 
> No, my concept was that NONE would be used for the indexes where there
> is no valid BAR.  I'll buy your argument on why not to include the PCI
> (or whatever) address space type here.
> 
> What I'm just a bit concerned by is whether we could have a case (not
> for PCI) of a real resource that still has size 0 - e.g. maybe some
> sort of doorbell that can't be read or written, but can be triggered
> some other way.  I guess that's probably unlikely though.

Right, and if somehow you had such a region where the size is zero, but
allowed some kind of operation on it, we could define a flag for it.

> > > > +
> > > > +#define VFIO_DEVICE_GET_REGION_INFO     _IOWR(';', 110, struct vfio_region_info)
> > > > +
> > > > +The offset indicates the offset into the device file descriptor which
> > > > +accesses the given range (for read/write/mmap/seek).  Flags indicate the
> > > > +available access types and validity of optional fields.  For instance
> > > > +the phys field may only be valid for certain devices types.
> > > > +
> > > > +Interrupts are described using a similar interface.  GET_NUM_IRQS
> > > > +reports the number or IRQ indexes for the device.
> > > > +
> > > > +#define VFIO_DEVICE_GET_NUM_IRQS        _IOR(';', 111, int)
> > > > +
> > > > +struct vfio_irq_info {
> > > > +        __u32   len;            /* length of structure */
> > > > +        __u32   index;          /* IRQ number */
> > > > +        __u32   count;          /* number of individual IRQs */
> > > 
> > > Is there a reason for allowing irqs in batches like this, rather than
> > > having each MSI be reflected by a separate irq_info?
> > 
> > Yes, bus drivers like vfio-pci can define index 1 as the MSI info
> > structure and index 2 as MSI-X.  There's really no need to expose 57
> > individual MSI interrupts and try to map them to the correct device
> > specific MSI type if they can only logically be enabled in two distinct
> > groups.  Bus drivers with individually controllable MSI vectors are free
> > to expose them separately.  I assume device tree paths would help
> > associate an index to a specific interrupt.
> 
> Ok, fair enough.
> 
> > > > +        __u64   flags;
> > > > +#define VFIO_IRQ_INFO_FLAG_LEVEL                (1 << 0)
> > > > +};
> > > > +
> > > > +Again, zero count entries are allowed (vfio-pci uses a static interrupt
> > > > +type to index mapping).
> > > 
> > > I know what you mean, but you need a clearer way to express it.
> > 
> > I'll work on it.
> > 
> > > > +Information about each index can be retrieved using the GET_IRQ_INFO
> > > > +ioctl, used much like GET_REGION_INFO.
> > > > +
> > > > +#define VFIO_DEVICE_GET_IRQ_INFO        _IOWR(';', 112, struct vfio_irq_info)
> > > > +
> > > > +Individual indexes can describe single or sets of IRQs.  This provides the
> > > > +flexibility to describe PCI INTx, MSI, and MSI-X using a single interface.
> > > > +
> > > > +All VFIO interrupts are signaled to userspace via eventfds.  Integer arrays,
> > > > +as shown below, are used to pass the IRQ info index, the number of eventfds,
> > > > +and each eventfd to be signaled.  Using a count of 0 disables the interrupt.
> > > > +
> > > > +/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
> > > > +#define VFIO_DEVICE_SET_IRQ_EVENTFDS    _IOW(';', 113, int)
> > > > +
> > > > +When a level triggered interrupt is signaled, the interrupt is masked
> > > > +on the host.  This prevents an unresponsive userspace driver from
> > > > +continuing to interrupt the host system.  After servicing the interrupt,
> > > > +UNMASK_IRQ is used to allow the interrupt to retrigger.  Note that level
> > > > +triggered interrupts implicitly have a count of 1 per index.
> > > 
> > > This is a silly restriction.  Even PCI devices can have up to 4 LSIs
> > > on a function in theory, though no-one ever does.  Embedded devices
> > > can and do have multiple level interrupts.
> > 
> > Per the PCI spec, an individual PCI function can only ever have, at
> > most, a single INTx line.  A multi-function *device* can have up to 4
> > INTx lines, but what we're exposing here is a struct device, ie. a PCI
> > function.
> 
> Ah, my mistake.
> 
> > Other devices could certainly have multiple level interrupts, and if
> > grouping them as we do with MSI on PCI makes sense, please let me know.
> > I just didn't see the value in making the unmask operations handle
> > sub-indexes if it's not needed.
> 
> I don't know of anything off hand.  But I can't see any consideration
> that would make it unlikely either.  I generally don't trust anything
> *not* to exist in embedded space.

Fair enough.  Level IRQs are still triggered individually, so unmasking
is too, which means UNMASK_IRQ takes something like { int index; int
subindex }.

SET_UNMASK_IRQ_EVENTFDS should follow SET_IRQ_EVENTFDS and take { int
index; int count; int fds[] }.
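
Spelled out as structs, that would be roughly (sketch only, not from the
patch; names are placeholders):

struct vfio_irq_unmask {
	int	index;
	int	subindex;
};

struct vfio_irq_eventfds {
	int	index;
	int	count;
	int	fds[];		/* 'count' eventfd file descriptors */
};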

> > > > +
> > > > +/* Unmask IRQ index, arg[0] = index */
> > > > +#define VFIO_DEVICE_UNMASK_IRQ          _IOW(';', 114, int)
> > > > +
> > > > +Level triggered interrupts can also be unmasked using an irqfd.  Use
> > > > +SET_UNMASK_IRQ_EVENTFD to set the file descriptor for this.
> > > > +
> > > > +/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
> > > > +#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD      _IOW(';', 115, int)
> > > > +
> > > > +When supported, as indicated by the device flags, reset the device.
> > > > +
> > > > +#define VFIO_DEVICE_RESET               _IO(';', 116)
> > > > +
> > > > +Device tree devices also invlude ioctls for further defining the
> > > > +device tree properties of the device:
> > > > +
> > > > +struct vfio_dtpath {
> > > > +        __u32   len;            /* length of structure */
> > > > +        __u32   index;
> > > > +        __u64   flags;
> > > > +#define VFIO_DTPATH_FLAGS_REGION        (1 << 0)
> > > > +#define VFIO_DTPATH_FLAGS_IRQ           (1 << 1)
> > > > +        char    *path;
> > > > +};
> > > > +#define VFIO_DEVICE_GET_DTPATH          _IOWR(';', 117, struct vfio_dtpath)
> > > > +
> > > > +struct vfio_dtindex {
> > > > +        __u32   len;            /* length of structure */
> > > > +        __u32   index;
> > > > +        __u32   prop_type;
> > > > +        __u32   prop_index;
> > > > +        __u64   flags;
> > > > +#define VFIO_DTINDEX_FLAGS_REGION       (1 << 0)
> > > > +#define VFIO_DTINDEX_FLAGS_IRQ          (1 << 1)
> > > > +};
> > > > +#define VFIO_DEVICE_GET_DTINDEX         _IOWR(';', 118, struct vfio_dtindex)
> > > > +
> > > > +
> > > > +VFIO bus driver API
> > > > +-------------------------------------------------------------------------------
> > > > +
> > > > +Bus drivers, such as PCI, have three jobs:
> > > > + 1) Add/remove devices from vfio
> > > > + 2) Provide vfio_device_ops for device access
> > > > + 3) Device binding and unbinding
> > > > +
> > > > +When initialized, the bus driver should enumerate the devices on it's
> > > 
> > > s/it's/its/
> > 
> > Noted.
> > 
> > <snip>
> > > > +/* Unmap DMA region */
> > > > +/* dgate must be held */
> > > > +static int __vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
> > > > +			    int npage, int rdwr)
> > > 
> > > Use of "read" and "write" in DMA can often be confusing, since it's
> > > not always clear if you're talking from the perspective of the CPU or
> > > the device (_writing_ data to a device will usually involve it doing
> > > DMA _reads_ from memory).  It's often best to express things as DMA
> > > direction, 'to device', and 'from device' instead.
> > 
> > Good point.
> 
> This, of course, potentially affects many areas of the code and doco.

I've changed vfio_iommu to use <linux/dma-direction.h> definitions
internally.  For the ioctl I've so far simply included WRITE and READ
flags, which I can clarify are from the device perspective.  Flags like
VFIO_DMA_MAP_FLAG_TO_DEVICE/FROM_DEVICE are actually more confusing to
me at this interface level.  We also have IOMMU_READ/IOMMU_WRITE which
makes me question using dma-direction.h and if we shouldn't just define
everything as from the device perspective.
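
If we settle on device-perspective flags, it might be as simple as
(sketch, bit values not final):

#define VFIO_DMA_MAP_FLAG_READ		(1 << 0)	/* device may read from the mapping */
#define VFIO_DMA_MAP_FLAG_WRITE	(1 << 1)	/* device may write to the mapping */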

> > > > +{
> > > > +	int i, unlocked = 0;
> > > > +
> > > > +	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> > > > +		unsigned long pfn;
> > > > +
> > > > +		pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
> > > > +		if (pfn) {
> > > > +			iommu_unmap(iommu->domain, iova, 0);
> > > > +			unlocked += put_pfn(pfn, rdwr);
> > > > +		}
> > > > +	}
> > > > +	return unlocked;
> > > > +}
> > > > +
> > > > +static void vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
> > > > +			   unsigned long npage, int rdwr)
> > > > +{
> > > > +	int unlocked;
> > > > +
> > > > +	unlocked = __vfio_dma_unmap(iommu, iova, npage, rdwr);
> > > > +	vfio_lock_acct(-unlocked);
> > > 
> > > Have you checked that your accounting will work out if the user maps
> > > the same memory page to multiple IOVAs?
> > 
> > Hmm, it probably doesn't.  We potentially over-penalize the user process
> > here.
> 
> Ok.

FWIW, I don't intend to fix this right now, but I have added a comment
in the code noting it.  We'll have to see if there's an efficient way to
make the tracking better.

> > > > +}
> > > > +
> > > > +/* Unmap ALL DMA regions */
> > > > +void vfio_iommu_unmapall(struct vfio_iommu *iommu)
> > > > +{
> > > > +	struct list_head *pos, *pos2;
> > > > +	struct dma_map_page *mlp;
> > > > +
> > > > +	mutex_lock(&iommu->dgate);
> > > > +	list_for_each_safe(pos, pos2, &iommu->dm_list) {
> > > > +		mlp = list_entry(pos, struct dma_map_page, list);
> > > > +		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
> > > > +		list_del(&mlp->list);
> > > > +		kfree(mlp);
> > > > +	}
> > > > +	mutex_unlock(&iommu->dgate);
> > > 
> > > Ouch, no good at all.  Keeping track of every DMA map is no good on
> > > POWER or other systems where IOMMU operations are a hot path.  I think
> > > you'll need an iommu specific hook for this instead, which uses
> > > whatever data structures are natural for the IOMMU.  For example a
> > > 1-level pagetable, like we use on POWER will just zero every entry.
> > 
> > It's already been noted in the docs that current users have relatively
> > static mappings and a performance interface is TBD for dynamically
> > backing streaming DMA.  The current vfio_iommu exposes iommu_ops, POWER
> > will need to come up with something to expose instead.
> 
> Right, but I'm not just talking about the current map/unmap calls
> themselves.  This infrastructure for tracking it looks like it's
> intended to be generic for all mapping methods.  If not, I can't see
> the reason for it, because I don't think the current interface
> requires such tracking inherently.

It does seem that way, but there is a purpose.  We need to unmap
everything on release.  It's easy to assume that iommu_domain_free()
will unmap everything from the IOMMU, which it does, but we've also done
a get_user_pages on each of those in vfio, which we need to cleanup.  We
can't rely on userspace to do this since they might have been SIGKILL'd.
Making it generic with coalescing of adjacent regions and such is
primarily for space efficiency.

<snip>
> > > > +#ifdef CONFIG_COMPAT
> > > > +static long vfio_iommu_compat_ioctl(struct file *filep,
> > > > +				    unsigned int cmd, unsigned long arg)
> > > > +{
> > > > +	arg = (unsigned long)compat_ptr(arg);
> > > > +	return vfio_iommu_unl_ioctl(filep, cmd, arg);
> > > 
> > > Um, this only works if the structures are exactly compatible between
> > > 32-bit and 64-bit ABIs.  I don't think that is always true.
> > 
> > I think all our structure sizes are independent of host width.  If I'm
> > missing something, let me know.
> 
> Ah, for structures, that might be true.  I was seeing the bunch of
> ioctl()s that take ints.

Ugh, I suppose you're thinking of an ILP64 platform with ILP32 compat
mode.  Darn it, guess we need to make everything 64bit, including file
descriptors.

<snip>
> > > > +
> > > > +/* Get a new iommu file descriptor.  This will open the iommu, setting
> > > > + * the current->mm ownership if it's not already set. */
> > > 
> > > I know I've had this explained to me several times before, but I've
> > > forgotten again.  Why do we need to wire the iommu to an mm?
> > 
> > We're mapping process virtual addresses into the IOMMU, so it makes
> > sense to restrict ourselves to a single virtual address space.  It also
> > enforces the ownership, that only a single mm is in control of the
> > group.
> 
> Neither of those seems conclusive to me, but I remember that I saw a
> strong reason earlier, even if I can't remember it now.

The point of the group is to provide a unit of ownership.  We can't let
$userA open $groupid and fetch a device, then have $userB do the same,
grabbing a different device.  The mappings will step on each other and
the devices have no isolation.  We can't restrict that purely by file
permissions or we'll have the same problem with sudo.  At one point we
discussed a single open instance, but that unnecessarily limits the
user, so we settled on the mm.  Thanks,

Alex
Scott Wood Nov. 18, 2011, 9:09 p.m. UTC | #35
On Fri, Nov 18, 2011 at 01:32:56PM -0700, Alex Williamson wrote:
> Hmm, that might be cleaner than eliminating the size with just using
> _IO().  So we might have something like:
> 
> #define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
> #define VFIO_IOMMU_MAP_DMA_V2           _IOWR(';', 106, struct vfio_dma_map_v2)
> 
> For which the driver might do:
> 
> case VFIO_IOMMU_MAP_DMA:
> case VFIO_IOMMU_MAP_DMA_V2:
> {
> 	struct vfio_dma_map map;
> 
> 	/* We don't care about the extra v2 bits */
> 	if (copy_from_user(&map, (void __user *)arg, sizeof map))
> 		return -EFAULT;

That won't work if you have an old kernel that doesn't know about v2, and
a new user that uses v2.  To make this work you'd have to strip out the
size from the ioctl number before switching (but still use it when
considering whether to access the v2 fields).  Simpler to just leave it
out of the ioctl number and put it in the struct field as currently
planned.
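
Something like the following sketch, purely illustrative (the v2 struct
and the 'len' structure-length field are the ones discussed above; the
rest of the names are made up):

case VFIO_IOMMU_MAP_DMA:
{
	struct vfio_dma_map_v2 map = { 0 };
	unsigned long minsz = sizeof(struct vfio_dma_map);	/* v1 fields */

	if (copy_from_user(&map, (void __user *)arg, minsz))
		return -EFAULT;

	if (map.len < minsz)
		return -EINVAL;

	/* pick up v2-only fields only if userspace says they're there */
	if (map.len >= sizeof(map) &&
	    copy_from_user(&map, (void __user *)arg, sizeof(map)))
		return -EFAULT;
...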

> > > I think all our structure sizes are independent of host width.  If I'm
> > > missing something, let me know.
> > 
> > Ah, for structures, that might be true.  I was seeing the bunch of
> > ioctl()s that take ints.
> 
> Ugh, I suppose you're thinking of an ILP64 platform with ILP32 compat
> mode.

Does Linux support ILP64?  There are "int" ioctls all over the place, and
I don't think we do compat wrappers for them.  In fact, some of the
ioctls in linux/fs.h use "int" for the compatible version of ioctls
originally defined as "long".

It's cleaner to always use the fixed types, though.

> Darn it, guess we need to make everything 64bit, including file
> descriptors.

What's wrong with __u32/__s32 (or uint32_t/int32_t)?

I really do not see Linux supporting an ABI that has no 32-bit type at
all, especially in a situation where userspace compatibility is needed. 
If that does happen, the ABI breakage will go well beyond VFIO.

> The point of the group is to provide a unit of ownership.  We can't let
> $userA open $groupid and fetch a device, then have $userB do the same,
> grabbing a different device.  The mappings will step on each other and
> the devices have no isolation.  We can't restrict that purely by file
> permissions or we'll have the same problem with sudo.

What is the problem with sudo?  If you're running processes as the same
user, regardless of how, they're going to be able to mess with each
other.

Is it possible to expose access to only specific groups via an
individually-permissionable /dev/device, so only the entity handing out
access to devices needs access to everything?

> At one point we discussed a single open instance, but that
> unnecessarily limits the user, so we settled on the mm.  Thanks,

It would be nice if this limitation weren't excessively integrated into
the design -- in the embedded space we've got unusual partitioning
setups, including failover arrangements where partitions share devices. 
The device may be configured with the IOMMU pointing only at regions that
are shared by both mms, or the non-shared regions may be reconfigured as
active ownership of the device gets handed around.

It would be up to userspace code to make sure that the mappings don't
"step on each other".  The mapping could be done with whichever mm issued
the map call for a given region.

For this use case, there is unlikely to be an issue with ownership
because there will not be separate privilege domains creating partitions
-- other use cases could refrain from enabling multiple-mm support unless
ownership issues are resolved.

This doesn't need to be supported initially, but we should try to avoid
letting the assumption permeate the code.

-Scott
David Gibson Nov. 21, 2011, 2:47 a.m. UTC | #36
On Fri, Nov 18, 2011 at 01:32:56PM -0700, Alex Williamson wrote:
> On Thu, 2011-11-17 at 11:02 +1100, David Gibson wrote:
> > On Tue, Nov 15, 2011 at 11:01:28AM -0700, Alex Williamson wrote:
> > > On Tue, 2011-11-15 at 17:34 +1100, David Gibson wrote:
> > > > On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
> <snip>
> > > > > +Groups, Devices, IOMMUs, oh my
> > > > > +-------------------------------------------------------------------------------
> > > > > +
> > > > > +A fundamental component of VFIO is the notion of IOMMU groups.  IOMMUs
> > > > > +can't always distinguish transactions from each individual device in
> > > > > +the system.  Sometimes this is because of the IOMMU design, such as with
> > > > > +PEs, other times it's caused by the I/O topology, for instance a
> > > > > +PCIe-to-PCI bridge masking all devices behind it.  We call the sets of
> > > > > +devices created by these restictions IOMMU groups (or just "groups" for
> > > > > +this document).
> > > > > +
> > > > > +The IOMMU cannot distiguish transactions between the individual devices
> > > > > +within the group, therefore the group is the basic unit of ownership for
> > > > > +a userspace process.  Because of this, groups are also the primary
> > > > > +interface to both devices and IOMMU domains in VFIO.
> > > > > +
> > > > > +The VFIO representation of groups is created as devices are added into
> > > > > +the framework by a VFIO bus driver.  The vfio-pci module is an example
> > > > > +of a bus driver.  This module registers devices along with a set of bus
> > > > > +specific callbacks with the VFIO core.  These callbacks provide the
> > > > > +interfaces later used for device access.  As each new group is created,
> > > > > +as determined by iommu_device_group(), VFIO creates a /dev/vfio/$GROUP
> > > > > +character device.
> > > > 
> > > > Ok.. so, the fact that it's called "vfio-pci" suggests that the VFIO
> > > > bus driver is per bus type, not per bus instance.   But grouping
> > > > constraints could be per bus instance, if you have a couple of
> > > > different models of PCI host bridge with IOMMUs of different
> > > > capabilities built in, for example.
> > > 
> > > Yes, vfio-pci manages devices on the pci_bus_type; per type, not per bus
> > > instance.
> > 
> > Ok, how can that work.  vfio-pci is responsible for generating the
> > groupings, yes?  For which it needs to know the iommu/host bridge's
> > isolation capabilities, which vary depending on the type of host
> > bridge.
> 
> No, grouping is done at the iommu driver level.  vfio gets groupings via
> iommu_device_group(), which uses the iommu_ops for the bus_type of the
> requested device.  I'll attempt to clarify where groups come from in the
> documentation.

Hrm, but still per bus type, not bus instance.  Hrm.  Yeah, I need to
look at the earlier iommu patches in more detail.

[snip]
> > Yeah, I'm starting to get my head around the model, but the current
> > description of it doesn't help very much.  In particular the terms
> > "merge" and "unmerge" lead one to the wrong mental model, I think.
> > 
> > > > The semantics of "merge" and "unmerge" under those names are really
> > > > non-obvious.  Merge kind of has to merge two whole metagroups, but
> > > > it's unclear if unmerge reverses one merge, or just takes out one
> > > > (atom) group.  These operations need better names, at least.
> > > 
> > > Christian suggested a change to UNMERGE that we do not need to
> > > specify a group to unmerge "from".  This makes it more like a list
> > > implementation except there's no defined list_head.  Any member of the
> > > list can pull in a new entry.  Calling UNMERGE on any member extracts
> > > that member.
> > 
> > I think that's a good idea, but "unmerge" is not a good word for it.
> 
> I can't think of better, if you can, please suggest.

Well, I think addgroup and removegroup would be better than merge and
unmerge, although they have their own problems.

> > > > Then it's unclear what order you can do various operations, and which
> > > > order you can open and close various things.  You can kind of figure
> > > > it out but it takes far more thinking than it should.
> > > > 
> > > > 
> > > > So at the _very_ least, we need to invent new terminology and find a
> > > > much better way of describing this API's semantics.  I still think an
> > > > entirely different interface, where metagroups are created from
> > > > outside with a lifetime that's not tied to an fd would be a better
> > > > idea.
> > > 
> > > As we've discussed previously, configfs provides part of this, but has
> > > no ioctl support.  It doesn't make sense to me to go play with groups in
> > > configfs, but then still interact with them via a char dev.
> > 
> > Why not?  You configure, say, loopback devices with losetup, then use
> > them as a block device.  Similar with nbd.  You can configure serial
> > devices with setserial, then use them as a char dev.
> > 
> > >  It also
> > > splits the ownership model 
> > 
> > I'm not even sure what that means.
> > 
> > > and makes it harder to enforce who gets to
> > > interact with the devices vs who gets to manipulate groups.
> > 
> > How so.
> 
> Let's map out what a configfs interface would look like, maybe I'll
> convince myself it's on the table.  We'd probably start with

Hrm, assuming we used configfs, which is not the only option.

> /config/vfio/$bus_type.name/
> 
> That would probably be pre-populated with a bunch of $groupid files,
> matching /dev/vfio/$bus_type.name/$groupid char dev files (assuming
> configfs can pre-populate files).  To make a user defined group, we
> might then do:
> 
> mkdir /config/vfio/$bus_type.name/my_group
> 
> That would generate a /dev/vfio/$bus_type.name/my_group char dev.  To
> add groups to the new my_group "super group", we'd need to do something
> like:
> 
> ln -s /config/vfio/$bus_type.name/$groupidA /config/vfio/$bus_type.name/my_group/nic_group
> 
> I might then add a second group as:
> 
> ln -s /config/vfio/$bus_type.name/$groupidB /config/vfio/$bus_type.name/my_group/hba_group
> 
> Either link could fail if the target group is not viable,

The link op shouldn't fail because the subgroup isn't viable.
Instead, the supergroup just won't be viable until all devices in all
subgroups are bound to vfio.

> the group is
> already in use, or the second link could fail if the iommu domains were
> incompatible.
> 
> Do these links cause /dev/vfio/$bus_type.name/{$groupidA,$groupidB} to
> disappear?  If not, do we allow them to be opened?  Linking would also
> have to fail if we later tried to link one of these groupids to a
> different super group.

Again, I think some confusion is coming in here from calling both the
hardware determined thing and the admin determined thing a "group".
So for now I'm going to call the first a "group" and the second a
"predomain" (because once it's viable and the right conditions are set
up it will become an iommu domain).

So another option is that "groups" *only* participate in the merging
interface; getting iommu and device handles occurs only on a
predomain.  Therefore there would be no /dev/vfio/$group, you would
have to configure a predomain with at least one group before you had a
device file.

> Now we want to give my_group to a user, so we have to go back to /dev
> and
> 
> chown $user /dev/vfio/$bus_type.name/my_group
> 
> At this point my_group would have the existing set of group ioctls sans
> {UN}MERGE, of course.
> 
> So $user can use the super group, but not manipulate its members.  Do
> we then allow:
> 
> chown $user /config/vfio/$bus_type.name/my_group
> 
> If so, what does it imply about the user then doing:
> 
> ln -s /config/vfio/$bus_type.name/$groupidC /config/vfio/$bus_type.name/my_group/stolen_group
> 
> Would we instead need to chown the configfs groups as well as the super
> group?
> 
> chown $user /config/vfio/$bus_type.name/my_group
> chown $user /config/vfio/$bus_type.name/$groupidA
> chown $user /config/vfio/$bus_type.name/$groupidB
> 
> ie:
> 
> # chown $user:$user /config/vfio/$bus_type.name/$groupC
> $ ln -s /config/vfio/$bus_type.name/$groupidC /config/vfio/$bus_type.name/my_group/given_group

This is not the only option.  We could also do:

cd /config/vfio
mkdir new_predomain
echo $groupid > new_predomain/addgroup
chown $user /dev/vfio/new_predomain

This is assuming that configuration of predomains is a root only
operation, which seems reasonable to me.

> (linking has to look at the permissions of the target as well as the
> link name)

Which would be unexpected and therefore a bad idea.

> Now we've introduced that we have ownership of configfs entries, what
> does that imply about the char dev entries?  For instance, can $userA
> own /dev/vfio/$bus_type.name/$groupidA, but $userB own the configfs
> file?  We also have another security consideration that an exploit on
> the host might allow a 3rd party to insert a device into a group.
> 
> This is where I start to get lost in the complexity versus simply giving
> the user permissions for the char dev and allowing them to stick groups
> together so long as they have permissions for the group.
> 
> We also add an entire filesystem to the interface that already spans
> sysfs, dev, eventfds and potentially netlink.
> 
> If terminology is the complaint against the {UN}MERGE ioctl interface,
> I'm still not sold that configfs is the answer.  /me goes to the
> thesaurus... amalgamate? blend? combine? cement? unite? join?

A thesaurus won't help; my point is that you want something with a
*different* meaning to merge, which implies a symmetry not present in
this operation.

[snip]
> > Hrm.  "ANY" just really bothers me because "any iova" is not as clear
> > a concept as it first appears.  For starters it's actually "any page
> > aligned" at the very least.  But then it's only any 64-bit address for
> > busses which have full 64-bit addressing (and I do wonder if there are
> > any north bridges out there that forgot to implement some of the upper
> > PCI address bits properly, given that 64-bit CPUs rarely actually
> > implement more than 40-something physical address bits in practice).
> > 
> > I'd prefer to see at least something to advertise min and max IOVA and
> > IOVA alignment.  That's enough to cover x86 and POWER, including
> > possible variants with an IOMMU page size different to the system page
> > size (note that POWER kernels can have 64k pages as a config option,
> > which means a TCE page size different to the system page size is quite
> > common).
> > 
> > Obviously there could be more complex constraints that we would need
> > to advertise with option bits.
> 
> x86 has limitations as well.   I don't think most x86 IOMMUs support a
> full 64bit IOVA space, so point taken.
> 
> struct vfio_iommu_info {
> 	__u64	len;	/* or structlen/arglen */
> 	__u64	flags;	/* replaces VFIO_IOMMU_GET_FLAGS, none defined yet */
> 	__u64	iova_max;
> 	__u64	iova_min;
> 	__u64	granularity;
> };
> 	
> #define VFIO_IOMMU_GET_INFO              _IOR(';', xxx, struct vfio_iommu_info)

Yeah, this looks like what I was after.

> Is granularity the minimum granularity, typically PAGE_SIZE barring
> special configurations noted above, or is it a bitmap of supported
> granularities?  Ex. If we support 4k normal pages and 2M large pages, we
> might set bits 12 and 21.

Just minimum, I think.  I'd prefer 'alignment' to 'granularity' I
think, but I don't care that much.

> > > > > +
> > > > > +The (UN)MAP_DMA commands make use of struct vfio_dma_map for mapping
> > > > > +and unmapping IOVAs to process virtual addresses:
> > > > > +
> > > > > +struct vfio_dma_map {
> > > > > +        __u64   len;            /* length of structure */
> > > > 
> > > > Thanks for adding these structure length fields.  But I think they
> > > > should be called something other than 'len', which is likely to be
> > > > confused with size (or some other length that's actually related to
> > > > the operation's parameters).  Better to call it 'structlen' or
> > > > 'argslen' or something.
> > > 
> > > Ok.  As Scott noted, I've failed to implement these in a way that
> > > actually allows extension, but I'll work on it.
> > 
> > Right.  I had failed to realise quite how the encoding of structure
> > size into the ioctl worked.  With that in place, arguably we don't
> > really need the size in the structure itself, because we can still
> > have multiple sized versions of the ioctl.  Still, whichever.
> 
> Hmm, that might be cleaner than eliminating the size with just using
> _IO().  So we might have something like:
> 
> #define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
> #define VFIO_IOMMU_MAP_DMA_V2           _IOWR(';', 106, struct vfio_dma_map_v2)
> 
> For which the driver might do:
> 
> case VFIO_IOMMU_MAP_DMA:
> case VFIO_IOMMU_MAP_DMA_V2:
> {
> 	struct vfio_dma_map map;
> 
> 	/* We don't care about the extra v2 bits */
> 	if (copy_from_user(&map, (void __user *)arg, sizeof map))
> 		return -EFAULT;
> ...
> 
> That presumes v2 is compatible other than extra fields.

Right, as does having the length in the structure itself.

> Any objections
> (this gets rid of length from all ioctl passed structs)?

Not from here.

[snip]
> > No, my concept was that NONE would be used for the indexes where there
> > is no valid BAR.  I'll buy your argument on why not to include the PCI
> > (or whatever) address space type here.
> > 
> > What I'm just a bit concerned by is whether we could have a case (not
> > for PCI) of a real resource that still has size 0 - e.g. maybe some
> > sort of doorbell that can't be read or written, but can be triggered
> > some other way.  I guess that's probably unlikely though.
> 
> Right, and if somehow you had such a region where the size is zero, but
> allowed some kind of operation on it, we could define a flag for it.

Hrm, I guess.

[snip]
> > > Other devices could certainly have multiple level interrupts, and if
> > > grouping them as we do with MSI on PCI makes sense, please let me know.
> > > I just didn't see the value in making the unmask operations handle
> > > sub-indexes if it's not needed.
> > 
> > I don't know of anything off hand.  But I can't see any consideration
> > that would make it unlikely either.  I generally don't trust anything
> > *not* to exist in embedded space.
> 
> Fair enough.  Level IRQs are still triggered individually, so unmasking
> is too, which means UNMASK_IRQ takes something like { int index; int
> subindex }.
> 
> SET_UNMASK_IRQ_EVENTFDS should follow SET_IRQ_EVENTFDS and take { int
> index; int count; int fds[] }.

Ok.

[snip]
> > > > Use of "read" and "write" in DMA can often be confusing, since it's
> > > > not always clear if you're talking from the perspective of the CPU or
> > > > the device (_writing_ data to a device will usually involve it doing
> > > > DMA _reads_ from memory).  It's often best to express things as DMA
> > > > direction, 'to device', and 'from device' instead.
> > > 
> > > Good point.
> > 
> > This, of course, potentially affects many areas of the code and doco.
> 
> I've changed vfio_iommu to use <linux/dma-direction.h> definitions
> internally.  For the ioctl I've so far simply included WRITE and READ
> flags, which I can clarify are from the device perspective.  Flags like
> VFIO_DMA_MAP_FLAG_TO_DEVICE/FROM_DEVICE are actually more confusing to
> me at this interface level.  We also have IOMMU_READ/IOMMU_WRITE which
> makes me question using dma-direction.h and if we shouldn't just define
> everything as from the device perspective.

Ok, sounds like a good start.  In some contexts read/write are clear,
in others they're not.  Just something to keep in mind.

[snip]
> > Right, but I'm not just talking about the current map/unmap calls
> > themselves.  This infrastructure for tracking it looks like it's
> > intended to be generic for all mapping methods.  If not, I can't see
> > the reason for it, because I don't think the current interface
> > requires such tracking inherently.
> 
> It does seem that way, but there is a purpose.  We need to unmap
> everything on release.  It's easy to assume that iommu_domain_free()
> will unmap everything from the IOMMU, which it does, but we've also done
> a get_user_pages on each of those in vfio, which we need to cleanup.  We
> can't rely on userspace to do this since they might have been SIGKILL'd.
> Making it generic with coalescing of adjacent regions and such is
> primarily for space efficiency.


Ah, I see.  Much as generic infrastructure is nice when we can do it,
I think this consideration will have to be pushed down to the iommu
driver layer.  For e.g. on power, we have all the information we need
to do the page tracking; any write to a TCE put()s the page that was
previously in that entry (if any) as well as get()ing the one that's
going in (if any).  It's just that we don't want to keep track in this
generic data structure _as well as_ the one that's natural for the
hardware.

> <snip>
> > > > > +#ifdef CONFIG_COMPAT
> > > > > +static long vfio_iommu_compat_ioctl(struct file *filep,
> > > > > +				    unsigned int cmd, unsigned long arg)
> > > > > +{
> > > > > +	arg = (unsigned long)compat_ptr(arg);
> > > > > +	return vfio_iommu_unl_ioctl(filep, cmd, arg);
> > > > 
> > > > Um, this only works if the structures are exactly compatible between
> > > > 32-bit and 64-bit ABIs.  I don't think that is always true.
> > > 
> > > I think all our structure sizes are independent of host width.  If I'm
> > > missing something, let me know.
> > 
> > Ah, for structures, that might be true.  I was seeing the bunch of
> > ioctl()s that take ints.
> 
> Ugh, I suppose you're thinking of an ILP64 platform with ILP32 compat
> mode.  Darn it, guess we need to make everything 64bit, including file
> descriptors.

Well, we don't _have_ to, but if we don't then we have to implement
compat wrappers for every non explicit width thing we pass through.

> <snip>
> > > > > +
> > > > > +/* Get a new iommu file descriptor.  This will open the iommu, setting
> > > > > + * the current->mm ownership if it's not already set. */
> > > > 
> > > > I know I've had this explained to me several times before, but I've
> > > > forgotten again.  Why do we need to wire the iommu to an mm?
> > > 
> > > We're mapping process virtual addresses into the IOMMU, so it makes
> > > sense to restrict ourselves to a single virtual address space.  It also
> > > enforces the ownership, that only a single mm is in control of the
> > > group.
> > 
> > Neither of those seems conclusive to me, but I remember that I saw a
> > strong reason earlier, even if I can't remember it now.
> 
> The point of the group is to provide a unit of ownership.  We can't let
> $userA open $groupid and fetch a device, then have $userB do the same,
> grabbing a different device.  The mappings will step on each other and
> the devices have no isolation.  We can't restrict that purely by file
> permissions or we'll have the same problem with sudo.  At one point we
> discussed a single open instance, but that unnecessarily limits the
> user, so we settled on the mm.  Thanks,

Hm, ok.

Fyi, I'll be kind of slow in responses for the next while.  I broke a
bone in my hand on Friday :(.
Alex Williamson Nov. 22, 2011, 6:22 p.m. UTC | #37
On Mon, 2011-11-21 at 13:47 +1100, David Gibson wrote:
> On Fri, Nov 18, 2011 at 01:32:56PM -0700, Alex Williamson wrote:
> > On Thu, 2011-11-17 at 11:02 +1100, David Gibson wrote:
> > > On Tue, Nov 15, 2011 at 11:01:28AM -0700, Alex Williamson wrote:
> > > > On Tue, 2011-11-15 at 17:34 +1100, David Gibson wrote:
> > > > > On Thu, Nov 03, 2011 at 02:12:24PM -0600, Alex Williamson wrote:
<snip> 
> > > > As we've discussed previously, configfs provides part of this, but has
> > > > no ioctl support.  It doesn't make sense to me to go play with groups in
> > > > configfs, but then still interact with them via a char dev.
> > > 
> > > Why not?  You configure, say, loopback devices with losetup, then use
> > > them as a block device.  Similar with nbd.  You can configure serial
> > > devices with setserial, then use them as a char dev.
> > > 
> > > >  It also
> > > > splits the ownership model 
> > > 
> > > I'm not even sure what that means.
> > > 
> > > > and makes it harder to enforce who gets to
> > > > interact with the devices vs who gets to manipulate groups.
> > > 
> > > How so.
> > 
> > Let's map out what a configfs interface would look like, maybe I'll
> > convince myself it's on the table.  We'd probably start with
> 
> Hrm, assuming we used configfs, which is not the only option.

I'm not writing vfiofs; configfs seems most like what we'd need.  If
there are others we should consider, please note them.

> > /config/vfio/$bus_type.name/
> > 
> > That would probably be pre-populated with a bunch of $groupid files,
> > matching /dev/vfio/$bus_type.name/$groupid char dev files (assuming
> > configfs can pre-populate files).  To make a user defined group, we
> > might then do:
> > 
> > mkdir /config/vfio/$bus_type.name/my_group
> > 
> > That would generate a /dev/vfio/$bus_type.name/my_group char dev.  To
> > add groups to the new my_group "super group", we'd need to do something
> > like:
> > 
> > ln -s /config/vfio/$bus_type.name/$groupidA /config/vfio/$bus_type.name/my_group/nic_group
> > 
> > I might then add a second group as:
> > 
> > ln -s /config/vfio/$bus_type.name/$groupidB /config/vfio/$bus_type.name/my_group/hba_group
> > 
> > Either link could fail if the target group is not viable,
> 
> The link op shouldn't fail because the subgroup isn't viable.
> Instead, the supergroup just won't be viable until all devices in all
> subgroups are bound to vfio.

The supergroup may already be in use if it's a hotplug.  What does it
mean to have an incompatible group linked into the supergroup?  When
does the subgroup actually become part of the supergroup?  Does the
userspace driver using the supergroup get notified somehow?  Does the
vfio driver get notified independently?  This example continues to show
what an administration nightmare it becomes when we split management
from usage.

> > the group is
> > already in use, or the second link could fail if the iommu domains were
> > incompatible.
> > 
> > Do these links cause /dev/vfio/$bus_type.name/{$groupidA,$groupidB} to
> > disappear?  If not, do we allow them to be opened?  Linking would also
> > have to fail if we later tried to link one of these groupids to a
> > different super group.
> 
> Again, I think some confusion is coming in here from calling both the
> hardware determined thing and the admin determined thing a "group".
> So for now I'm going to call the first a "group" and the second a
> "predomain" (because once it's viable and the right conditions are set
> up it will become an iommu domain).
> 
> So another option is that "groups" *only* participate in the merging
> interface; getting iommu and device handles occurs only on a
> predomain.  Therefore there would be no /dev/vfio/$group, you would
> have to configure a predomain with at least one group before you had a
> device file.

I think this actually leads to a more complicated, more difficult to use
interface that interposes an unnecessary administration layer into a
driver's decisions about how to manage the iommu.

> > Now we want to give my_group to a user, so we have to go back to /dev
> > and
> > 
> > chown $user /dev/vfio/$bus_type.name/my_group
> > 
> > At this point my_group would have the existing set of group ioctls sans
> > {UN}MERGE, of course.
> > 
> > So $user can use the super group, but not manipulate its members.  Do
> > we then allow:
> > 
> > chown $user /config/vfio/$bus_type.name/my_group
> > 
> > If so, what does it imply about the user then doing:
> > 
> > ln -s /config/vfio/$bus_type.name/$groupidC /config/vfio/$bus_type.name/my_group/stolen_group
> > 
> > Would we instead need to chown the configfs groups as well as the super
> > group?
> > 
> > chown $user /config/vfio/$bus_type.name/my_group
> > chown $user /config/vfio/$bus_type.name/$groupidA
> > chown $user /config/vfio/$bus_type.name/$groupidB
> > 
> > ie:
> > 
> > # chown $user:$user /config/vfio/$bus_type.name/$groupC
> > $ ln -s /config/vfio/$bus_type.name/$groupidC /config/vfio/$bus_type.name/my_group/given_group
> 
> This is not the only option.  We could also do:
> 
> cd /config/vfio
> mkdir new_predomain
> echo $groupid > new_predomain/addgroup
> chown $user /dev/vfio/new_predomain

echo $groupid > new_predomain/delgroup
SEGV... Now we've included yet another admin path in the hotplug case as
the userspace driver needs to coordinate removal of groups with some
other entity.

> This is assuming that configuration of predomains is a root only
> operation, which seems reasonable to me.

I think it should be a driver decision.  Let's go back to the purpose of
this interface.  We want to give *devices* to userspace drivers.  Groups
are an unfortunate side-effect of hardware topology, so instead of
giving the user a device, we give it a group that contains the device.
It's a driver optimization that they can say "oh, I wonder if I can use
the same iommu descriptor to drive both of these, let me try to merge
them...".  That results in "worked, yay" skip initializing a new iommu
object OR "nope, oh well".  Adding an admin layer that presupposes that
they should be merged and does it adds nothing for the better.

> > (linking has to look at the permissions of the target as well as the
> > link name)
> 
> Which would be unexpected and therefore a bad idea.

Another indication that this is the wrong interface.

> > Now we've introduced that we have ownership of configfs entries, what
> > does that imply about the char dev entries?  For instance, can $userA
> > own /dev/vfio/$bus_type.name/$groupidA, but $userB own the configfs
> > file?  We also have another security consideration that an exploit on
> > the host might allow a 3rd party to insert a device into a group.
> > 
> > This is where I start to get lost in the complexity versus simply giving
> > the user permissions for the char dev and allowing them to stick groups
> > together so long as they have permissions for the group.
> > 
> > We also add an entire filesystem to the interface that already spans
> > sysfs, dev, eventfds and potentially netlink.
> > 
> > If terminology is the complaint against the {UN}MERGE ioctl interface,
> > I'm still not sold that configfs is the answer.  /me goes to the
> > thesaurus... amalgamate? blend? combine? cement? unite? join?
> 
> A thesaurus won't help, my point is you want something with a
> *different* meaning to merge, which implies a symmetry not present in
> this operation.

But there is symmetry in a merged group; let's look at the operations on
a group (note I've updated some of the ioctls since last posting):

VFIO_GROUP_GET_INFO

        This returns a structure containing flags for the group, when
        merged it represents the merged group.

VFIO_GROUP_GET_DEVICE_FD

        This returns a file descriptor for the device described by the
        given char*, when merged it operates across all groups within
        the merged set.

VFIO_GROUP_GET_IOMMU_FD

        Return a file descriptor for the iommu, when merged there's a
        single iommu across the merged group.

VFIO_GROUP_MERGE

        Pull a singleton group into a merge.  This can be called on any
        member of a merged group to pull a singleton group into the
        merged set.

VFIO_GROUP_UNMERGE

        Extract the group from the merged set.
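
For reference, these might be declared along the following lines (a
sketch only; the numbers and argument conventions are placeholders, not
the updated patch):

#define VFIO_GROUP_GET_INFO		_IO(';', 100)	/* struct vfio_group_info */
#define VFIO_GROUP_GET_DEVICE_FD	_IO(';', 101)	/* arg: device name string */
#define VFIO_GROUP_GET_IOMMU_FD		_IO(';', 102)	/* returns iommu fd */
#define VFIO_GROUP_MERGE		_IO(';', 103)	/* arg: fd of singleton group */
#define VFIO_GROUP_UNMERGE		_IO(';', 104)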

Where is the discontinuity with calling this symmetric?  Is it simply
that we have an entry point to the supergroup at each subgroup?  Forming
a new node when groups are merged is a limitation, not a feature, and
imposes a number of administration issues (ownership, creation,
deletion, addition, subtraction, notifications, etc).  Is it that we can
only merge singletons?  This is an implementation restriction, not an
API restriction.  If you want to go to the trouble of determining that
the existing IOMMU mappings are compatible and can atomically merge
them, the singleton could instead be a member of another supergroup.  We
currently can't do this atomically, and as merging is an optimization, I
leave the burden on userspace to split supergroups if they want to merge
with another group.

I'm not sure why this is such a thorn since, as I understand the power
iommu topology, you're going to have IOVA windows per group that really
can't make use of the merge interface.  This is mostly useful for "MAP_ANY"
style IOMMUs.  Do you really want to impose the administrative overhead
of predomains for a feature you're not likely to use?

<snip>
> [snip]
> > > Right, but I'm not just talking about the current map/unmap calls
> > > themselves.  This infrastructure for tracking it looks like it's
> > > intended to be generic for all mapping methods.  If not, I can't see
> > > the reason for it, because I don't think the current interface
> > > requires such tracking inherently.
> > 
> > It does seem that way, but there is a purpose.  We need to unmap
> > everything on release.  It's easy to assume that iommu_domain_free()
> > will unmap everything from the IOMMU, which it does, but we've also done
> > a get_user_pages on each of those in vfio, which we need to cleanup.  We
> > can't rely on userspace to do this since they might have been SIGKILL'd.
> > Making it generic with coalescing of adjacent regions and such is
> > primarily for space efficiency.
> 
> 
> Ah, I see.  Much as generic infrastructure is nice when we can do it,
> I think this consideration will have to be pushed down to the iommu
> driver layer.  For e.g. on power, we have all the information we need
> to do the page tracking; any write to a TCE put()s the page that was
> previously in that entry (if any) as well as get()ing the one that's
> going in (if any).  It's just that we don't want to keep track in this
> generic data structure _as well as_ the one that's natural for the
> hardware.

There are few users of the IOMMU API; maybe we can negotiate this.  I
also expect as power gets added, we'll need to make the vfio_iommu layer
more modular.  It's possible you won't make use of this iommu object and
can leave page tracking to the iommu.  I think that can be done within
the existing API though.  Thanks,

Alex
Alex Williamson Nov. 22, 2011, 7:16 p.m. UTC | #38
On Fri, Nov 18, 2011 at 2:09 PM, Scott Wood <scottwood@freescale.com> wrote:
> On Fri, Nov 18, 2011 at 01:32:56PM -0700, Alex Williamson wrote:
>> Hmm, that might be cleaner than eliminating the size with just using
>> _IO().  So we might have something like:
>>
>> #define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
>> #define VFIO_IOMMU_MAP_DMA_V2           _IOWR(';', 106, struct vfio_dma_map_v2)
>>
>> For which the driver might do:
>>
>> case VFIO_IOMMU_MAP_DMA:
>> case VFIO_IOMMU_MAP_DMA_V2:
>> {
>>       struct vfio_dma_map map;
>>
>>       /* We don't care about the extra v2 bits */
>>       if (copy_from_user(&map, (void __user *)arg, sizeof map))
>>               return -EFAULT;
>
> That won't work if you have an old kernel that doesn't know about v2, and
> a new user that uses v2.  To make this work you'd have to strip out the
> size from the ioctl number before switching (but still use it when
> considering whether to access the v2 fields).  Simpler to just leave it
> out of the ioctl number and put it in the struct field as currently
> planned.

Ok, _IO for all ioctls passing structs then.
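
i.e. something like (sketch):

#define VFIO_IOMMU_MAP_DMA	_IO(';', 106)	/* struct vfio_dma_map, size via its length field */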

>> > > I think all our structure sizes are independent of host width.  If I'm
>> > > missing something, let me know.
>> >
>> > Ah, for structures, that might be true.  I was seeing the bunch of
>> > ioctl()s that take ints.
>>
>> Ugh, I suppose you're thinking of an ILP64 platform with ILP32 compat
>> mode.
>
> Does Linux support ILP64?  There are "int" ioctls all over the place, and
> I don't think we do compat wrappers for them.  In fact, some of the
> ioctls in linux/fs.h use "int" for the compatible version of ioctls
> originally defined as "long".
>
> It's cleaner to always use the fixed types, though.

I've updated anything that passes data to use a structure and will
make use of __s32 in place of ints.  If there ever exists an ILP64
system, we can use a flag bit of the structure to indicate 64bit file
descriptor support.

>> Darn it, guess we need to make everything 64bit, including file
>> descriptors.
>
> What's wrong with __u32/__s32 (or uint32_t/int32_t)?
>
> I really do not see Linux supporting an ABI that has no 32-bit type at
> all, especially in a situation where userspace compatibility is needed.
> If that does happen, the ABI breakage will go well beyond VFIO.

Yep, I think the structs fix this and still leave room for the impossible.

>> The point of the group is to provide a unit of ownership.  We can't let
>> $userA open $groupid and fetch a device, then have $userB do the same,
>> grabbing a different device.  The mappings will step on each other and
>> the devices have no isolation.  We can't restrict that purely by file
>> permissions or we'll have the same problem with sudo.
>
> What is the problem with sudo?  If you're running processes as the same
> user, regardless of how, they're going to be able to mess with each
> other.

Just trying to indicate that file permissions are easy to bypass and
privileged users can inadvertently do stupid stuff.  Kind of like
request_region() in the kernel.   Kernel drivers are privileged, but
we still want to enforce an owner of that region.  VFIO extends the
ownership of a device to a single entity in userspace.  How do we
identify that entity and keep others out?

> Is it possible to expose access to only specific groups via an
> individually-permissionable /dev/device, so only the entity handing out
> access to devices needs access to everything?

Yes, that's fundamental to vfio.  vfio-bus drivers enumerate devices
to the vfio-core.  Privileged users bind devices to the vfio-bus
driver creating viable groups.  Groups are represented as chardevs
under /dev/vfio.  If a user has permission to access the chardev, they
have the ability to use the devices.  Once they get a device or iommu
descriptor the group is tied to them via the struct mm and only they
are permitted to access the other devices in the group.
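
(A rough userspace sketch of that flow, using the ioctl names from the
RFC's Documentation/vfio.txt and assuming its linux/vfio.h is available to
userspace; the group number and device name are made-up placeholders.)

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int main(void)
{
	int group, iommu, device;
	__u64 flags;

	/* Permission on the group chardev is the coarse gate. */
	group = open("/dev/vfio/26", O_RDWR);
	if (group < 0)
		return 1;

	/* The group must be viable, i.e. every member device bound to
	 * the vfio bus driver, before it can be used. */
	if (ioctl(group, VFIO_GROUP_GET_FLAGS, &flags) < 0 ||
	    !(flags & VFIO_GROUP_FLAGS_VIABLE))
		return 1;

	/* The first of these calls ties the group to this mm; a second
	 * user with a different mm can no longer grab its devices. */
	iommu = ioctl(group, VFIO_GROUP_GET_IOMMU_FD);
	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");

	return (iommu < 0 || device < 0) ? 1 : 0;
}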

>> At one point we discussed a single open instance, but that
>> unnecessarily limits the user, so we settled on the mm.  Thanks,
>
> It would be nice if this limitation weren't excessively integrated into
> the design -- in the embedded space we've got unusual partitioning
> setups, including failover arrangements where partitions share devices.
> The device may be configured with the IOMMU pointing only at regions that
> are shared by both mms, or the non-shared regions may be reconfigured as
> active ownership of the device gets handed around.
>
> It would be up to userspace code to make sure that the mappings don't
> "step on each other".  The mapping could be done with whichever mm issued
> the map call for a given region.
>
> For this use case, there is unlikely to be an issue with ownership
> because there will not be separate privilege domains creating partitions
> -- other use cases could refrain from enabling multiple-mm support unless
> ownership issues are resolved.
>
> This doesn't need to be supported initially, but we should try to avoid
> letting the assumption permeate the code.

So I'm hearing "we want to use this driver you're developing that's
centered around using the iommu to securely provide access to a device
from userspace, but can we do it without the iommu and can we loosen
up the security a bit?"  Is that about right?  ;)  Thanks,

Alex
Scott Wood Nov. 22, 2011, 8 p.m. UTC | #39
On 11/22/2011 01:16 PM, Alex Williamson wrote:
> On Fri, Nov 18, 2011 at 2:09 PM, Scott Wood <scottwood@freescale.com> wrote:
>> On Fri, Nov 18, 2011 at 01:32:56PM -0700, Alex Williamson wrote:
>>> Ugh, I suppose you're thinking of an ILP64 platform with ILP32 compat
>>> mode.
>>
>> Does Linux support ILP64?  There are "int" ioctls all over the place, and
>> I don't think we do compat wrappers for them.  In fact, some of the
>> ioctls in linux/fs.h use "int" for the compatible version of ioctls
>> originally defined as "long".
>>
>> It's cleaner to always use the fixed types, though.
> 
> I've updated anything that passes data to use a structure 

That's a bit extreme...

> and will make use of __s32 in place of ints.  If there ever exists an ILP64
> system, we can use a flag bit of the structure to indicate 64bit file
> descriptor support.

If we end up supporting an ABI where compatibility between user and
kernel is broken even when we use fixed-size types and are careful about
alignment, we'll need a compat wrapper, and we'll know what ABI
userspace is supposed to be using.  I'm not sure how a flag would help.

>>> The point of the group is to provide a unit of ownership.  We can't let
>>> $userA open $groupid and fetch a device, then have $userB do the same,
>>> grabbing a different device.  The mappings will step on each other and
>>> the devices have no isolation.  We can't restrict that purely by file
>>> permissions or we'll have the same problem with sudo.
>>
>> What is the problem with sudo?  If you're running processes as the same
>> user, regardless of how, they're going to be able to mess with each
>> other.
> 
> Just trying to indicate that file permissions are easy to bypass and
> privileged users can inadvertently do stupid stuff.

Preventing stupid stuff can also prevent useful stuff.  Security and
accident-avoidance are different things.  "We can't let" is the domain
of the former.

> Kind of like request_region() in the kernel.   Kernel drivers are privileged, but
> we still want to enforce an owner of that region.  VFIO extends the
> ownership of a device to a single entity in userspace.  How do we
> identify that entity and keep others out?

That's fine as long as it's an optional safeguard that can be turned off
if needed.  Maybe require userspace to set a flag via some mechanism to
indicate it's opening the device in shared mode.

>> It would be nice if this limitation weren't excessively integrated into
>> the design -- in the embedded space we've got unusual partitioning
>> setups, including failover arrangements where partitions share devices.
>> The device may be configured with the IOMMU pointing only at regions that
>> are shared by both mms, or the non-shared regions may be reconfigured as
>> active ownership of the device gets handed around.
>>
>> It would be up to userspace code to make sure that the mappings don't
>> "step on each other".  The mapping could be done with whichever mm issued
>> the map call for a given region.
>>
>> For this use case, there is unlikely to be an issue with ownership
>> because there will not be separate privilege domains creating partitions
>> -- other use cases could refrain from enabling multiple-mm support unless
>> ownership issues are resolved.
>>
>> This doesn't need to be supported initially, but we should try to avoid
>> letting the assumption permeate the code.
> 
> So I'm hearing "we want to use this driver you're developing that's
> centered around using the iommu to securely provide access to a device
> from userspace, but can we do it without the iommu and can we loosen
> up the security a bit?"  Is that about right?  ;)  Thanks,

We have a variety of use cases for userspace and KVM-guest access to
devices.  Some of those involve an iommu, some don't.  Some involve
shared ownership (which isn't necessarily a loosening of security --
there's still an iommu, and access control on the vfio group), some
don't.  Some don't involve DMA at all.  I see no reason to have entirely
separate kernel mechanisms for these use cases.

I'm not asking you to implement any of this, just hoping you'll keep
such flexibility in mind when deciding on fundamental assumptions that
the code and API are to make.

-Scott
Alex Williamson Nov. 22, 2011, 9:28 p.m. UTC | #40
On Tue, 2011-11-22 at 14:00 -0600, Scott Wood wrote:
> On 11/22/2011 01:16 PM, Alex Williamson wrote:
> > On Fri, Nov 18, 2011 at 2:09 PM, Scott Wood <scottwood@freescale.com> wrote:
> >> On Fri, Nov 18, 2011 at 01:32:56PM -0700, Alex Williamson wrote:
> >>> Ugh, I suppose you're thinking of an ILP64 platform with ILP32 compat
> >>> mode.
> >>
> >> Does Linux support ILP64?  There are "int" ioctls all over the place, and
> >> I don't think we do compat wrappers for them.  In fact, some of the
> >> ioctls in linux/fs.h use "int" for the compatible version of ioctls
> >> originally defined as "long".
> >>
> >> It's cleaner to always use the fixed types, though.
> > 
> > I've updated anything that passes data to use a structure 
> 
> That's a bit extreme...

Ok, I lied, it's not everything.  I have consolidated some GET_FLAGS and
GET_NUM_* calls into generic GET_INFO ioctls so we have more
flexibility.  I think the structures make sense there.  I'm not as
convinced on the eventfd and irq unmask structures, but who knows, they
might save us some day.

Here's where I stand on the API definitions; maybe we can get some
agreement on this before diving into the semantics of the documentation
or implementation, though it still includes the merge interface.
Thanks,

Alex

/*
 * VFIO API definition
 *
 * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
 * 	Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#ifndef VFIO_H
#define VFIO_H

#include <linux/types.h>

#ifdef __KERNEL__	/* Internal VFIO-core/bus driver API */

/**
 * struct vfio_device_ops - VFIO bus driver device callbacks
 *
 * @match: Return true if buf describes the device
 * @open: Called when userspace receives file descriptor for device
 * @release: Called when userspace releases file descriptor for device
 * @read: Perform read(2) on device file descriptor
 * @write: Perform write(2) on device file descriptor
 * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_*
 *         operations documented below
 * @mmap: Perform mmap(2) on a region of the device file descriptor
 */
struct vfio_device_ops {
	bool	(*match)(struct device *dev, const char *buf);
	int	(*open)(void *device_data);
	void	(*release)(void *device_data);
	ssize_t	(*read)(void *device_data, char __user *buf,
			size_t count, loff_t *ppos);
	ssize_t	(*write)(void *device_data, const char __user *buf,
			 size_t count, loff_t *size);
	long	(*ioctl)(void *device_data, unsigned int cmd,
			 unsigned long arg);
	int	(*mmap)(void *device_data, struct vm_area_struct *vma);
};

/**
 * vfio_group_add_dev() - Add a device to the vfio-core
 *
 * @dev: Device to add
 * @ops: VFIO bus driver callbacks for device
 *
 * This registration makes the VFIO core aware of the device, creates
 * groups objects as required and exposes chardevs under /dev/vfio.
 *
 * Return 0 on success, errno on failure.
 */
extern int vfio_group_add_dev(struct device *dev,
			      const struct vfio_device_ops *ops);

/**
 * vfio_group_del_dev() - Remove a device from the vfio-core
 *
 * @dev: Device to remove
 *
 * Remove a device previously added to the VFIO core, removing groups
 * and chardevs as necessary.
 */
extern void vfio_group_del_dev(struct device *dev);

/**
 * vfio_bind_dev() - Indicate device is bound to the VFIO bus driver and
 *                   register private data structure for ops callbacks.
 *
 * @dev: Device being bound
 * @device_data: VFIO bus driver private data
 *
 * This registration indicates that a device previously registered with
 * vfio_group_add_dev() is now available for use by the VFIO core.  When
 * all devices within a group are available, the group is viable and may
 * be used by userspace drivers.  Typically called from VFIO bus driver
 * probe function.
 *
 * Return 0 on success, errno on failure
 */
extern int vfio_bind_dev(struct device *dev, void *device_data);

/**
 * vfio_unbind_dev() - Indicate device is unbinding from VFIO bus driver
 *
 * @dev: Device being unbound
 *
 * De-registration of the device previously registered with vfio_bind_dev()
 * from VFIO.  Upon completion, the device is no longer available for use by
 * the VFIO core.  Typically called from the VFIO bus driver remove function.
 * The VFIO core will attempt to release the device from users and may take
 * measures to free the device and/or block as necessary.
 *
 * Returns pointer to private device_data structure registered with
 * vfio_bind_dev().
 */
extern void *vfio_unbind_dev(struct device *dev);

#endif /* __KERNEL__ */

/* Kernel & User level defines for VFIO IOCTLs. */

/*
 * The IOCTL interface is designed for extensibility by embedding the
 * structure length (argsz) and flags into structures passed between
 * kernel and userspace.  We therefore use the _IO() macro for these
 * defines to avoid implicitly embedding a size into the ioctl request.  
 * As structure fields are added, argsz will increase to match and flag
 * bits will be defined to indicate additional fields with valid data.
 * It's *always* the caller's responsibility to indicate the size of
 * the structure passed by setting argsz appropriately.
 */

#define VFIO_TYPE	';'
#define VFIO_BASE	100

/* --------------- IOCTLs for GROUP file descriptors --------------- */

/**
 * VFIO_GROUP_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 0, struct vfio_group_info)
 *
 * Retrieve information about the group.  Fills in provided
 * struct vfio_group_info.  Caller sets argsz.
 */
struct vfio_group_info {
	__u32	argsz;
	__u32	flags;
#define VFIO_GROUP_FLAGS_VIABLE		(1 << 0)
#define VFIO_GROUP_FLAGS_MM_LOCKED	(1 << 1)
};

#define VFIO_GROUP_GET_INFO		_IO(VFIO_TYPE, VFIO_BASE + 0)

/**
 * VFIO_GROUP_MERGE - _IOW(VFIO_TYPE, VFIO_BASE + 1, __s32)
 *
 * Merge group indicated by passed file descriptor into current group.
 * Current group may be in use, group indicated by file descriptor
 * cannot be in use (no open iommu or devices).
 */
#define VFIO_GROUP_MERGE		_IOW(VFIO_TYPE, VFIO_BASE + 1, __s32)

/**
 * VFIO_GROUP_UNMERGE - _IO(VFIO_TYPE, VFIO_BASE + 2)
 *
 * Remove the current group from a merged set.  The current group cannot
 * have any open devices.
 */
#define VFIO_GROUP_UNMERGE		_IO(VFIO_TYPE, VFIO_BASE + 2)

/**
 * VFIO_GROUP_GET_IOMMU_FD - _IO(VFIO_TYPE, VFIO_BASE + 3)
 *
 * Return a new file descriptor for the IOMMU object.  The IOMMU object
 * is shared among members of a merged group.
 */
#define VFIO_GROUP_GET_IOMMU_FD		_IO(VFIO_TYPE, VFIO_BASE + 3)

/**
 * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 4, char)
 *
 * Return a new file descriptor for the device object described by
 * the provided char array.
 */
#define VFIO_GROUP_GET_DEVICE_FD	_IOW(VFIO_TYPE, VFIO_BASE + 4, char)


/* --------------- IOCTLs for IOMMU file descriptors --------------- */

/**
 * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 5, struct vfio_iommu_info)
 *
 * Retrieve information about the IOMMU object.  Fills in provided
 * struct vfio_iommu_info.  Caller sets argsz.
 */
struct vfio_iommu_info {
	__u32	argsz;
	__u32	flags;
	__u64	iova_max;	/* Maximum IOVA address */
	__u64	iova_min;	/* Minimum IOVA address */
	__u64	alignment;	/* Required alignment, often PAGE_SIZE */
};

#define	VFIO_IOMMU_GET_INFO		_IO(VFIO_TYPE, VFIO_BASE + 5)

/**
 * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 6, struct vfio_dma_map)
 *
 * Map or unmap process virtual addresses to IO virtual addresses using
 * the provided struct vfio_dma_map.  Caller sets argsz.
 */
struct vfio_dma_map {
	__u32	argsz;
	__u32	flags;
#define VFIO_DMA_MAP_FLAG_MAP	(1 << 0)	/* Map (1) vs Unmap (0) */
#define VFIO_DMA_MAP_FLAG_READ	(1 << 1)	/* readable from device */
#define VFIO_DMA_MAP_FLAG_WRITE	(1 << 2)	/* writable from device */
	__u64	vaddr;		/* Process virtual address */
	__u64	iova;		/* IO virtual address */
	__u64	size;		/* Size of mapping (bytes) */
};

#define	VFIO_IOMMU_MAP_DMA		_IO(VFIO_TYPE, VFIO_BASE + 6)


/* --------------- IOCTLs for DEVICE file descriptors --------------- */

/**
 * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
 *			       struct vfio_device_info)
 *
 * Retrieve information about the device.  Fills in provided
 * struct vfio_device_info.  Caller sets argsz.
 */
struct vfio_device_info {
	__u32	argsz;
	__u32	flags;
#define VFIO_DEVICE_FLAGS_RESET	(1 << 0)	/* Device supports reset */
	__u32	num_regions;	/* Max region index + 1 */
	__u32	num_irqs;	/* Max IRQ index + 1 */
};

#define VFIO_DEVICE_GET_INFO		_IO(VFIO_TYPE, VFIO_BASE + 7)

/**
 * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
 *				       struct vfio_region_info)
 *
 * Retrieve information about a device region.  Caller provides
 * struct vfio_region_info with index value set.  Caller sets argsz.
 */
struct vfio_region_info {
	__u32	argsz;
	__u32	flags;
#define VFIO_REGION_INFO_FLAG_MMAP	(1 << 0) /* Region supports mmap */
#define VFIO_REGION_INFO_FLAG_RO	(1 << 1) /* Region is read-only */
	__u32	index;		/* Region index */
	__u32	resv;		/* Reserved for alignment */
	__u64	size;		/* Region size (bytes) */
	__u64	offset;		/* Region offset from start of device fd */
};

#define VFIO_DEVICE_GET_REGION_INFO	_IO(VFIO_TYPE, VFIO_BASE + 8)

/**
 * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
 *				    struct vfio_irq_info)
 *
 * Retrieve information about a device IRQ.  Caller provides
 * struct vfio_irq_info with index value set.  Caller sets argsz.
 */
struct vfio_irq_info {
	__u32	argsz;
	__u32	flags;
#define VFIO_IRQ_INFO_FLAG_LEVEL	(1 << 0) /* Level (1) vs Edge (0) */
	__u32	index;		/* IRQ index */
	__u32	count;		/* Number of IRQs within this index */
};

#define VFIO_DEVICE_GET_IRQ_INFO	_IO(VFIO_TYPE, VFIO_BASE + 9)

/**
 * VFIO_DEVICE_SET_IRQ_EVENTFDS - _IOW(VFIO_TYPE, VFIO_BASE + 10,
 *				       struct vfio_irq_eventfds)
 *
 * Set eventfds for IRQs using the struct vfio_irq_eventfds provided.
 * Setting the eventfds also enables the interrupt.  Caller sets argsz.
 */
struct vfio_irq_eventfds {
	__u32	argsz;
	__u32	flags;
	__u32	index;		/* IRQ index */
	__u32	count;		/* Number of eventfds */
	__s32	eventfds[];	/* eventfd for sub-index, -1 to unset */
};

#define VFIO_DEVICE_SET_IRQ_EVENTFDS	_IO(VFIO_TYPE, VFIO_BASE + 10)

/**
 * VFIO_DEVICE_UNMASK_IRQ - _IOW(VFIO_TYPE, VFIO_BASE + 11,
 *				 struct vfio_unmask_irq)
 *
 * Unmask the IRQ described by the provided struct vfio_unmask_irq.
 * Level triggered IRQs are masked when posted to userspace and must
 * be unmasked to re-trigger.  Caller sets argsz.
 */
struct vfio_unmask_irq {
	__u32	argsz;
	__u32	flags;
	__u32	index;		/* IRQ index */
	__u32	subindex;	/* Sub-index to unmask */
};

#define VFIO_DEVICE_UNMASK_IRQ		_IO(VFIO_TYPE, VFIO_BASE + 11)

/**
 * VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 12,
 *					     struct vfio_irq_eventfds)
 *
 * Set eventfds to be used for unmasking IRQs using the provided
 * struct vfio_irq_eventfds.
 */
#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD	_IO(VFIO_TYPE, VFIO_BASE + 12)

/**
 * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 13)
 *
 * Reset a device.
 */
#define VFIO_DEVICE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 13)

#endif /* VFIO_H */
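
(A short userspace sketch of the argsz convention above, against these
proposed definitions; iommu_fd is assumed to come from
VFIO_GROUP_GET_IOMMU_FD, and includes and error handling are omitted.)

	struct vfio_iommu_info info = { .argsz = sizeof(info) };
	struct vfio_dma_map map = { .argsz = sizeof(map) };
	void *buf;

	/* The caller always states how large its structure is; the
	 * kernel never reads or writes fields beyond argsz. */
	ioctl(iommu_fd, VFIO_IOMMU_GET_INFO, &info);

	buf = mmap(NULL, 0x10000, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	map.flags = VFIO_DMA_MAP_FLAG_MAP |
		    VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	map.vaddr = (__u64)(unsigned long)buf;
	map.iova  = 0;			/* arbitrary, within iova_min/iova_max */
	map.size  = 0x10000;
	ioctl(iommu_fd, VFIO_IOMMU_MAP_DMA, &map);

	/* Unmap is the same ioctl with the MAP flag cleared. */
	map.flags &= ~VFIO_DMA_MAP_FLAG_MAP;
	ioctl(iommu_fd, VFIO_IOMMU_MAP_DMA, &map);
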
diff mbox

Patch

diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 54078ed..59d01e4 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -88,6 +88,7 @@  Code  Seq#(hex)	Include File		Comments
 		and kernel/power/user.c
 '8'	all				SNP8023 advanced NIC card
 					<mailto:mcr@solidum.com>
+';'	64-76	linux/vfio.h
 '@'	00-0F	linux/radeonfb.h	conflict!
 '@'	00-0F	drivers/video/aty/aty128fb.c	conflict!
 'A'	00-1F	linux/apm_bios.h	conflict!
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
new file mode 100644
index 0000000..5866896
--- /dev/null
+++ b/Documentation/vfio.txt
@@ -0,0 +1,304 @@ 
+VFIO - "Virtual Function I/O"[1]
+-------------------------------------------------------------------------------
+Many modern systems now provide DMA and interrupt remapping facilities
+to help ensure I/O devices behave within the boundaries they've been
+allotted.  This includes x86 hardware with AMD-Vi and Intel VT-d as
+well as POWER systems with Partitionable Endpoints (PEs) and even
+embedded powerpc systems (technology name unknown).  The VFIO driver
+is an IOMMU/device agnostic framework for exposing direct device
+access to userspace, in a secure, IOMMU protected environment.  In
+other words, this allows safe, non-privileged, userspace drivers.
+
+Why do we want that?  Virtual machines often make use of direct device
+access ("device assignment") when configured for the highest possible
+I/O performance.  From a device and host perspective, this simply turns
+the VM into a userspace driver, with the benefits of significantly
+reduced latency, higher bandwidth, and direct use of bare-metal device
+drivers[2].
+
+Some applications, particularly in the high performance computing
+field, also benefit from low-overhead, direct device access from
+userspace.  Examples include network adapters (often non-TCP/IP based)
+and compute accelerators.  Prior to VFIO, these drivers needed to
+go through the full development cycle to become proper upstream drivers,
+be maintained out of tree, or make use of the UIO framework, which
+has no notion of IOMMU protection, has limited interrupt support, and
+requires root privileges to access things like PCI configuration space.
+
+The VFIO driver framework intends to unify these, replacing the
+KVM PCI-specific device assignment currently in use and providing
+a more secure, more featureful userspace driver environment than UIO.
+
+Groups, Devices, IOMMUs, oh my
+-------------------------------------------------------------------------------
+
+A fundamental component of VFIO is the notion of IOMMU groups.  IOMMUs
+can't always distinguish transactions from each individual device in
+the system.  Sometimes this is because of the IOMMU design, such as with
+PEs; other times it's caused by the I/O topology, for instance a
+PCIe-to-PCI bridge masking all devices behind it.  We call the sets of
+devices created by these restrictions IOMMU groups (or just "groups" for
+this document).
+
+The IOMMU cannot distinguish transactions between the individual devices
+within the group, therefore the group is the basic unit of ownership for
+a userspace process.  Because of this, groups are also the primary
+interface to both devices and IOMMU domains in VFIO.
+
+The VFIO representation of groups is created as devices are added into
+the framework by a VFIO bus driver.  The vfio-pci module is an example
+of a bus driver.  This module registers devices along with a set of bus
+specific callbacks with the VFIO core.  These callbacks provide the
+interfaces later used for device access.  As each new group is created,
+as determined by iommu_device_group(), VFIO creates a /dev/vfio/$GROUP
+character device.
+
+In addition to the device enumeration and callbacks, the VFIO bus driver
+also provides a traditional device driver and is able to bind to devices
+on its bus.  When a device is bound to the bus driver, it's available to
+VFIO.  When all the devices within a group are bound to their bus drivers,
+the group becomes "viable" and a user with sufficient access to the VFIO
+group chardev can obtain exclusive access to the set of group devices.
+
+As documented in linux/vfio.h, several ioctls are provided on the
+group chardev:
+
+#define VFIO_GROUP_GET_FLAGS            _IOR(';', 100, __u64)
+ #define VFIO_GROUP_FLAGS_VIABLE        (1 << 0)
+ #define VFIO_GROUP_FLAGS_MM_LOCKED     (1 << 1)
+#define VFIO_GROUP_MERGE                _IOW(';', 101, int)
+#define VFIO_GROUP_UNMERGE              _IOW(';', 102, int)
+#define VFIO_GROUP_GET_IOMMU_FD         _IO(';', 103)
+#define VFIO_GROUP_GET_DEVICE_FD        _IOW(';', 104, char *)
+
+The last two ioctls return new file descriptors for accessing
+individual devices within the group and programming the IOMMU.  Each of
+these new file descriptors provides its own set of file interfaces.
+These ioctls will fail if any of the devices within the group are not
+bound to their VFIO bus driver.  Additionally, when either of these
+interfaces is used, the group is then bound to the struct_mm of the
+caller.  The GET_FLAGS ioctl can be used to view the state of the group.
+
+When either the GET_IOMMU_FD or GET_DEVICE_FD ioctl is invoked, a
+new IOMMU domain is created and all of the devices in the group are
+attached to it.  This is the only way to ensure full IOMMU isolation
+of the group, but potentially wastes resources and cycles if the user
+intends to manage multiple groups with the same set of IOMMU mappings.
+VFIO therefore provides a group MERGE and UNMERGE interface, which
+allows multiple groups to share an IOMMU domain.  Not all IOMMUs allow
+arbitrary groups to be merged, so the user should assume merging is
+opportunistic.  A new group, with no open device or IOMMU file
+descriptors, can be merged into an existing, in-use, group using the
+MERGE ioctl.  A merged group can be unmerged using the UNMERGE ioctl
+once all of the device file descriptors for the group being merged
+"out" are closed.
+
+When groups are merged, the GET_IOMMU_FD and GET_DEVICE_FD ioctls are
+essentially fungible between group file descriptors (ie. if device A
+is in group X, and X is merged with Y, a file descriptor for A can be
+retrieved using GET_DEVICE_FD on Y.  Likewise, GET_IOMMU_FD returns a
+file descriptor referencing the same internal IOMMU object from either
+X or Y).  Merged groups can be dissolved either explicitly with UNMERGE
+or automatically when ALL file descriptors for the merged group are
+closed (all IOMMUs, all devices, all groups).
+
+The IOMMU file descriptor provides this set of ioctls:
+
+#define VFIO_IOMMU_GET_FLAGS            _IOR(';', 105, __u64)
+ #define VFIO_IOMMU_FLAGS_MAP_ANY       (1 << 0)
+#define VFIO_IOMMU_MAP_DMA              _IOWR(';', 106, struct vfio_dma_map)
+#define VFIO_IOMMU_UNMAP_DMA            _IOWR(';', 107, struct vfio_dma_map)
+
+The GET_FLAGS ioctl returns basic information about the IOMMU domain.
+We currently only support IOMMU domains that are able to map any
+virtual address to any IOVA.  This is indicated by the MAP_ANY flag.
+
+The (UN)MAP_DMA commands make use of struct vfio_dma_map for mapping
+and unmapping IOVAs to process virtual addresses:
+
+struct vfio_dma_map {
+        __u64   len;            /* length of structure */
+        __u64   vaddr;          /* process virtual addr */
+        __u64   dmaaddr;        /* desired and/or returned dma address */
+        __u64   size;           /* size in bytes */
+        __u64   flags;
+#define VFIO_DMA_MAP_FLAG_WRITE         (1 << 0) /* req writeable DMA mem */
+};
+
+Current users of VFIO use relatively static DMA mappings, not requiring
+high frequency turnover.  As new users are added, it's expected that the
+IOMMU file descriptor will evolve to support new mapping interfaces; this
+will be reflected in the flags and may present new ioctls and file
+interfaces.
+
+The device GET_FLAGS ioctl is intended to return basic device type and
+indicate support for optional capabilities.  Flags currently include whether
+the device is PCI or described by Device Tree, and whether the RESET ioctl
+is supported:
+
+#define VFIO_DEVICE_GET_FLAGS           _IOR(';', 108, __u64)
+ #define VFIO_DEVICE_FLAGS_PCI          (1 << 0)
+ #define VFIO_DEVICE_FLAGS_DT           (1 << 1)
+ #define VFIO_DEVICE_FLAGS_RESET        (1 << 2)
+
+The MMIO and I/O port resources used by a device are described by regions.
+The GET_NUM_REGIONS ioctl tells us how many regions the device supports:
+
+#define VFIO_DEVICE_GET_NUM_REGIONS     _IOR(';', 109, int)
+
+Regions are described by a struct vfio_region_info, which is retrieved by
+using the GET_REGION_INFO ioctl with vfio_region_info.index field set to
+the desired region (0 based index).  Note that devices may implement zero
+sized regions (vfio-pci does this to provide a 1:1 BAR to region index
+mapping).
+
+struct vfio_region_info {
+        __u32   len;            /* length of structure */
+        __u32   index;          /* region number */
+        __u64   size;           /* size in bytes of region */
+        __u64   offset;         /* start offset of region */
+        __u64   flags;
+#define VFIO_REGION_INFO_FLAG_MMAP              (1 << 0)
+#define VFIO_REGION_INFO_FLAG_RO                (1 << 1)
+#define VFIO_REGION_INFO_FLAG_PHYS_VALID        (1 << 2)
+        __u64   phys;           /* physical address of region */
+};
+
+#define VFIO_DEVICE_GET_REGION_INFO     _IOWR(';', 110, struct vfio_region_info)
+
+The offset indicates the offset into the device file descriptor which
+accesses the given range (for read/write/mmap/seek).  Flags indicate the
+available access types and validity of optional fields.  For instance
+the phys field may only be valid for certain device types.
+
+Interrupts are described using a similar interface.  GET_NUM_IRQS
+reports the number of IRQ indexes for the device.
+
+#define VFIO_DEVICE_GET_NUM_IRQS        _IOR(';', 111, int)
+
+struct vfio_irq_info {
+        __u32   len;            /* length of structure */
+        __u32   index;          /* IRQ number */
+        __u32   count;          /* number of individual IRQs */
+        __u64   flags;
+#define VFIO_IRQ_INFO_FLAG_LEVEL                (1 << 0)
+};
+
+Again, zero count entries are allowed (vfio-pci uses a static interrupt
+type to index mapping).
+
+Information about each index can be retrieved using the GET_IRQ_INFO
+ioctl, used much like GET_REGION_INFO.
+
+#define VFIO_DEVICE_GET_IRQ_INFO        _IOWR(';', 112, struct vfio_irq_info)
+
+Individual indexes can describe a single IRQ or a set of IRQs.  This provides the
+flexibility to describe PCI INTx, MSI, and MSI-X using a single interface.
+
+All VFIO interrupts are signaled to userspace via eventfds.  Integer arrays,
+as shown below, are used to pass the IRQ info index, the number of eventfds,
+and each eventfd to be signaled.  Using a count of 0 disables the interrupt.
+
+/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
+#define VFIO_DEVICE_SET_IRQ_EVENTFDS    _IOW(';', 113, int)
+
+When a level triggered interrupt is signaled, the interrupt is masked
+on the host.  This prevents an unresponsive userspace driver from
+continuing to interrupt the host system.  After servicing the interrupt,
+UNMASK_IRQ is used to allow the interrupt to retrigger.  Note that level
+triggered interrupts implicitly have a count of 1 per index.
+
+/* Unmask IRQ index, arg[0] = index */
+#define VFIO_DEVICE_UNMASK_IRQ          _IOW(';', 114, int)
+
+Level triggered interrupts can also be unmasked using an irqfd.  Use
+SET_UNMASK_IRQ_EVENTFD to set the file descriptor for this.
+
+/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
+#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD      _IOW(';', 115, int)
+
+When supported, as indicated by the device flags, reset the device.
+
+#define VFIO_DEVICE_RESET               _IO(';', 116)
+
+Device tree devices also include ioctls for further defining the
+device tree properties of the device:
+
+struct vfio_dtpath {
+        __u32   len;            /* length of structure */
+        __u32   index;
+        __u64   flags;
+#define VFIO_DTPATH_FLAGS_REGION        (1 << 0)
+#define VFIO_DTPATH_FLAGS_IRQ           (1 << 1)
+        char    *path;
+};
+#define VFIO_DEVICE_GET_DTPATH          _IOWR(';', 117, struct vfio_dtpath)
+
+struct vfio_dtindex {
+        __u32   len;            /* length of structure */
+        __u32   index;
+        __u32   prop_type;
+        __u32   prop_index;
+        __u64   flags;
+#define VFIO_DTINDEX_FLAGS_REGION       (1 << 0)
+#define VFIO_DTINDEX_FLAGS_IRQ          (1 << 1)
+};
+#define VFIO_DEVICE_GET_DTINDEX         _IOWR(';', 118, struct vfio_dtindex)
+
+
+VFIO bus driver API
+-------------------------------------------------------------------------------
+
+Bus drivers, such as PCI, have three jobs:
+ 1) Add/remove devices from vfio
+ 2) Provide vfio_device_ops for device access
+ 3) Device binding and unbinding
+
+When initialized, the bus driver should enumerate the devices on its
+bus and call vfio_group_add_dev() for each device.  If the bus supports
+hotplug, notifiers should be enabled to track devices being added and
+removed.  vfio_group_del_dev() removes a previously added device from
+vfio.
+
+Adding a device registers a vfio_device_ops function pointer structure
+for the device:
+
+struct vfio_device_ops {
+	bool			(*match)(struct device *, char *);
+	int			(*get)(void *);
+	void			(*put)(void *);
+	ssize_t			(*read)(void *, char __user *,
+					size_t, loff_t *);
+	ssize_t			(*write)(void *, const char __user *,
+					 size_t, loff_t *);
+	long			(*ioctl)(void *, unsigned int, unsigned long);
+	int			(*mmap)(void *, struct vm_area_struct *);
+};
+
+When a device is bound to the bus driver, the bus driver indicates this
+to vfio using the vfio_bind_dev() interface.  The device_data parameter
+is a pointer to an opaque data structure for use only by the bus driver.
+The get, put, read, write, ioctl, and mmap vfio_device_ops all pass
+this data structure back to the bus driver.  When a device is unbound
+from the bus driver, the vfio_unbind_dev() interface signals this to
+vfio.  This function returns the pointer to the device_data structure
+registered for the device.
+
+As noted previously, a group contains one or more devices, so
+GROUP_GET_DEVICE_FD needs to identify the specific device being requested.
+The vfio_device_ops.match callback is used to allow bus drivers to determine
+the match.  For drivers like vfio-pci, it's a simple match to dev_name(),
+which is unique in the system due to the PCI bus topology; other bus drivers
+may need to include parent devices to create a unique match, so this is
+left as a bus driver interface.
+
+-------------------------------------------------------------------------------
+
+[1] VFIO was originally an acronym for "Virtual Function I/O" in its
+initial implementation by Tom Lyon while at Cisco.  We've since outgrown
+the acronym, but it's catchy.
+
+[2] As always there are trade-offs to virtual machine device
+assignment that are beyond the scope of VFIO.  It's expected that
+future IOMMU technologies will reduce some, but maybe not all, of
+these trade-offs.
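
(Not part of the patch: a heavily trimmed sketch of how a hypothetical bus
driver might use the API described above.  All foo_* names are invented,
only a few of the vfio_device_ops callbacks are shown, and error handling
is minimal.)

#include <linux/device.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/vfio.h>

struct foo_device {
	struct device *dev;
	/* bus specific state would live here */
};

static bool foo_match(struct device *dev, char *buf)
{
	/* dev_name() is unique for PCI; other buses may need more. */
	return !strcmp(dev_name(dev), buf);
}

static int foo_get(void *device_data)
{
	return 0;	/* make the device ready for the user */
}

static void foo_put(void *device_data)
{
	/* quiesce the device, disable interrupts, etc. */
}

static const struct vfio_device_ops foo_vfio_ops = {
	.match	= foo_match,
	.get	= foo_get,
	.put	= foo_put,
	/* a real driver also supplies .read/.write/.ioctl/.mmap */
};

/* Device discovered on the bus: make vfio aware of it so the group and
 * its /dev/vfio/$GROUP chardev exist. */
static int foo_add(struct device *dev)
{
	return vfio_group_add_dev(dev, &foo_vfio_ops);
}

/* Driver probe: the device becomes usable through vfio, and the private
 * data registered here is what the ops above are handed back. */
static int foo_probe(struct device *dev)
{
	struct foo_device *fdev = kzalloc(sizeof(*fdev), GFP_KERNEL);

	if (!fdev)
		return -ENOMEM;
	fdev->dev = dev;
	return vfio_bind_dev(dev, fdev);
}

static void foo_remove(struct device *dev)
{
	kfree(vfio_unbind_dev(dev));
}
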
diff --git a/MAINTAINERS b/MAINTAINERS
index f05f5f6..4bd5aa0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7106,6 +7106,14 @@  S:	Maintained
 F:	Documentation/filesystems/vfat.txt
 F:	fs/fat/
 
+VFIO DRIVER
+M:	Alex Williamson <alex.williamson@redhat.com>
+L:	kvm@vger.kernel.org
+S:	Maintained
+F:	Documentation/vfio.txt
+F:	drivers/vfio/
+F:	include/linux/vfio.h
+
 VIDEOBUF2 FRAMEWORK
 M:	Pawel Osciak <pawel@osciak.com>
 M:	Marek Szyprowski <m.szyprowski@samsung.com>
diff --git a/drivers/Kconfig b/drivers/Kconfig
index b5e6f24..e15578b 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -112,6 +112,8 @@  source "drivers/auxdisplay/Kconfig"
 
 source "drivers/uio/Kconfig"
 
+source "drivers/vfio/Kconfig"
+
 source "drivers/vlynq/Kconfig"
 
 source "drivers/virtio/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 1b31421..5f138b5 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -58,6 +58,7 @@  obj-$(CONFIG_ATM)		+= atm/
 obj-$(CONFIG_FUSION)		+= message/
 obj-y				+= firewire/
 obj-$(CONFIG_UIO)		+= uio/
+obj-$(CONFIG_VFIO)		+= vfio/
 obj-y				+= cdrom/
 obj-y				+= auxdisplay/
 obj-$(CONFIG_PCCARD)		+= pcmcia/
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
new file mode 100644
index 0000000..9acb1e7
--- /dev/null
+++ b/drivers/vfio/Kconfig
@@ -0,0 +1,8 @@ 
+menuconfig VFIO
+	tristate "VFIO Non-Privileged userspace driver framework"
+	depends on IOMMU_API
+	help
+	  VFIO provides a framework for secure userspace device drivers.
+	  See Documentation/vfio.txt for more details.
+
+	  If you don't know what to do here, say N.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
new file mode 100644
index 0000000..088faf1
--- /dev/null
+++ b/drivers/vfio/Makefile
@@ -0,0 +1,3 @@ 
+vfio-y := vfio_main.o vfio_iommu.o
+
+obj-$(CONFIG_VFIO) := vfio.o
diff --git a/drivers/vfio/vfio_iommu.c b/drivers/vfio/vfio_iommu.c
new file mode 100644
index 0000000..029dae3
--- /dev/null
+++ b/drivers/vfio/vfio_iommu.c
@@ -0,0 +1,530 @@ 
+/*
+ * VFIO: IOMMU DMA mapping support
+ *
+ * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#include <linux/compat.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/workqueue.h>
+
+#include "vfio_private.h"
+
+struct dma_map_page {
+	struct list_head	list;
+	dma_addr_t		daddr;
+	unsigned long		vaddr;
+	int			npage;
+	int			rdwr;
+};
+
+/*
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)
+
+struct vwork {
+	struct mm_struct	*mm;
+	int			npage;
+	struct work_struct	work;
+};
+
+/* delayed decrement for locked_vm */
+static void vfio_lock_acct_bg(struct work_struct *work)
+{
+	struct vwork *vwork = container_of(work, struct vwork, work);
+	struct mm_struct *mm;
+
+	mm = vwork->mm;
+	down_write(&mm->mmap_sem);
+	mm->locked_vm += vwork->npage;
+	up_write(&mm->mmap_sem);
+	mmput(mm);		/* unref mm */
+	kfree(vwork);
+}
+
+static void vfio_lock_acct(int npage)
+{
+	struct vwork *vwork;
+	struct mm_struct *mm;
+
+	if (!current->mm) {
+		/* process exited */
+		return;
+	}
+	if (down_write_trylock(&current->mm->mmap_sem)) {
+		current->mm->locked_vm += npage;
+		up_write(&current->mm->mmap_sem);
+		return;
+	}
+	/*
+	 * Couldn't get mmap_sem lock, so must setup to decrement
+	 * mm->locked_vm later. If locked_vm were atomic, we wouldn't
+	 * need this silliness
+	 */
+	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+	if (!vwork)
+		return;
+	mm = get_task_mm(current);	/* take ref mm */
+	if (!mm) {
+		kfree(vwork);
+		return;
+	}
+	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
+	vwork->mm = mm;
+	vwork->npage = npage;
+	schedule_work(&vwork->work);
+}
+
+/* Some mappings aren't backed by a struct page, for example an mmap'd
+ * MMIO range for our own or another device.  These use a different
+ * pfn conversion and shouldn't be tracked as locked pages. */
+static int is_invalid_reserved_pfn(unsigned long pfn)
+{
+	if (pfn_valid(pfn)) {
+		int reserved;
+		struct page *tail = pfn_to_page(pfn);
+		struct page *head = compound_trans_head(tail);
+		reserved = PageReserved(head);
+		if (head != tail) {
+			/* "head" is not a dangling pointer
+			 * (compound_trans_head takes care of that)
+			 * but the hugepage may have been split
+			 * from under us (and we may not hold a
+			 * reference count on the head page so it can
+			 * be reused before we run PageReferenced), so
+			 * we've to check PageTail before returning
+			 * what we just read.
+			 */
+			smp_rmb();
+			if (PageTail(tail))
+				return reserved;
+		}
+		return PageReserved(tail);
+	}
+
+	return true;
+}
+
+static int put_pfn(unsigned long pfn, int rdwr)
+{
+	if (!is_invalid_reserved_pfn(pfn)) {
+		struct page *page = pfn_to_page(pfn);
+		if (rdwr)
+			SetPageDirty(page);
+		put_page(page);
+		return 1;
+	}
+	return 0;
+}
+
+/* Unmap DMA region */
+/* dgate must be held */
+static int __vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
+			    int npage, int rdwr)
+{
+	int i, unlocked = 0;
+
+	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
+		unsigned long pfn;
+
+		pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
+		if (pfn) {
+			iommu_unmap(iommu->domain, iova, 0);
+			unlocked += put_pfn(pfn, rdwr);
+		}
+	}
+	return unlocked;
+}
+
+static void vfio_dma_unmap(struct vfio_iommu *iommu, unsigned long iova,
+			   unsigned long npage, int rdwr)
+{
+	int unlocked;
+
+	unlocked = __vfio_dma_unmap(iommu, iova, npage, rdwr);
+	vfio_lock_acct(-unlocked);
+}
+
+/* Unmap ALL DMA regions */
+void vfio_iommu_unmapall(struct vfio_iommu *iommu)
+{
+	struct list_head *pos, *pos2;
+	struct dma_map_page *mlp;
+
+	mutex_lock(&iommu->dgate);
+	list_for_each_safe(pos, pos2, &iommu->dm_list) {
+		mlp = list_entry(pos, struct dma_map_page, list);
+		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
+		list_del(&mlp->list);
+		kfree(mlp);
+	}
+	mutex_unlock(&iommu->dgate);
+}
+
+static int vaddr_get_pfn(unsigned long vaddr, int rdwr, unsigned long *pfn)
+{
+	struct page *page[1];
+	struct vm_area_struct *vma;
+	int ret = -EFAULT;
+
+	if (get_user_pages_fast(vaddr, 1, rdwr, page) == 1) {
+		*pfn = page_to_pfn(page[0]);
+		return 0;
+	}
+
+	down_read(&current->mm->mmap_sem);
+
+	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
+
+	if (vma && vma->vm_flags & VM_PFNMAP) {
+		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+		if (is_invalid_reserved_pfn(*pfn))
+			ret = 0;
+	}
+
+	up_read(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+/* Map DMA region */
+/* dgate must be held */
+static int vfio_dma_map(struct vfio_iommu *iommu, unsigned long iova,
+			unsigned long vaddr, int npage, int rdwr)
+{
+	unsigned long start = iova;
+	int i, ret, locked = 0, prot = IOMMU_READ;
+
+	/* Verify pages are not already mapped */
+	for (i = 0; i < npage; i++, iova += PAGE_SIZE)
+		if (iommu_iova_to_phys(iommu->domain, iova))
+			return -EBUSY;
+
+	iova = start;
+
+	if (rdwr)
+		prot |= IOMMU_WRITE;
+	if (iommu->cache)
+		prot |= IOMMU_CACHE;
+
+	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
+		unsigned long pfn = 0;
+
+		ret = vaddr_get_pfn(vaddr, rdwr, &pfn);
+		if (ret) {
+			__vfio_dma_unmap(iommu, start, i, rdwr);
+			return ret;
+		}
+
+		/* Only add actual locked pages to accounting */
+		if (!is_invalid_reserved_pfn(pfn))
+			locked++;
+
+		ret = iommu_map(iommu->domain, iova,
+				(phys_addr_t)pfn << PAGE_SHIFT, 0, prot);
+		if (ret) {
+			/* Back out mappings on error */
+			put_pfn(pfn, rdwr);
+			__vfio_dma_unmap(iommu, start, i, rdwr);
+			return ret;
+		}
+	}
+	vfio_lock_acct(locked);
+	return 0;
+}
+
+static inline int ranges_overlap(unsigned long start1, size_t size1,
+				 unsigned long start2, size_t size2)
+{
+	return !(start1 + size1 <= start2 || start2 + size2 <= start1);
+}
+
+static struct dma_map_page *vfio_find_dma(struct vfio_iommu *iommu,
+					  dma_addr_t start, size_t size)
+{
+	struct list_head *pos;
+	struct dma_map_page *mlp;
+
+	list_for_each(pos, &iommu->dm_list) {
+		mlp = list_entry(pos, struct dma_map_page, list);
+		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
+				   start, size))
+			return mlp;
+	}
+	return NULL;
+}
+
+int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
+			    size_t size, struct dma_map_page *mlp)
+{
+	struct dma_map_page *split;
+	int npage_lo, npage_hi;
+
+	/* Existing dma region is completely covered, unmap all */
+	if (start <= mlp->daddr &&
+	    start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
+		vfio_dma_unmap(iommu, mlp->daddr, mlp->npage, mlp->rdwr);
+		list_del(&mlp->list);
+		npage_lo = mlp->npage;
+		kfree(mlp);
+		return npage_lo;
+	}
+
+	/* Overlap low address of existing range */
+	if (start <= mlp->daddr) {
+		size_t overlap;
+
+		overlap = start + size - mlp->daddr;
+		npage_lo = overlap >> PAGE_SHIFT;
+		npage_hi = mlp->npage - npage_lo;
+
+		vfio_dma_unmap(iommu, mlp->daddr, npage_lo, mlp->rdwr);
+		mlp->daddr += overlap;
+		mlp->vaddr += overlap;
+		mlp->npage -= npage_lo;
+		return npage_lo;
+	}
+
+	/* Overlap high address of existing range */
+	if (start + size >= mlp->daddr + NPAGE_TO_SIZE(mlp->npage)) {
+		size_t overlap;
+
+		overlap = mlp->daddr + NPAGE_TO_SIZE(mlp->npage) - start;
+		npage_hi = overlap >> PAGE_SHIFT;
+		npage_lo = mlp->npage - npage_hi;
+
+		vfio_dma_unmap(iommu, start, npage_hi, mlp->rdwr);
+		mlp->npage -= npage_hi;
+		return npage_hi;
+	}
+
+	/* Split existing */
+	npage_lo = (start - mlp->daddr) >> PAGE_SHIFT;
+	npage_hi = mlp->npage - (size >> PAGE_SHIFT) - npage_lo;
+
+	split = kzalloc(sizeof *split, GFP_KERNEL);
+	if (!split)
+		return -ENOMEM;
+
+	vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, mlp->rdwr);
+
+	mlp->npage = npage_lo;
+
+	split->npage = npage_hi;
+	split->daddr = start + size;
+	split->vaddr = mlp->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
+	split->rdwr = mlp->rdwr;
+	list_add(&split->list, &iommu->dm_list);
+	return size >> PAGE_SHIFT;
+}
+
+int vfio_dma_unmap_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
+{
+	int ret = 0;
+	size_t npage = dmp->size >> PAGE_SHIFT;
+	struct list_head *pos, *n;
+
+	if (dmp->dmaaddr & ~PAGE_MASK)
+		return -EINVAL;
+	if (dmp->size & ~PAGE_MASK)
+		return -EINVAL;
+
+	mutex_lock(&iommu->dgate);
+
+	list_for_each_safe(pos, n, &iommu->dm_list) {
+		struct dma_map_page *mlp;
+
+		mlp = list_entry(pos, struct dma_map_page, list);
+		if (ranges_overlap(mlp->daddr, NPAGE_TO_SIZE(mlp->npage),
+				   dmp->dmaaddr, dmp->size)) {
+			ret = vfio_remove_dma_overlap(iommu, dmp->dmaaddr,
+						      dmp->size, mlp);
+			if (ret > 0)
+				npage -= ret;
+			if (ret < 0 || npage == 0)
+				break;
+		}
+	}
+	mutex_unlock(&iommu->dgate);
+	return ret > 0 ? 0 : ret;
+}
+
+int vfio_dma_map_dm(struct vfio_iommu *iommu, struct vfio_dma_map *dmp)
+{
+	int npage;
+	struct dma_map_page *mlp, *mmlp = NULL;
+	dma_addr_t daddr = dmp->dmaaddr;
+	unsigned long locked, lock_limit, vaddr = dmp->vaddr;
+	size_t size = dmp->size;
+	int ret = 0, rdwr = dmp->flags & VFIO_DMA_MAP_FLAG_WRITE;
+
+	if (vaddr & (PAGE_SIZE-1))
+		return -EINVAL;
+	if (daddr & (PAGE_SIZE-1))
+		return -EINVAL;
+	if (size & (PAGE_SIZE-1))
+		return -EINVAL;
+
+	npage = size >> PAGE_SHIFT;
+	if (!npage)
+		return -EINVAL;
+
+	if (!iommu)
+		return -EINVAL;
+
+	mutex_lock(&iommu->dgate);
+
+	if (vfio_find_dma(iommu, daddr, size)) {
+		ret = -EBUSY;
+		goto out_lock;
+	}
+
+	/* account for locked pages */
+	locked = current->mm->locked_vm + npage;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		printk(KERN_WARNING "%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
+			__func__, rlimit(RLIMIT_MEMLOCK));
+		ret = -ENOMEM;
+		goto out_lock;
+	}
+
+	ret = vfio_dma_map(iommu, daddr, vaddr, npage, rdwr);
+	if (ret)
+		goto out_lock;
+
+	/* Check if we abut a region below */
+	if (daddr) {
+		mlp = vfio_find_dma(iommu, daddr - 1, 1);
+		if (mlp && mlp->rdwr == rdwr &&
+		    mlp->vaddr + NPAGE_TO_SIZE(mlp->npage) == vaddr) {
+
+			mlp->npage += npage;
+			daddr = mlp->daddr;
+			vaddr = mlp->vaddr;
+			npage = mlp->npage;
+			size = NPAGE_TO_SIZE(npage);
+
+			mmlp = mlp;
+		}
+	}
+
+	if (daddr + size) {
+		mlp = vfio_find_dma(iommu, daddr + size, 1);
+		if (mlp && mlp->rdwr == rdwr && mlp->vaddr == vaddr + size) {
+
+			mlp->npage += npage;
+			mlp->daddr = daddr;
+			mlp->vaddr = vaddr;
+
+			/* If merged above and below, remove previously
+			 * merged entry.  New entry covers it.  */
+			if (mmlp) {
+				list_del(&mmlp->list);
+				kfree(mmlp);
+			}
+			mmlp = mlp;
+		}
+	}
+
+	if (!mmlp) {
+		mlp = kzalloc(sizeof *mlp, GFP_KERNEL);
+		if (!mlp) {
+			ret = -ENOMEM;
+			vfio_dma_unmap(iommu, daddr, npage, rdwr);
+			goto out_lock;
+		}
+
+		mlp->npage = npage;
+		mlp->daddr = daddr;
+		mlp->vaddr = vaddr;
+		mlp->rdwr = rdwr;
+		list_add(&mlp->list, &iommu->dm_list);
+	}
+
+out_lock:
+	mutex_unlock(&iommu->dgate);
+	return ret;
+}
+
+static int vfio_iommu_release(struct inode *inode, struct file *filep)
+{
+	struct vfio_iommu *iommu = filep->private_data;
+
+	vfio_release_iommu(iommu);
+	return 0;
+}
+
+static long vfio_iommu_unl_ioctl(struct file *filep,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct vfio_iommu *iommu = filep->private_data;
+	int ret = -ENOSYS;
+
+	if (cmd == VFIO_IOMMU_GET_FLAGS) {
+		u64 flags = VFIO_IOMMU_FLAGS_MAP_ANY;
+
+		ret = put_user(flags, (u64 __user *)arg);
+
+	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
+		struct vfio_dma_map dm;
+
+		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
+			return -EFAULT;
+
+		ret = vfio_dma_map_dm(iommu, &dm);
+
+		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
+			ret = -EFAULT;
+
+	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
+		struct vfio_dma_map dm;
+
+		if (copy_from_user(&dm, (void __user *)arg, sizeof dm))
+			return -EFAULT;
+
+		ret = vfio_dma_unmap_dm(iommu, &dm);
+
+		if (!ret && copy_to_user((void __user *)arg, &dm, sizeof dm))
+			ret = -EFAULT;
+	}
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long vfio_iommu_compat_ioctl(struct file *filep,
+				    unsigned int cmd, unsigned long arg)
+{
+	arg = (unsigned long)compat_ptr(arg);
+	return vfio_iommu_unl_ioctl(filep, cmd, arg);
+}
+#endif	/* CONFIG_COMPAT */
+
+const struct file_operations vfio_iommu_fops = {
+	.owner		= THIS_MODULE,
+	.release	= vfio_iommu_release,
+	.unlocked_ioctl	= vfio_iommu_unl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= vfio_iommu_compat_ioctl,
+#endif
+};
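
(Not part of the patch: a userspace view of the tracking above.  Mapping a
16K buffer and then unmapping one page in the middle exercises the split
path in vfio_remove_dma_overlap().  The struct layout follows the RFC's
documentation; iommu_fd and a page-aligned buf are assumed, as are 4K
pages for the arithmetic.)

	struct vfio_dma_map dm = {
		.len	 = sizeof(dm),
		.vaddr	 = (__u64)(unsigned long)buf,
		.dmaaddr = 0x100000,		/* arbitrary IOVA */
		.size	 = 4 * 4096,
		.flags	 = VFIO_DMA_MAP_FLAG_WRITE,
	};

	/* Pins the pages, programs the IOMMU, and records a single
	 * tracked region covering all four pages. */
	ioctl(iommu_fd, VFIO_IOMMU_MAP_DMA, &dm);

	/* Unmapping the second page splits that region in two: pages 0
	 * and 2-3 stay mapped and accounted, page 1 is unpinned. */
	dm.dmaaddr = 0x100000 + 4096;
	dm.size = 4096;
	ioctl(iommu_fd, VFIO_IOMMU_UNMAP_DMA, &dm);
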
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
new file mode 100644
index 0000000..6169356
--- /dev/null
+++ b/drivers/vfio/vfio_main.c
@@ -0,0 +1,1151 @@ 
+/*
+ * VFIO framework
+ *
+ * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#include <linux/cdev.h>
+#include <linux/compat.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/fs.h>
+#include <linux/idr.h>
+#include <linux/iommu.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/wait.h>
+
+#include "vfio_private.h"
+
+#define DRIVER_VERSION	"0.2"
+#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
+#define DRIVER_DESC	"VFIO - User Level meta-driver"
+
+static int allow_unsafe_intrs;
+module_param(allow_unsafe_intrs, int, 0);
+MODULE_PARM_DESC(allow_unsafe_intrs,
+        "Allow use of IOMMUs which do not support interrupt remapping");
+
+static struct vfio {
+	dev_t			devt;
+	struct cdev		cdev;
+	struct list_head	group_list;
+	struct mutex		lock;
+	struct kref		kref;
+	struct class		*class;
+	struct idr		idr;
+	wait_queue_head_t	release_q;
+} vfio;
+
+static const struct file_operations vfio_group_fops;
+extern const struct file_operations vfio_iommu_fops;
+
+struct vfio_group {
+	dev_t			devt;
+	unsigned int		groupid;
+	struct bus_type		*bus;
+	struct vfio_iommu	*iommu;
+	struct list_head	device_list;
+	struct list_head	iommu_next;
+	struct list_head	group_next;
+	int			refcnt;
+};
+
+struct vfio_device {
+	struct device			*dev;
+	const struct vfio_device_ops	*ops;
+	struct vfio_iommu		*iommu;
+	struct vfio_group		*group;
+	struct list_head		device_next;
+	bool				attached;
+	int				refcnt;
+	void				*device_data;
+};
+
+/*
+ * Helper functions called under vfio.lock
+ */
+
+/* Return true if any devices within a group are opened */
+static bool __vfio_group_devs_inuse(struct vfio_group *group)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, &group->device_list) {
+		struct vfio_device *device;
+
+		device = list_entry(pos, struct vfio_device, device_next);
+		if (device->refcnt)
+			return true;
+	}
+	return false;
+}
+
+/* Return true if any of the groups attached to an iommu are opened.
+ * We can only tear apart merged groups when nothing is left open. */
+static bool __vfio_iommu_groups_inuse(struct vfio_iommu *iommu)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, &iommu->group_list) {
+		struct vfio_group *group;
+
+		group = list_entry(pos, struct vfio_group, iommu_next);
+		if (group->refcnt)
+			return true;
+	}
+	return false;
+}
+
+/* An iommu is "in use" if it has a file descriptor open or if any of
+ * the groups assigned to the iommu have devices open. */
+static bool __vfio_iommu_inuse(struct vfio_iommu *iommu)
+{
+	struct list_head *pos;
+
+	if (iommu->refcnt)
+		return true;
+
+	list_for_each(pos, &iommu->group_list) {
+		struct vfio_group *group;
+
+		group = list_entry(pos, struct vfio_group, iommu_next);
+
+		if (__vfio_group_devs_inuse(group))
+			return true;
+	}
+	return false;
+}
+
+static void __vfio_group_set_iommu(struct vfio_group *group,
+				   struct vfio_iommu *iommu)
+{
+	struct list_head *pos;
+
+	if (group->iommu)
+		list_del(&group->iommu_next);
+	if (iommu)
+		list_add(&group->iommu_next, &iommu->group_list);
+
+	group->iommu = iommu;
+
+	list_for_each(pos, &group->device_list) {
+		struct vfio_device *device;
+
+		device = list_entry(pos, struct vfio_device, device_next);
+		device->iommu = iommu;
+	}
+}
+
+static void __vfio_iommu_detach_dev(struct vfio_iommu *iommu,
+				    struct vfio_device *device)
+{
+	BUG_ON(!iommu->domain && device->attached);
+
+	if (!iommu->domain || !device->attached)
+		return;
+
+	iommu_detach_device(iommu->domain, device->dev);
+	device->attached = false;
+}
+
+static void __vfio_iommu_detach_group(struct vfio_iommu *iommu,
+				      struct vfio_group *group)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, &group->device_list) {
+		struct vfio_device *device;
+
+		device = list_entry(pos, struct vfio_device, device_next);
+		__vfio_iommu_detach_dev(iommu, device);
+	}
+}
+
+static int __vfio_iommu_attach_dev(struct vfio_iommu *iommu,
+				   struct vfio_device *device)
+{
+	int ret;
+
+	BUG_ON(device->attached);
+
+	if (!iommu || !iommu->domain)
+		return -EINVAL;
+
+	ret = iommu_attach_device(iommu->domain, device->dev);
+	if (!ret)
+		device->attached = true;
+
+	return ret;
+}
+
+static int __vfio_iommu_attach_group(struct vfio_iommu *iommu,
+				     struct vfio_group *group)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, &group->device_list) {
+		struct vfio_device *device;
+		int ret;
+
+		device = list_entry(pos, struct vfio_device, device_next);
+		ret = __vfio_iommu_attach_dev(iommu, device);
+		if (ret) {
+			__vfio_iommu_detach_group(iommu, group);
+			return ret;
+		}
+	}
+	return 0;
+}
+
+/* The iommu is viable, ie. ready to be configured, when all the devices
+ * for all the groups attached to the iommu are bound to their vfio device
+ * drivers (ex. vfio-pci).  This sets the device_data private data pointer. */
+static bool __vfio_iommu_viable(struct vfio_iommu *iommu)
+{
+	struct list_head *gpos, *dpos;
+
+	list_for_each(gpos, &iommu->group_list) {
+		struct vfio_group *group;
+		group = list_entry(gpos, struct vfio_group, iommu_next);
+
+		list_for_each(dpos, &group->device_list) {
+			struct vfio_device *device;
+			device = list_entry(dpos,
+					    struct vfio_device, device_next);
+
+			if (!device->device_data)
+				return false;
+		}
+	}
+	return true;
+}
+
+static void __vfio_close_iommu(struct vfio_iommu *iommu)
+{
+	struct list_head *pos;
+
+	if (!iommu->domain)
+		return;
+
+	list_for_each(pos, &iommu->group_list) {
+		struct vfio_group *group;
+		group = list_entry(pos, struct vfio_group, iommu_next);
+
+		__vfio_iommu_detach_group(iommu, group);
+	}
+
+	vfio_iommu_unmapall(iommu);
+
+	iommu_domain_free(iommu->domain);
+	iommu->domain = NULL;
+	iommu->mm = NULL;
+}
+
+/* Open the IOMMU.  This gates all access to the iommu or device file
+ * descriptors and sets current->mm as the exclusive user. */
+static int __vfio_open_iommu(struct vfio_iommu *iommu)
+{
+	struct list_head *pos;
+	int ret;
+
+	if (!__vfio_iommu_viable(iommu))
+		return -EBUSY;
+
+	if (iommu->domain)
+		return -EINVAL;
+
+	iommu->domain = iommu_domain_alloc(iommu->bus);
+	if (!iommu->domain)
+		return -EFAULT;
+
+	list_for_each(pos, &iommu->group_list) {
+		struct vfio_group *group;
+		group = list_entry(pos, struct vfio_group, iommu_next);
+
+		ret = __vfio_iommu_attach_group(iommu, group);
+		if (ret) {
+			__vfio_close_iommu(iommu);
+			return ret;
+		}
+	}
+
+	if (!allow_unsafe_intrs &&
+	    !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
+		__vfio_close_iommu(iommu);
+		return -EFAULT;
+	}
+
+	iommu->cache = (iommu_domain_has_cap(iommu->domain,
+					     IOMMU_CAP_CACHE_COHERENCY) != 0);
+	iommu->mm = current->mm;
+
+	return 0;
+}
+
+/* Actively try to tear down the iommu and merged groups.  If there are no
+ * open iommu or device fds, we close the iommu.  If we close the iommu and
+ * there are also no open group fds, we can further dissolve the group-to-
+ * iommu association and free the iommu data structure. */
+static int __vfio_try_dissolve_iommu(struct vfio_iommu *iommu)
+{
+	if (__vfio_iommu_inuse(iommu))
+		return -EBUSY;
+
+	__vfio_close_iommu(iommu);
+
+	if (!__vfio_iommu_groups_inuse(iommu)) {
+		struct list_head *pos, *ppos;
+
+		list_for_each_safe(pos, ppos, &iommu->group_list) {
+			struct vfio_group *group;
+
+			group = list_entry(pos, struct vfio_group, iommu_next);
+			__vfio_group_set_iommu(group, NULL);
+		}
+
+		kfree(iommu);
+	}
+
+	return 0;
+}
+
+static struct vfio_device *__vfio_lookup_dev(struct device *dev)
+{
+	struct list_head *gpos;
+	unsigned int groupid;
+
+	if (iommu_device_group(dev, &groupid))
+		return NULL;
+
+	list_for_each(gpos, &vfio.group_list) {
+		struct vfio_group *group;
+		struct list_head *dpos;
+
+		group = list_entry(gpos, struct vfio_group, group_next);
+
+		if (group->groupid != groupid)
+			continue;
+
+		list_for_each(dpos, &group->device_list) {
+			struct vfio_device *device;
+
+			device = list_entry(dpos,
+					    struct vfio_device, device_next);
+
+			if (device->dev == dev)
+				return device;
+		}
+	}
+	return NULL;
+}
+
+/* All release paths simply decrement the refcnt, attempt to tear down
+ * the iommu and merged groups, and wake up anything that might be
+ * waiting if we successfully dissolve anything. */
+static int vfio_do_release(int *refcnt, struct vfio_iommu *iommu)
+{
+	bool wake;
+
+	mutex_lock(&vfio.lock);
+
+	(*refcnt)--;
+	wake = (__vfio_try_dissolve_iommu(iommu) == 0);
+
+	mutex_unlock(&vfio.lock);
+
+	if (wake)
+		wake_up(&vfio.release_q);
+
+	return 0;
+}
+
+/*
+ * Device fops - passthrough to vfio device driver w/ device_data
+ */
+static int vfio_device_release(struct inode *inode, struct file *filep)
+{
+	struct vfio_device *device = filep->private_data;
+
+	vfio_do_release(&device->refcnt, device->iommu);
+
+	device->ops->put(device->device_data);
+
+	return 0;
+}
+
+static long vfio_device_unl_ioctl(struct file *filep,
+				  unsigned int cmd, unsigned long arg)
+{
+	struct vfio_device *device = filep->private_data;
+
+	return device->ops->ioctl(device->device_data, cmd, arg);
+}
+
+static ssize_t vfio_device_read(struct file *filep, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	struct vfio_device *device = filep->private_data;
+
+	return device->ops->read(device->device_data, buf, count, ppos);
+}
+
+static ssize_t vfio_device_write(struct file *filep, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	struct vfio_device *device = filep->private_data;
+
+	return device->ops->write(device->device_data, buf, count, ppos);
+}
+
+static int vfio_device_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+	struct vfio_device *device = filep->private_data;
+
+	return device->ops->mmap(device->device_data, vma);
+}
+
+#ifdef CONFIG_COMPAT
+static long vfio_device_compat_ioctl(struct file *filep,
+				     unsigned int cmd, unsigned long arg)
+{
+	arg = (unsigned long)compat_ptr(arg);
+	return vfio_device_unl_ioctl(filep, cmd, arg);
+}
+#endif	/* CONFIG_COMPAT */
+
+const struct file_operations vfio_device_fops = {
+	.owner		= THIS_MODULE,
+	.release	= vfio_device_release,
+	.read		= vfio_device_read,
+	.write		= vfio_device_write,
+	.unlocked_ioctl	= vfio_device_unl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= vfio_device_compat_ioctl,
+#endif
+	.mmap		= vfio_device_mmap,
+};
+
+/*
+ * Group fops
+ */
+static int vfio_group_open(struct inode *inode, struct file *filep)
+{
+	struct vfio_group *group;
+	int ret = 0;
+
+	mutex_lock(&vfio.lock);
+
+	group = idr_find(&vfio.idr, iminor(inode));
+
+	if (!group) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	filep->private_data = group;
+
+	if (!group->iommu) {
+		struct vfio_iommu *iommu;
+
+		iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+		if (!iommu) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		INIT_LIST_HEAD(&iommu->group_list);
+		INIT_LIST_HEAD(&iommu->dm_list);
+		mutex_init(&iommu->dgate);
+		iommu->bus = group->bus;
+		__vfio_group_set_iommu(group, iommu);
+	}
+	group->refcnt++;
+
+out:
+	mutex_unlock(&vfio.lock);
+
+	return ret;
+}
+
+static int vfio_group_release(struct inode *inode, struct file *filep)
+{
+	struct vfio_group *group = filep->private_data;
+
+	return vfio_do_release(&group->refcnt, group->iommu);
+}
+
+/* Attempt to merge the group pointed to by fd into group.  The merge-ee
+ * group must not have its iommu opened or any device fds open because we
+ * cannot maintain that context across the merge.  The merge-er group can
+ * be in use. */
+static int vfio_group_merge(struct vfio_group *group, int fd)
+{
+	struct vfio_group *new;
+	struct vfio_iommu *old_iommu;
+	struct file *file;
+	int ret = 0;
+	bool opened = false;
+
+	mutex_lock(&vfio.lock);
+
+	file = fget(fd);
+	if (!file) {
+		ret = -EBADF;
+		goto out_noput;
+	}
+
+	/* Sanity check, is this really our fd? */
+	if (file->f_op != &vfio_group_fops) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	new = file->private_data;
+
+	if (!new || new == group || !new->iommu ||
+	    new->iommu->domain || new->bus != group->bus) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* We need to attach all the devices to each domain separately
+	 * in order to validate that the capabilities match for both.  */
+	ret = __vfio_open_iommu(new->iommu);
+	if (ret)
+		goto out;
+
+	if (!group->iommu->domain) {
+		ret = __vfio_open_iommu(group->iommu);
+		if (ret)
+			goto out;
+		opened = true;
+	}
+
+	/* If cache coherency doesn't match we'd potentially need to
+	 * remap existing iommu mappings in the merge-er domain.
+	 * Not worth the complexity to allow this currently. */
+	if (iommu_domain_has_cap(group->iommu->domain,
+				 IOMMU_CAP_CACHE_COHERENCY) !=
+	    iommu_domain_has_cap(new->iommu->domain,
+				 IOMMU_CAP_CACHE_COHERENCY)) {
+		__vfio_close_iommu(new->iommu);
+		if (opened)
+			__vfio_close_iommu(group->iommu);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Close the iommu for the merge-ee and attach all its devices
+	 * to the merge-er iommu. */
+	__vfio_close_iommu(new->iommu);
+
+	ret = __vfio_iommu_attach_group(group->iommu, new);
+	if (ret)
+		goto out;
+
+	/* set_iommu unlinks new from the iommu, so save a pointer to it */
+	old_iommu = new->iommu;
+	__vfio_group_set_iommu(new, group->iommu);
+	kfree(old_iommu);
+
+out:
+	fput(file);
+out_noput:
+	mutex_unlock(&vfio.lock);
+	return ret;
+}
+
+/* Unmerge the group pointed to by fd from group. */
+static int vfio_group_unmerge(struct vfio_group *group, int fd)
+{
+	struct vfio_group *new;
+	struct vfio_iommu *new_iommu;
+	struct file *file;
+	int ret = 0;
+
+	/* Since the merge-out group is already opened, it needs to
+	 * have an iommu struct associated with it. */
+	new_iommu = kzalloc(sizeof(*new_iommu), GFP_KERNEL);
+	if (!new_iommu)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&new_iommu->group_list);
+	INIT_LIST_HEAD(&new_iommu->dm_list);
+	mutex_init(&new_iommu->dgate);
+	new_iommu->bus = group->bus;
+
+	mutex_lock(&vfio.lock);
+
+	file = fget(fd);
+	if (!file) {
+		ret = -EBADF;
+		goto out_noput;
+	}
+
+	/* Sanity check, is this really our fd? */
+	if (file->f_op != &vfio_group_fops) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	new = file->private_data;
+	if (!new || new == group || new->iommu != group->iommu) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* We can't merge-out a group with devices still in use. */
+	if (__vfio_group_devs_inuse(new)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	__vfio_iommu_detach_group(group->iommu, new);
+	__vfio_group_set_iommu(new, new_iommu);
+
+out:
+	fput(file);
+out_noput:
+	if (ret)
+		kfree(new_iommu);
+	mutex_unlock(&vfio.lock);
+	return ret;
+}
+
+/* Get a new iommu file descriptor.  This will open the iommu, setting
+ * the current->mm ownership if it's not already set. */
+static int vfio_group_get_iommu_fd(struct vfio_group *group)
+{
+	int ret = 0;
+
+	mutex_lock(&vfio.lock);
+
+	if (!group->iommu->domain) {
+		ret = __vfio_open_iommu(group->iommu);
+		if (ret)
+			goto out;
+	}
+
+	ret = anon_inode_getfd("[vfio-iommu]", &vfio_iommu_fops,
+			       group->iommu, O_RDWR);
+	if (ret < 0)
+		goto out;
+
+	group->iommu->refcnt++;
+out:
+	mutex_unlock(&vfio.lock);
+	return ret;
+}
+
+/* Get a new device file descriptor.  This will open the iommu, setting
+ * the current->mm ownership if it's not already set.  It's difficult to
+ * specify the requirements for matching a user supplied buffer to a
+ * device, so we use a vfio driver callback to test for a match.  For
+ * PCI, dev_name(dev) is unique, but other drivers may require including
+ * a parent device string. */
+static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
+{
+	struct vfio_iommu *iommu = group->iommu;
+	struct list_head *gpos;
+	int ret = -ENODEV;
+
+	mutex_lock(&vfio.lock);
+
+	if (!iommu->domain) {
+		ret = __vfio_open_iommu(iommu);
+		if (ret)
+			goto out;
+	}
+
+	list_for_each(gpos, &iommu->group_list) {
+		struct list_head *dpos;
+
+		group = list_entry(gpos, struct vfio_group, iommu_next);
+
+		list_for_each(dpos, &group->device_list) {
+			struct vfio_device *device;
+
+			device = list_entry(dpos,
+					    struct vfio_device, device_next);
+
+			if (device->ops->match(device->dev, buf)) {
+				struct file *file;
+
+				if (device->ops->get(device->device_data)) {
+					ret = -EFAULT;
+					goto out;
+				}
+
+				/* We can't use anon_inode_getfd() like above
+				 * because we need to modify the f_mode flags
+				 * directly to allow more than just ioctls. */
+				ret = get_unused_fd();
+				if (ret < 0) {
+					device->ops->put(device->device_data);
+					goto out;
+				}
+
+				file = anon_inode_getfile("[vfio-device]",
+							  &vfio_device_fops,
+							  device, O_RDWR);
+				if (IS_ERR(file)) {
+					put_unused_fd(ret);
+					ret = PTR_ERR(file);
+					device->ops->put(device->device_data);
+					goto out;
+				}
+
+				/* Todo: add an anon_inode interface to do
+				 * this.  It appears to be missing due to lack
+				 * of need rather than being explicitly
+				 * prevented.  Now there's a need. */
+				file->f_mode |= (FMODE_LSEEK |
+						 FMODE_PREAD |
+						 FMODE_PWRITE);
+
+				fd_install(ret, file);
+
+				device->refcnt++;
+				goto out;
+			}
+		}
+	}
+out:
+	mutex_unlock(&vfio.lock);
+	return ret;
+}
+
+static long vfio_group_unl_ioctl(struct file *filep,
+				 unsigned int cmd, unsigned long arg)
+{
+	struct vfio_group *group = filep->private_data;
+
+	if (cmd == VFIO_GROUP_GET_FLAGS) {
+		u64 flags = 0;
+
+		mutex_lock(&vfio.lock);
+		if (__vfio_iommu_viable(group->iommu))
+			flags |= VFIO_GROUP_FLAGS_VIABLE;
+		mutex_unlock(&vfio.lock);
+
+		if (group->iommu->mm)
+			flags |= VFIO_GROUP_FLAGS_MM_LOCKED;
+
+		return put_user(flags, (u64 __user *)arg);
+	}
+
+	/* Below commands are restricted once the mm is set */
+	if (group->iommu->mm && group->iommu->mm != current->mm)
+		return -EPERM;
+
+	if (cmd == VFIO_GROUP_MERGE || cmd == VFIO_GROUP_UNMERGE) {
+		int fd;
+
+		if (get_user(fd, (int __user *)arg))
+			return -EFAULT;
+		if (fd < 0)
+			return -EINVAL;
+
+		if (cmd == VFIO_GROUP_MERGE)
+			return vfio_group_merge(group, fd);
+		else
+			return vfio_group_unmerge(group, fd);
+	} else if (cmd == VFIO_GROUP_GET_IOMMU_FD) {
+		return vfio_group_get_iommu_fd(group);
+	} else if (cmd == VFIO_GROUP_GET_DEVICE_FD) {
+		char *buf;
+		int ret;
+
+		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
+		if (IS_ERR(buf))
+			return PTR_ERR(buf);
+
+		ret = vfio_group_get_device_fd(group, buf);
+		kfree(buf);
+		return ret;
+	}
+
+	return -ENOSYS;
+}
+
+#ifdef CONFIG_COMPAT
+static long vfio_group_compat_ioctl(struct file *filep,
+				    unsigned int cmd, unsigned long arg)
+{
+	arg = (unsigned long)compat_ptr(arg);
+	return vfio_group_unl_ioctl(filep, cmd, arg);
+}
+#endif	/* CONFIG_COMPAT */
+
+static const struct file_operations vfio_group_fops = {
+	.owner		= THIS_MODULE,
+	.open		= vfio_group_open,
+	.release	= vfio_group_release,
+	.unlocked_ioctl	= vfio_group_unl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= vfio_group_compat_ioctl,
+#endif
+};
+
+/* iommu fd release hook */
+int vfio_release_iommu(struct vfio_iommu *iommu)
+{
+	return vfio_do_release(&iommu->refcnt, iommu);
+}
+
+/*
+ * VFIO driver API
+ */
+
+/* Add a new device to the vfio framework with associated vfio driver
+ * callbacks.  This is the entry point for vfio drivers to register devices. */
+int vfio_group_add_dev(struct device *dev, const struct vfio_device_ops *ops)
+{
+	struct list_head *pos;
+	struct vfio_group *group = NULL;
+	struct vfio_device *device = NULL;
+	unsigned int groupid;
+	int ret = 0;
+	bool new_group = false;
+
+	if (!ops)
+		return -EINVAL;
+
+	if (iommu_device_group(dev, &groupid))
+		return -ENODEV;
+
+	mutex_lock(&vfio.lock);
+
+	list_for_each(pos, &vfio.group_list) {
+		group = list_entry(pos, struct vfio_group, group_next);
+		if (group->groupid == groupid)
+			break;
+		group = NULL;
+	}
+
+	if (!group) {
+		int minor;
+
+		if (unlikely(idr_pre_get(&vfio.idr, GFP_KERNEL) == 0)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		group = kzalloc(sizeof(*group), GFP_KERNEL);
+		if (!group) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		group->groupid = groupid;
+		INIT_LIST_HEAD(&group->device_list);
+
+		ret = idr_get_new(&vfio.idr, group, &minor);
+		if (ret == 0 && minor > MINORMASK) {
+			idr_remove(&vfio.idr, minor);
+			ret = -ENOSPC;
+		}
+		if (ret) {
+			kfree(group);
+			goto out;
+		}
+
+		group->devt = MKDEV(MAJOR(vfio.devt), minor);
+		device_create(vfio.class, NULL, group->devt,
+			      group, "%u", groupid);
+
+		group->bus = dev->bus;
+		list_add(&group->group_next, &vfio.group_list);
+		new_group = true;
+	} else {
+		if (group->bus != dev->bus) {
+			printk(KERN_WARNING
+			       "Error: IOMMU group ID conflict.  Group ID %u "
+				"on both bus %s and %s\n", groupid,
+				group->bus->name, dev->bus->name);
+			ret = -EFAULT;
+			goto out;
+		}
+
+		list_for_each(pos, &group->device_list) {
+			device = list_entry(pos,
+					    struct vfio_device, device_next);
+			if (device->dev == dev)
+				break;
+			device = NULL;
+		}
+	}
+
+	if (!device) {
+		if (__vfio_group_devs_inuse(group) ||
+		    (group->iommu && group->iommu->refcnt)) {
+			printk(KERN_WARNING
+			       "Adding device %s to group %u while the group is already in use!\n",
+			       dev_name(dev), group->groupid);
+			/* XXX How to prevent other drivers from claiming? */
+		}
+
+		device = kzalloc(sizeof(*device), GFP_KERNEL);
+		if (!device) {
+			/* If we just created this group, tear it down */
+			if (new_group) {
+				list_del(&group->group_next);
+				device_destroy(vfio.class, group->devt);
+				idr_remove(&vfio.idr, MINOR(group->devt));
+				kfree(group);
+			}
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		list_add(&device->device_next, &group->device_list);
+		device->dev = dev;
+		device->ops = ops;
+		device->iommu = group->iommu; /* NULL if new */
+		__vfio_iommu_attach_dev(group->iommu, device);
+	}
+out:
+	mutex_unlock(&vfio.lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_group_add_dev);
+
+/* Remove a device from the vfio framework */
+void vfio_group_del_dev(struct device *dev)
+{
+	struct list_head *pos;
+	struct vfio_group *group = NULL;
+	struct vfio_device *device = NULL;
+	unsigned int groupid;
+
+	if (iommu_device_group(dev, &groupid))
+		return;
+
+	mutex_lock(&vfio.lock);
+
+	list_for_each(pos, &vfio.group_list) {
+		group = list_entry(pos, struct vfio_group, group_next);
+		if (group->groupid == groupid)
+			break;
+		group = NULL;
+	}
+
+	if (!group)
+		goto out;
+
+	list_for_each(pos, &group->device_list) {
+		device = list_entry(pos, struct vfio_device, device_next);
+		if (device->dev == dev)
+			break;
+		device = NULL;
+	}
+
+	if (!device)
+		goto out;
+
+	BUG_ON(device->refcnt);
+
+	if (device->attached)
+		__vfio_iommu_detach_dev(group->iommu, device);
+
+	list_del(&device->device_next);
+	kfree(device);
+
+	/* If this was the only device in the group, remove the group.
+	 * Note that we intentionally unmerge empty groups here if the
+	 * group fd isn't opened. */
+	if (list_empty(&group->device_list) && group->refcnt == 0) {
+		struct vfio_iommu *iommu = group->iommu;
+
+		if (iommu) {
+			__vfio_group_set_iommu(group, NULL);
+			__vfio_try_dissolve_iommu(iommu);
+		}
+
+		device_destroy(vfio.class, group->devt);
+		idr_remove(&vfio.idr, MINOR(group->devt));
+		list_del(&group->group_next);
+		kfree(group);
+	}
+out:
+	mutex_unlock(&vfio.lock);
+}
+EXPORT_SYMBOL_GPL(vfio_group_del_dev);
+
+/* When a device is bound to a vfio device driver (e.g. vfio-pci), this
+ * entry point is used to mark the device usable (viable).  The vfio
+ * device driver associates a private device_data struct with the device
+ * here, which is later passed back to the vfio_device_fops callbacks. */
+int vfio_bind_dev(struct device *dev, void *device_data)
+{
+	struct vfio_device *device;
+	int ret = -EINVAL;
+
+	BUG_ON(!device_data);
+
+	mutex_lock(&vfio.lock);
+
+	device = __vfio_lookup_dev(dev);
+
+	BUG_ON(!device);
+
+	ret = dev_set_drvdata(dev, device);
+	if (!ret)
+		device->device_data = device_data;
+
+	mutex_unlock(&vfio.lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_bind_dev);
+
+/* A device is only removable if the iommu for the group is not in use. */
+static bool vfio_device_removeable(struct vfio_device *device)
+{
+	bool ret = true;
+
+	mutex_lock(&vfio.lock);
+
+	if (device->iommu && __vfio_iommu_inuse(device->iommu))
+		ret = false;
+
+	mutex_unlock(&vfio.lock);
+	return ret;
+}
+
+/* Notify vfio that a device is being unbound from the vfio device driver
+ * and return the device private device_data pointer.  If the group is
+ * in use, we need to block or take other measures to make it safe for
+ * the device to be removed from the iommu. */
+void *vfio_unbind_dev(struct device *dev)
+{
+	struct vfio_device *device = dev_get_drvdata(dev);
+	void *device_data;
+
+	BUG_ON(!device);
+
+again:
+	if (!vfio_device_removeable(device)) {
+		/* XXX signal for all devices in group to be removed or
+		 * resort to killing the process holding the device fds.
+		 * For now just block waiting for releases to wake us. */
+		wait_event(vfio.release_q, vfio_device_removeable(device));
+	}
+
+	mutex_lock(&vfio.lock);
+
+	/* Need to re-check that the device is still removable under lock. */
+	if (device->iommu && __vfio_iommu_inuse(device->iommu)) {
+		mutex_unlock(&vfio.lock);
+		goto again;
+	}
+
+	device_data = device->device_data;
+
+	device->device_data = NULL;
+	dev_set_drvdata(dev, NULL);
+
+	mutex_unlock(&vfio.lock);
+	return device_data;
+}
+EXPORT_SYMBOL_GPL(vfio_unbind_dev);
+
+/*
+ * Module/class support
+ */
+static void vfio_class_release(struct kref *kref)
+{
+	class_destroy(vfio.class);
+	vfio.class = NULL;
+}
+
+static char *vfio_devnode(struct device *dev, mode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
+}
+
+static int __init vfio_init(void)
+{
+	int ret;
+
+	idr_init(&vfio.idr);
+	mutex_init(&vfio.lock);
+	INIT_LIST_HEAD(&vfio.group_list);
+	init_waitqueue_head(&vfio.release_q);
+
+	kref_init(&vfio.kref);
+	vfio.class = class_create(THIS_MODULE, "vfio");
+	if (IS_ERR(vfio.class)) {
+		ret = PTR_ERR(vfio.class);
+		goto err_class;
+	}
+
+	vfio.class->devnode = vfio_devnode;
+
+	/* FIXME - how many minors to allocate... all of them! */
+	ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
+	if (ret)
+		goto err_chrdev;
+
+	cdev_init(&vfio.cdev, &vfio_group_fops);
+	ret = cdev_add(&vfio.cdev, vfio.devt, MINORMASK);
+	if (ret)
+		goto err_cdev;
+
+	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
+
+	return 0;
+
+err_cdev:
+	unregister_chrdev_region(vfio.devt, MINORMASK);
+err_chrdev:
+	kref_put(&vfio.kref, vfio_class_release);
+err_class:
+	return ret;
+}
+
+static void __exit vfio_cleanup(void)
+{
+	struct list_head *gpos, *gppos;
+
+	list_for_each_safe(gpos, gppos, &vfio.group_list) {
+		struct vfio_group *group;
+		struct list_head *dpos, *dppos;
+
+		group = list_entry(gpos, struct vfio_group, group_next);
+
+		list_for_each_safe(dpos, dppos, &group->device_list) {
+			struct vfio_device *device;
+
+			device = list_entry(dpos,
+					    struct vfio_device, device_next);
+			vfio_group_del_dev(device->dev);
+		}
+	}
+
+	idr_destroy(&vfio.idr);
+	cdev_del(&vfio.cdev);
+	unregister_chrdev_region(vfio.devt, MINORMASK);
+	kref_put(&vfio.kref, vfio_class_release);
+}
+
+module_init(vfio_init);
+module_exit(vfio_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
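
For illustration, a rough sketch of how a vfio device driver might consume
the driver API above (the vfio_widget names are hypothetical and not part of
this patch; the vfio_* calls are the exported entry points from vfio_main.c):

#include <linux/device.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/vfio.h>

struct vfio_widget {
	struct device *dev;
	/* register mappings, interrupt state, etc. */
};

/* Match the string passed to VFIO_GROUP_GET_DEVICE_FD against a device. */
static bool vfio_widget_match(struct device *dev, char *buf)
{
	return !strcmp(dev_name(dev), buf);
}

static const struct vfio_device_ops vfio_widget_ops = {
	.match	= vfio_widget_match,
	/* .get/.put/.read/.write/.ioctl/.mmap: device-specific handlers */
};

static int vfio_widget_probe(struct device *dev)
{
	struct vfio_widget *widget;
	int ret;

	/* Make the device known to vfio; creates the iommu group if needed. */
	ret = vfio_group_add_dev(dev, &vfio_widget_ops);
	if (ret)
		return ret;

	widget = kzalloc(sizeof(*widget), GFP_KERNEL);
	if (!widget) {
		vfio_group_del_dev(dev);
		return -ENOMEM;
	}
	widget->dev = dev;

	/* Mark the device viable; device_data comes back in the fops above. */
	ret = vfio_bind_dev(dev, widget);
	if (ret) {
		vfio_group_del_dev(dev);
		kfree(widget);
	}
	return ret;
}

static void vfio_widget_remove(struct device *dev)
{
	/* May block until all iommu/device fds for the group are released. */
	struct vfio_widget *widget = vfio_unbind_dev(dev);

	vfio_group_del_dev(dev);
	kfree(widget);
}
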
diff --git a/drivers/vfio/vfio_private.h b/drivers/vfio/vfio_private.h
new file mode 100644
index 0000000..350ad67
--- /dev/null
+++ b/drivers/vfio/vfio_private.h
@@ -0,0 +1,34 @@ 
+/*
+ * Copyright (C) 2011 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio:
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ */
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+
+#ifndef VFIO_PRIVATE_H
+#define VFIO_PRIVATE_H
+
+struct vfio_iommu {
+	struct iommu_domain		*domain;
+	struct bus_type			*bus;
+	struct mutex			dgate;
+	struct list_head		dm_list;
+	struct mm_struct		*mm;
+	struct list_head		group_list;
+	int				refcnt;
+	bool				cache;
+};
+
+extern int vfio_release_iommu(struct vfio_iommu *iommu);
+extern void vfio_iommu_unmapall(struct vfio_iommu *iommu);
+
+#endif /* VFIO_PRIVATE_H */
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
new file mode 100644
index 0000000..4269b08
--- /dev/null
+++ b/include/linux/vfio.h
@@ -0,0 +1,155 @@ 
+/*
+ * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
+ * Author: Tom Lyon, pugs@cisco.com
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Portions derived from drivers/uio/uio.c:
+ * Copyright(C) 2005, Benedikt Spranger <b.spranger@linutronix.de>
+ * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2006, Hans J. Koch <hjk@linutronix.de>
+ * Copyright(C) 2006, Greg Kroah-Hartman <greg@kroah.com>
+ *
+ * Portions derived from drivers/uio/uio_pci_generic.c:
+ * Copyright (C) 2009 Red Hat, Inc.
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ */
+#include <linux/types.h>
+
+#ifndef VFIO_H
+#define VFIO_H
+
+#ifdef __KERNEL__
+
+struct vfio_device_ops {
+	bool			(*match)(struct device *, char *);
+	int			(*get)(void *);
+	void			(*put)(void *);
+	ssize_t			(*read)(void *, char __user *,
+					size_t, loff_t *);
+	ssize_t			(*write)(void *, const char __user *,
+					 size_t, loff_t *);
+	long			(*ioctl)(void *, unsigned int, unsigned long);
+	int			(*mmap)(void *, struct vm_area_struct *);
+};
+
+extern int vfio_group_add_dev(struct device *device,
+			      const struct vfio_device_ops *ops);
+extern void vfio_group_del_dev(struct device *device);
+extern int vfio_bind_dev(struct device *device, void *device_data);
+extern void *vfio_unbind_dev(struct device *device);
+
+#endif /* __KERNEL__ */
+
+/*
+ * VFIO driver - allow mapping and use of certain devices
+ * in unprivileged user processes. (If IOMMU is present)
+ * Especially useful for Virtual Function parts of SR-IOV devices
+ */
+
+
+/* Kernel & User level defines for ioctls */
+
+#define VFIO_GROUP_GET_FLAGS		_IOR(';', 100, __u64)
+ #define VFIO_GROUP_FLAGS_VIABLE	(1 << 0)
+ #define VFIO_GROUP_FLAGS_MM_LOCKED	(1 << 1)
+#define VFIO_GROUP_MERGE		_IOW(';', 101, int)
+#define VFIO_GROUP_UNMERGE		_IOW(';', 102, int)
+#define VFIO_GROUP_GET_IOMMU_FD		_IO(';', 103)
+#define VFIO_GROUP_GET_DEVICE_FD	_IOW(';', 104, char *)
+
+/*
+ * Structure for DMA mapping of user buffers
+ * vaddr, dmaaddr, and size must all be page aligned
+ */
+struct vfio_dma_map {
+	__u64	len;		/* length of structure */
+	__u64	vaddr;		/* process virtual addr */
+	__u64	dmaaddr;	/* desired and/or returned dma address */
+	__u64	size;		/* size in bytes */
+	__u64	flags;
+#define	VFIO_DMA_MAP_FLAG_WRITE		(1 << 0) /* req writeable DMA mem */
+};
+
+#define	VFIO_IOMMU_GET_FLAGS		_IOR(';', 105, __u64)
+ /* Does the IOMMU support mapping any IOVA to any virtual address? */
+ #define VFIO_IOMMU_FLAGS_MAP_ANY	(1 << 0)
+#define	VFIO_IOMMU_MAP_DMA		_IOWR(';', 106, struct vfio_dma_map)
+#define	VFIO_IOMMU_UNMAP_DMA		_IOWR(';', 107, struct vfio_dma_map)
+
+#define VFIO_DEVICE_GET_FLAGS		_IOR(';', 108, __u64)
+ #define VFIO_DEVICE_FLAGS_PCI		(1 << 0)
+ #define VFIO_DEVICE_FLAGS_DT		(1 << 1)
+ #define VFIO_DEVICE_FLAGS_RESET	(1 << 2)
+#define VFIO_DEVICE_GET_NUM_REGIONS	_IOR(';', 109, int)
+
+struct vfio_region_info {
+	__u32	len;		/* length of structure */
+	__u32	index;		/* region number */
+	__u64	size;		/* size in bytes of region */
+	__u64	offset;		/* start offset of region */
+	__u64	flags;
+#define VFIO_REGION_INFO_FLAG_MMAP		(1 << 0)
+#define VFIO_REGION_INFO_FLAG_RO		(1 << 1)
+#define VFIO_REGION_INFO_FLAG_PHYS_VALID	(1 << 2)
+	__u64	phys;		/* physical address of region */
+};
+
+#define VFIO_DEVICE_GET_REGION_INFO	_IOWR(';', 110, struct vfio_region_info)
+
+#define VFIO_DEVICE_GET_NUM_IRQS	_IOR(';', 111, int)
+
+struct vfio_irq_info {
+	__u32	len;		/* length of structure */
+	__u32	index;		/* IRQ number */
+	__u32	count;		/* number of individual IRQs */
+	__u32	flags;
+#define VFIO_IRQ_INFO_FLAG_LEVEL		(1 << 0)
+};
+
+#define VFIO_DEVICE_GET_IRQ_INFO	_IOWR(';', 112, struct vfio_irq_info)
+
+/* Set IRQ eventfds, arg[0] = index, arg[1] = count, arg[2-n] = eventfds */
+#define VFIO_DEVICE_SET_IRQ_EVENTFDS	_IOW(';', 113, int)
+
+/* Unmask IRQ index, arg[0] = index */
+#define VFIO_DEVICE_UNMASK_IRQ		_IOW(';', 114, int)
+
+/* Set unmask eventfd, arg[0] = index, arg[1] = eventfd */
+#define VFIO_DEVICE_SET_UNMASK_IRQ_EVENTFD	_IOW(';', 115, int)
+
+#define VFIO_DEVICE_RESET		_IO(';', 116)
+
+struct vfio_dtpath {
+	__u32	len;		/* length of structure */
+	__u32	index;
+	__u64	flags;
+#define VFIO_DTPATH_FLAGS_REGION	(1 << 0)
+#define VFIO_DTPATH_FLAGS_IRQ		(1 << 1)
+	char	*path;
+};
+#define VFIO_DEVICE_GET_DTPATH		_IOWR(';', 117, struct vfio_dtpath)
+
+struct vfio_dtindex {
+	__u32	len;		/* length of structure */
+	__u32	index;
+	__u32	prop_type;
+	__u32	prop_index;
+	__u64	flags;
+#define VFIO_DTINDEX_FLAGS_REGION	(1 << 0)
+#define VFIO_DTINDEX_FLAGS_IRQ		(1 << 1)
+};
+#define VFIO_DEVICE_GET_DTINDEX		_IOWR(';', 118, struct vfio_dtindex)
+
+#endif /* VFIO_H */
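
For reference, a rough userspace sketch of the group/iommu/device fd flow
defined by the ioctls above (the group number, device name string, and sizes
are illustrative; error handling is omitted):

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/vfio.h>

int main(void)
{
	int group, iommu, device;
	uint64_t flags;
	struct vfio_dma_map map = { .len = sizeof(map) };
	struct vfio_region_info reg = { .len = sizeof(reg), .index = 0 };

	/* The group number comes from the iommu group the device belongs to. */
	group = open("/dev/vfio/26", O_RDWR);

	ioctl(group, VFIO_GROUP_GET_FLAGS, &flags);
	if (!(flags & VFIO_GROUP_FLAGS_VIABLE))
		return 1;	/* some device in the group isn't bound to vfio */

	/* Opening the iommu fd ties the group to this mm. */
	iommu = ioctl(group, VFIO_GROUP_GET_IOMMU_FD);

	/* Map 1MB of anonymous memory at iova 0 for device DMA. */
	map.size = 1024 * 1024;
	map.vaddr = (uintptr_t)mmap(NULL, map.size, PROT_READ | PROT_WRITE,
				    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	map.dmaaddr = 0;
	map.flags = VFIO_DMA_MAP_FLAG_WRITE;
	ioctl(iommu, VFIO_IOMMU_MAP_DMA, &map);

	/* The string is matched by the vfio driver, e.g. a PCI address. */
	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");

	/* Discover region 0 and access it via pread/pwrite or mmap. */
	ioctl(device, VFIO_DEVICE_GET_REGION_INFO, &reg);

	close(device);
	close(iommu);
	close(group);
	return 0;
}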