Patchwork [1/2] Isolation groups

login
register
mail settings
Submitter Alex Williamson
Date March 12, 2012, 10:32 p.m.
Message ID <20120312223254.30321.74405.stgit@bling.home>
Download mbox | patch
Permalink /patch/146274/
State New
Headers show

Comments

Alex Williamson - March 12, 2012, 10:32 p.m.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 drivers/base/Kconfig      |   10 +
 drivers/base/Makefile     |    1 
 drivers/base/base.h       |    5 
 drivers/base/isolation.c  |  798 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/device.h    |    4 
 include/linux/isolation.h |  138 ++++++++
 6 files changed, 956 insertions(+), 0 deletions(-)
 create mode 100644 drivers/base/isolation.c
 create mode 100644 include/linux/isolation.h
David Gibson - March 13, 2012, 2:33 p.m.
On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote:
> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> 
>  drivers/base/Kconfig      |   10 +
>  drivers/base/Makefile     |    1 
>  drivers/base/base.h       |    5 
>  drivers/base/isolation.c  |  798 +++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/device.h    |    4 
>  include/linux/isolation.h |  138 ++++++++
>  6 files changed, 956 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/base/isolation.c
>  create mode 100644 include/linux/isolation.h
> 
> diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
> index 7be9f79..e98a5f3 100644
> --- a/drivers/base/Kconfig
> +++ b/drivers/base/Kconfig
> @@ -189,4 +189,14 @@ config DMA_SHARED_BUFFER
>  	  APIs extension; the file's descriptor can then be passed on to other
>  	  driver.
>  
> +config ISOLATION_GROUPS
> +	bool "Enable Isolation Group API"
> +	default n
> +	depends on EXPERIMENTAL && IOMMU_API
> +	help
> +	  This option enables grouping of devices into Isolation Groups
> +	  which may be used by other subsystems to perform quirks across
> +	  sets of devices as well as userspace drivers for guaranteeing
> +	  devices are isolated from the rest of the system.
> +
>  endmenu
> diff --git a/drivers/base/Makefile b/drivers/base/Makefile
> index 610f999..047b5f9 100644
> --- a/drivers/base/Makefile
> +++ b/drivers/base/Makefile
> @@ -19,6 +19,7 @@ obj-$(CONFIG_MODULES)	+= module.o
>  endif
>  obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o
>  obj-$(CONFIG_REGMAP)	+= regmap/
> +obj-$(CONFIG_ISOLATION_GROUPS)	+= isolation.o
>  
>  ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
>  
> diff --git a/drivers/base/base.h b/drivers/base/base.h
> index b858dfd..376758a 100644
> --- a/drivers/base/base.h
> +++ b/drivers/base/base.h
> @@ -1,4 +1,5 @@
>  #include <linux/notifier.h>
> +#include <linux/isolation.h>
>  
>  /**
>   * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure.
> @@ -108,6 +109,10 @@ extern int driver_probe_device(struct device_driver *drv, struct device *dev);
>  static inline int driver_match_device(struct device_driver *drv,
>  				      struct device *dev)
>  {
> +	if (isolation_group_managed(to_isolation_group(dev)) &&
> +	    !isolation_group_manager_driver(to_isolation_group(dev), drv))
> +		return 0;
> +
>  	return drv->bus->match ? drv->bus->match(dev, drv) : 1;
>  }
>  
> diff --git a/drivers/base/isolation.c b/drivers/base/isolation.c
> new file mode 100644
> index 0000000..c01365c
> --- /dev/null
> +++ b/drivers/base/isolation.c
> @@ -0,0 +1,798 @@
> +/*
> + * Isolation group interface
> + *
> + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
> + * Author: Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + */
> +
> +#include <linux/device.h>
> +#include <linux/iommu.h>
> +#include <linux/isolation.h>
> +#include <linux/list.h>
> +#include <linux/mutex.h>
> +#include <linux/notifier.h>
> +#include <linux/slab.h>
> +#include <linux/uuid.h>
> +
> +static struct kset *isolation_kset;
> +/* XXX add more complete locking, maybe rcu */
> +static DEFINE_MUTEX(isolation_lock);
> +static LIST_HEAD(isolation_groups);
> +static LIST_HEAD(isolation_notifiers);
> +
> +/* Keep these private */
> +struct isolation_manager_driver {
> +	struct device_driver *drv;
> +	struct list_head list;
> +};
> +
> +struct isolation_manager {
> +	struct list_head drivers;
> +	/* Likely need managers to register some callbacks */
> +};
> +
> +struct isolation_group {
> +	struct list_head list;
> +	struct list_head devices;
> +	struct kobject kobj;
> +	struct kobject *devices_kobj;
> +	struct iommu_domain *iommu_domain;
> +	struct iommu_ops *iommu_ops;
> +	struct isolation_manager *manager;
> +	uuid_le uuid;
> +};
> +
> +struct isolation_device {
> +	struct device *dev;
> +	struct list_head list;
> +};
> +
> +struct isolation_notifier {
> +	struct bus_type *bus;
> +	struct notifier_block nb;
> +	struct blocking_notifier_head notifier;
> +	struct list_head list;
> +	int refcnt;
> +};
> +
> +struct iso_attribute {
> +	struct attribute attr;
> +	ssize_t (*show)(struct isolation_group *group, char *buf);
> +	ssize_t (*store)(struct isolation_group *group,
> +			 const char *buf, size_t count);
> +};
> +
> +#define to_iso_attr(_attr) container_of(_attr, struct iso_attribute, attr)
> +#define to_iso_group(_kobj) container_of(_kobj, struct isolation_group, kobj)
> +
> +static ssize_t iso_attr_show(struct kobject *kobj, struct attribute *attr,
> +			     char *buf)
> +{
> +	struct iso_attribute *iso_attr = to_iso_attr(attr);
> +	struct isolation_group *group = to_iso_group(kobj);
> +	ssize_t ret = -EIO;
> +
> +	if (iso_attr->show)
> +		ret = iso_attr->show(group, buf);
> +	return ret;
> +}
> +
> +static ssize_t iso_attr_store(struct kobject *kobj, struct attribute *attr,
> +			      const char *buf, size_t count)
> +{
> +	struct iso_attribute *iso_attr = to_iso_attr(attr);
> +	struct isolation_group *group = to_iso_group(kobj);
> +	ssize_t ret = -EIO;
> +
> +	if (iso_attr->store)
> +		ret = iso_attr->store(group, buf, count);
> +	return ret;
> +}
> +
> +static void iso_release(struct kobject *kobj)
> +{
> +	struct isolation_group *group = to_iso_group(kobj);
> +	kfree(group);
> +}
> +
> +static const struct sysfs_ops iso_sysfs_ops = {
> +	.show = iso_attr_show,
> +	.store = iso_attr_store,
> +};
> +
> +static struct kobj_type iso_ktype = {
> +	.sysfs_ops = &iso_sysfs_ops,
> +	.release = iso_release,
> +};
> +
> +/*
> + * Create an isolation group.  Isolation group "providers" register a
> + * notifier for their bus(es) and call this to create a new isolation
> + * group.
> + */
> +struct isolation_group *isolation_create_group(void)
> +{
> +	struct isolation_group *group, *tmp;
> +	int ret;
> +
> +	if (unlikely(!isolation_kset))
> +		return ERR_PTR(-ENODEV);
> +
> +	group = kzalloc(sizeof(*group), GFP_KERNEL);
> +	if (!group)
> +		return ERR_PTR(-ENOMEM);
> +
> +	mutex_lock(&isolation_lock);
> +
> +	/*
> +	 * Use a UUID for group identification simply because it seems wrong
> +	 * to expose it as a kernel pointer (%p).  Neither is user friendly.
> +	 * Mostly only expect userspace to do anything with this.
> +	 */

So, my plan was to require the isolation provider to give a unique
name - it can construct something that's actually meaningful (and with
luck, stable across boots).  For Power we'd use the PE number, for
VT-d, I was thinking the PCI device address would do it (of the "base"
device for the non-separable bridge and broken multifunction cases).

> +new_uuid:
> +	uuid_le_gen(&group->uuid);
> +
> +	/* Unlikely, but nothing prevents duplicates */
> +	list_for_each_entry(tmp, &isolation_groups, list)
> +		if (memcmp(&group->uuid, &tmp->uuid, sizeof(group->uuid)) == 0)
> +			goto new_uuid;
> +
> +	INIT_LIST_HEAD(&group->devices);
> +	group->kobj.kset = isolation_kset;
> +
> +	ret = kobject_init_and_add(&group->kobj, &iso_ktype, NULL,
> +				   "%pUl", &group->uuid);

Um.. isn't this setting the kobject name to the address of the uuid
plus "Ul", not the content of the uuid?

> +	if (ret) {
> +		kfree(group);
> +		mutex_unlock(&isolation_lock);
> +		return ERR_PTR(ret);
> +	}
> +
> +	/* Add a subdirectory to save links for devices withing the group. */
> +	group->devices_kobj = kobject_create_and_add("devices", &group->kobj);
> +	if (!group->devices_kobj) {
> +		kobject_put(&group->kobj);
> +		mutex_unlock(&isolation_lock);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	list_add(&group->list, &isolation_groups);
> +
> +	mutex_unlock(&isolation_lock);
> +
> +	return group;
> +}
> +
> +/*
> + * Free isolation group.  XXX need to cleanup/reject based on manager status.
> + */
> +int isolation_free_group(struct isolation_group *group)
> +{
> +	mutex_lock(&isolation_lock);
> +
> +	if (!list_empty(&group->devices)) {
> +		mutex_unlock(&isolation_lock);
> +		return -EBUSY;
> +	}
> +
> +	list_del(&group->list);
> +	kobject_put(group->devices_kobj);
> +	kobject_put(&group->kobj);
> +
> +	mutex_unlock(&isolation_lock);
> +	return 0;
> +}
> +
> +/*
> + * Add a device to an isolation group.  Isolation groups start empty and
> + * must be told about the devices they contain.  Expect this to be called
> + * from isolation group providers via notifier.
> + */

Doesn't necessarily have to be from a notifier, particularly if the
provider is integrated into host bridge code.

> +int isolation_group_add_dev(struct isolation_group *group, struct device *dev)
> +{
> +	struct isolation_device *device;
> +	int ret = 0;
> +
> +	mutex_lock(&isolation_lock);
> +
> +	if (dev->isolation_group) {
> +		ret = -EBUSY;
> +		goto out;

This should probably be a BUG_ON() - the isolation provider shouldn't
be putting the same device into two different groups.

> +	}
> +
> +	device = kzalloc(sizeof(*device), GFP_KERNEL);
> +	if (!device) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	device->dev = dev;
> +
> +	/* Cross link the device in sysfs */
> +	ret = sysfs_create_link(&dev->kobj, &group->kobj,
> +				"isolation_group");
> +	if (ret) {
> +		kfree(device);
> +		goto out;
> +	}
> +	
> +	ret = sysfs_create_link(group->devices_kobj, &dev->kobj,
> +				kobject_name(&dev->kobj));

So, a problem both here and in my version is what to name the device
links.  Because they could be potentially be quite widely scattered,
I'm not sure that kobject_name() is guaranteed to be sufficiently
unique.

> +	if (ret) {
> +		sysfs_remove_link(&dev->kobj, "isolation_group");
> +		kfree(device);
> +		goto out;
> +	}
> +
> +	list_add(&device->list, &group->devices);
> +
> +	dev->isolation_group = group;
> +
> +	if (group->iommu_domain) {
> +		ret = group->iommu_ops->attach_dev(group->iommu_domain, dev);
> +		if (ret) {
> +			/* XXX fail device? */
> +		}

So, I presume the idea is that this is a stop-gap until iommu_ops is
converted to be in terms of groups rather than indivdual devices?

> +	}
> +
> +	/* XXX signal manager? */

Uh, what did you have in mind here?

> +out:
> +	mutex_unlock(&isolation_lock);
> +
> +	return 0;
> +}
> +
> +/*
> + * Remove a device from an isolation group, likely because the device
> + * went away.
> + */
> +int isolation_group_del_dev(struct device *dev)
> +{
> +	struct isolation_group *group = dev->isolation_group;
> +	struct isolation_device *device;
> +
> +	if (!group)
> +		return -ENODEV;
> +
> +	mutex_lock(&isolation_lock);
> +
> +	list_for_each_entry(device, &group->devices, list) {
> +		if (device->dev == dev) {
> +			/* XXX signal manager? */
> +
> +			if (group->iommu_domain)
> +				group->iommu_ops->detach_dev(
> +					group->iommu_domain, dev);
> +			list_del(&device->list);
> +			kfree(device);
> +			dev->isolation_group = NULL;
> +			sysfs_remove_link(group->devices_kobj,
> +					  kobject_name(&dev->kobj));
> +			sysfs_remove_link(&dev->kobj, "isolation_group");
> +			break;
> +		}
> +	}
> +			
> +	mutex_unlock(&isolation_lock);
> +
> +	return 0;
> +}
> +
> +/*
> + * Filter and call out to our own notifier chains
> + */

So, I haven't quite worked out what this isolation notifier
infrastructure gives you, as opposed to having isolation providers
directly register bus notifiers.

> +static int isolation_bus_notify(struct notifier_block *nb,
> +				unsigned long action, void *data)
> +{
> +	struct isolation_notifier *notifier =
> +		container_of(nb, struct isolation_notifier, nb);
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		if (!dev->isolation_group)
> +			blocking_notifier_call_chain(&notifier->notifier,
> +					ISOLATION_NOTIFY_ADD_DEVICE, dev);
> +		break;
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		if (dev->isolation_group)
> +			blocking_notifier_call_chain(&notifier->notifier,
> +					ISOLATION_NOTIFY_DEL_DEVICE, dev);
> +		break;
> +	}
> +
> +	return NOTIFY_DONE;
> +}
> +
> +/*
> + * Wrapper for manual playback of BUS_NOTIFY_ADD_DEVICE when we add
> + * a new notifier.
> + */
> +static int isolation_do_add_notify(struct device *dev, void *data)
> +{
> +	struct notifier_block *nb = data;
> +
> +	if (!dev->isolation_group)
> +		nb->notifier_call(nb, ISOLATION_NOTIFY_ADD_DEVICE, dev);
> +
> +	return 0;
> +}
> +
> +/*
> + * Register a notifier.  This is for isolation group providers.  It's
> + * possible we could have multiple isolation group providers for the same
> + * bus type,

So the bit above doesn't seem to connect to the bit below.  We can
indeed have multiple isolation providers for one bus type, but the
below seems to be covering the (also possible, but different) case of
one isolation provider covering multiple bus types.

> so we create a struct isolation_notifier per bus_type, each
> + * with a notifier block.  Providers are therefore welcome to register
> + * as many buses as they can handle.
> + */
> +int isolation_register_notifier(struct bus_type *bus, struct notifier_block *nb)
> +{
> +	struct isolation_notifier *notifier;
> +	bool found = false;
> +	int ret;
> +
> +	mutex_lock(&isolation_lock);
> +	list_for_each_entry(notifier, &isolation_notifiers, list) {
> +		if (notifier->bus == bus) {
> +			found = true;
> +			break;
> +		}
> +	}
> +
> +	if (!found) {
> +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> +		if (!notifier) {
> +			mutex_unlock(&isolation_lock);
> +			return -ENOMEM;	
> +		}
> +		notifier->bus = bus;
> +		notifier->nb.notifier_call = isolation_bus_notify;
> +		BLOCKING_INIT_NOTIFIER_HEAD(&notifier->notifier);
> +		bus_register_notifier(bus, &notifier->nb);
> +		list_add(&notifier->list, &isolation_notifiers);
> +	}
> +
> +	ret = blocking_notifier_chain_register(&notifier->notifier, nb);
> +	if (ret) {
> +		if (notifier->refcnt == 0) {
> +			bus_unregister_notifier(bus, &notifier->nb);
> +			list_del(&notifier->list);
> +			kfree(notifier);
> +		}
> +		mutex_unlock(&isolation_lock);
> +		return ret;
> +	}
> +	notifier->refcnt++;
> +
> +	mutex_unlock(&isolation_lock);
> +
> +	bus_for_each_dev(bus, NULL, nb, isolation_do_add_notify);
> +
> +	return 0;
> +}

So, somewhere, I think we need a fallback path, but I'm not sure
exactly where.  If an isolation provider doesn't explicitly put a
device into a group, the device should go into the group of its parent
bridge.  This covers the case of a bus with IOMMU which has below it a
bridge to a different type of DMA capable bus (which the IOMMU isn't
explicitly aware of).  DMAs from devices on the subordinate bus can be
translated by the top-level IOMMU (assuming it sees them as coming
from the bridge), but they all need to be treated as one group.

> +/*
> + * Unregister...
> + */
> +int isolation_unregister_notifier(struct bus_type *bus,
> +				  struct notifier_block *nb)
> +{
> +	struct isolation_notifier *notifier;
> +	bool found = false;
> +	int ret;
> +
> +	mutex_lock(&isolation_lock);
> +	list_for_each_entry(notifier, &isolation_notifiers, list) {
> +		if (notifier->bus == bus) {
> +			found = true;
> +			break;
> +		}
> +	}
> +
> +	if (!found) {
> +		mutex_unlock(&isolation_lock);
> +		return -ENODEV;
> +	}
> +
> +	ret = blocking_notifier_chain_unregister(&notifier->notifier, nb);
> +	if (ret) {
> +		mutex_unlock(&isolation_lock);
> +		return ret;
> +	}
> +
> +	/* Free at nobody left in the notifier block */
> +	if (--notifier->refcnt == 0) {
> +		bus_unregister_notifier(bus, &notifier->nb);
> +		list_del(&notifier->list);
> +		kfree(notifier);
> +	}
> +
> +	mutex_unlock(&isolation_lock);
> +
> +	return 0;
> +}
> +
> +/*
> + * Set iommu_ops on an isolation group.  Hopefully all DMA will eventually
> + * be done through isolation groups, for now, we just provide another
> + * interface to the iommu api.
> + */
> +int isolation_group_set_iommu_ops(struct isolation_group *group,
> +				  struct iommu_ops *iommu_ops)
> +{
> +	mutex_lock(&isolation_lock);
> +
> +	if (group->iommu_ops) {
> +		mutex_unlock(&isolation_lock);
> +		return -EBUSY;
> +	}
> +
> +	group->iommu_ops = iommu_ops;
> +
> +	mutex_unlock(&isolation_lock);
> +
> +	return 0;
> +}
> +
> +/*
> + * Attach all the devices for a group to the specified iommu domain.
> + */
> +static int __isolation_group_domain_attach_devs(struct iommu_domain *domain,
> +					        struct list_head *devices)
> +{
> +	struct isolation_device *device;
> +	struct device *dev;
> +	int ret = 0;
> +
> +	list_for_each_entry(device, devices, list) {
> +		dev = device->dev;
> +
> +		ret = domain->ops->attach_dev(domain, dev);
> +		if (ret) {
> +			list_for_each_entry_continue_reverse(device,
> +							     devices, list) {
> +				dev = device->dev;
> +				domain->ops->detach_dev(domain, dev);
> +			}
> +			break;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +/*
> + * And detach...
> + */
> +static void __isolation_group_domain_detach_devs(struct iommu_domain *domain,
> +						 struct list_head *devices)
> +{
> +	struct isolation_device *device;
> +	struct device *dev;
> +
> +	list_for_each_entry(device, devices, list) {
> +		dev = device->dev;
> +
> +		domain->ops->detach_dev(domain, dev);
> +	}
> +}
> +
> +/*
> + * Initialize the iommu domain for an isolation group.  This is to ease the
> + * transition to using isolation groups and expected to be used by current
> + * users of the iommu api for now.
> + */
> +int isolation_group_init_iommu_domain(struct isolation_group *group)
> +{
> +	int ret;
> +
> +	mutex_lock(&isolation_lock);
> +
> +	if (!group->iommu_ops || list_empty(&group->devices)) {
> +		mutex_unlock(&isolation_lock);
> +		return -EINVAL;
> +	}
> +
> +	if (group->iommu_domain) {
> +		mutex_unlock(&isolation_lock);
> +		return -EBUSY;
> +	}
> +
> +	group->iommu_domain = kzalloc(sizeof(struct iommu_domain), GFP_KERNEL);
> +	if (!group->iommu_domain) {
> +		mutex_unlock(&isolation_lock);
> +		return -ENOMEM;
> +	}
> +
> +	group->iommu_domain->ops = group->iommu_ops;
> +
> +	ret = group->iommu_ops->domain_init(group->iommu_domain);
> +	if (ret) {
> +		kfree(group->iommu_domain);
> +		group->iommu_domain = NULL;
> +		mutex_unlock(&isolation_lock);
> +		return ret;
> +	}
> +
> +	/* Automatically attach all the devices in the group. */
> +	ret = __isolation_group_domain_attach_devs(group->iommu_domain,
> +						   &group->devices);
> +	if (ret) {
> +		group->iommu_ops->domain_destroy(group->iommu_domain);
> +		kfree(group->iommu_domain);
> +		group->iommu_domain = NULL;
> +		mutex_unlock(&isolation_lock);
> +		return ret;
> +	}
> +		
> +	mutex_unlock(&isolation_lock);
> +	return 0;
> +}
> +
> +/*
> + * And free...
> + * Note just below, we add the ability to add another group to an iommu
> + * domain, so we don't always destroy and free the domain here.
> + */
> +void isolation_group_free_iommu_domain(struct isolation_group *group)
> +{
> +	struct isolation_group *tmp;
> +	bool inuse = false;
> +
> +	if (!group->iommu_domain || !group->iommu_ops)
> +		return;
> +
> +	mutex_lock(&isolation_lock);
> +
> +	__isolation_group_domain_detach_devs(group->iommu_domain,
> +					     &group->devices);
> +
> +	list_for_each_entry(tmp, &isolation_groups, list) {
> +		if (tmp == group)
> +			continue;
> +		if (tmp->iommu_domain == group->iommu_domain) {
> +			inuse = true;
> +			break;
> +		}
> +	}
> +
> +	if (!inuse) {
> +		group->iommu_ops->domain_destroy(group->iommu_domain);
> +		kfree(group->iommu_domain);
> +	}
> +
> +	group->iommu_domain = NULL;
> +
> +	mutex_unlock(&isolation_lock);
> +}
> +
> +/*
> + * This handles the VFIO "merge" interface, allowing us to manage multiple
> + * isolation groups with a single domain.  We just rely on attach_dev to
> + * tell us whether this is ok.
> + */
> +int isolation_group_iommu_domain_add_group(struct isolation_group *groupa,
> +					   struct isolation_group *groupb)
> +{
> +	int ret;
> +
> +	if (!groupa->iommu_domain ||
> +	    groupb->iommu_domain || list_empty(&groupb->devices))
> +		return -EINVAL;
> +
> +	if (groupa->iommu_ops != groupb->iommu_ops)
> +		return -EIO;
> +
> +	mutex_lock(&isolation_lock);
> +
> +	ret = __isolation_group_domain_attach_devs(groupa->iommu_domain,
> +						   &groupb->devices);
> +	if (ret) {
> +		mutex_unlock(&isolation_lock);
> +		return ret;
> +	}
> +
> +	groupb->iommu_domain = groupa->iommu_domain;
> +
> +	mutex_unlock(&isolation_lock);
> +
> +	return 0;
> +}
> +
> +/*
> + * XXX page mapping/unmapping and more iommu api wrappers
> + */
> +
> +/*
> + * Do we have a group manager?  Group managers restrict what drivers are
> + * allowed to attach to devices.  A group is "locked" when all of the devices
> + * for a group belong to group manager drivers (or no driver at all).  At
> + * that point, the group manager can initialize the iommu.  vfio is an example
> + * of a group manager and vfio-pci is an exanple of a driver that a group
> + * manager may add as a "managed" driver.  Note that we don't gate iommu
> + * manipulation on being managed because we want to use it for regular DMA
> + * at some point as well.
> + */
> +bool isolation_group_managed(struct isolation_group *group)
> +{
> +	return group->manager != NULL;
> +}
> +
> +/*
> + * Initialize the group manager struct.  At this point the isolation group
> + * becomes "managed".
> + */

This assumes a separate manager struct for each group.  I would have
thought the VFIO merge case would be more obviously represented by
having a single manager struct for all the groups in the VFIO instance
(== iommu domain).

> +int isolation_group_init_manager(struct isolation_group *group)
> +{
> +	mutex_lock(&isolation_lock);
> +
> +	if (group->manager) {
> +		mutex_unlock(&isolation_lock);
> +		return -EBUSY;
> +	}
> +
> +	group->manager = kzalloc(sizeof(struct isolation_manager), GFP_KERNEL);
> +	if (!group->manager) {
> +		mutex_unlock(&isolation_lock);
> +		return -ENOMEM;
> +	}
> +
> +	mutex_unlock(&isolation_lock);
> +
> +	return 0;
> +}
> +
> +/*
> + * And free... cleanup registerd managed drivers too.
> + */
> +void isolation_group_free_manager(struct isolation_group *group)
> +{
> +	struct isolation_manager_driver *driver, *tmp;
> +
> +	if (!group->manager)
> +		return;
> +
> +	mutex_lock(&isolation_lock);
> +
> +	list_for_each_entry_safe(driver, tmp, &group->manager->drivers, list) {
> +		list_del(&driver->list);
> +		kfree(driver);
> +	}
> +		
> +	kfree(group->manager);
> +	group->manager = NULL;
> +	mutex_unlock(&isolation_lock);
> +}
> +
> +/*
> + * Add a managed driver.  Drivers added here are the only ones that will
> + * be allowed to attach to a managed isolation group.
> + */
> +int isolation_group_manager_add_driver(struct isolation_group *group,
> +				       struct device_driver *drv)
> +{
> +	struct isolation_manager_driver *driver;
> +
> +	if (!group->manager)
> +		return -EINVAL;
> +
> +	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
> +	if (!driver)
> +		return -ENOMEM;
> +
> +	driver->drv = drv;
> +
> +	mutex_lock(&isolation_lock);
> +
> +	list_add(&driver->list, &group->manager->drivers);
> +
> +	mutex_unlock(&isolation_lock);
> +
> +	return 0;
> +}
> +
> +/*
> + * And remove a driver.  Don't really expect to need this much.
> + */
> +void isolation_group_manager_del_driver(struct isolation_group *group,
> +				        struct device_driver *drv)
> +{
> +	struct isolation_manager_driver *driver;
> +
> +	if (!group->manager)
> +		return;
> +
> +	mutex_lock(&isolation_lock);
> +
> +	list_for_each_entry(driver, &group->manager->drivers, list) {
> +		if (driver->drv == drv) {
> +			list_del(&driver->list);
> +			kfree(driver);
> +			break;
> +		}
> +	}
> +
> +	mutex_unlock(&isolation_lock);
> +}
> +
> +/*
> + * Test whether a driver is a "managed" driver for the group.  This allows
> + * is to preempt normal driver matching and only let our drivers in.
> + */
> +bool isolation_group_manager_driver(struct isolation_group *group,
> +				    struct device_driver *drv)
> +{
> +	struct isolation_manager_driver *driver;
> +	bool found = false;
> +
> +	if (!group->manager)
> +		return found;
> +
> +	mutex_lock(&isolation_lock);
> +
> +	list_for_each_entry(driver, &group->manager->drivers, list) {
> +		if (driver->drv == drv) {
> +			found = true;
> +			break;
> +		}
> +	}
> +
> +	mutex_unlock(&isolation_lock);
> +
> +	return found;
> +}
> +
> +/*
> + * Does the group manager have control of all of the devices in the group?
> + * We consider driver-less devices to be ok here as they don't do DMA and
> + * don't present any interfaces to subvert the rest of the group.
> + */
> +bool isolation_group_manager_locked(struct isolation_group *group)
> +{
> +	struct isolation_device *device;
> +	struct isolation_manager_driver *driver;
> +	bool found, locked = true;
> +
> +	if (!group->manager)
> +		return false;
> +
> +	mutex_lock(&isolation_lock);
> +
> +	list_for_each_entry(device, &group->devices, list) {
> +		found = false;
> +
> +		if (!device->dev->driver)
> +			continue;
> +

You could simplify this using isolation_group_manager_driver(),
couldn't you?

> +		list_for_each_entry(driver, &group->manager->drivers, list) {
> +			if (device->dev->driver == driver->drv) {
> +				found = true;
> +				break;
> +			}
> +		}
> +
> +		if (!found) {
> +			locked = false;
> +			break;
> +		}
> +	}
> +
> +	mutex_unlock(&isolation_lock);
> +
> +	return locked;
> +}
> +
> +static int __init isolation_init(void)
> +{
> +	isolation_kset = kset_create_and_add("isolation", NULL, NULL);
> +	
> +	WARN(!isolation_kset, "Failed to initialize isolation group kset\n");
> +
> +	return isolation_kset ? 0 : -1;

I'd be tempted to just BUG() here if you can't add the kset - I can't
see any reason it would fail unless you're so short of RAM that you
have bigger problems.  Making this a fatal fail would save having to
double check if the kset is around in the later paths.

> +}
> +subsys_initcall(isolation_init);
> diff --git a/include/linux/device.h b/include/linux/device.h
> index b63fb39..5805c56 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -663,6 +663,10 @@ struct device {
>  
>  	struct device_dma_parameters *dma_parms;
>  
> +#ifdef CONFIG_ISOLATION_GROUPS
> +	struct isolation_group	*isolation_group;
> +#endif
> +
>  	struct list_head	dma_pools;	/* dma pools (if dma'ble) */
>  
>  	struct dma_coherent_mem	*dma_mem; /* internal for coherent mem
> diff --git a/include/linux/isolation.h b/include/linux/isolation.h
> new file mode 100644
> index 0000000..1d87caf
> --- /dev/null
> +++ b/include/linux/isolation.h
> @@ -0,0 +1,138 @@
> +/*
> + * Isolation group interface
> + *
> + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
> + * Author: Alex Williamson <alex.williamson@redhat.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + */
> +
> +#ifndef _LINUX_ISOLATION_H
> +#define _LINUX_ISOLATION_H
> +
> +#define ISOLATION_NOTIFY_ADD_DEVICE	1
> +#define ISOLATION_NOTIFY_DEL_DEVICE	2
> +
> +#ifdef CONFIG_ISOLATION_GROUPS
> +
> +extern struct isolation_group *isolation_create_group(void);
> +extern int isolation_free_group(struct isolation_group *group);
> +extern int isolation_group_add_dev(struct isolation_group *group,
> +				   struct device *dev);
> +extern int isolation_group_del_dev(struct device *dev);
> +extern int isolation_register_notifier(struct bus_type *bus,
> +				       struct notifier_block *nb);
> +extern int isolation_unregister_notifier(struct bus_type *bus,
> +					 struct notifier_block *nb);
> +extern int isolation_group_set_iommu_ops(struct isolation_group *group,
> +					 struct iommu_ops *iommu_ops);
> +extern int isolation_group_init_iommu_domain(struct isolation_group *group);
> +extern void isolation_group_free_iommu_domain(struct isolation_group *group);
> +extern int isolation_group_iommu_domain_add_group(
> +	struct isolation_group *groupa, struct isolation_group *groupb);
> +extern bool isolation_group_managed(struct isolation_group *group);
> +extern int isolation_group_init_manager(struct isolation_group *group);
> +extern void isolation_group_free_manager(struct isolation_group *group);
> +extern int isolation_group_manager_add_driver(struct isolation_group *group,
> +					      struct device_driver *drv);
> +extern void isolation_group_manager_del_driver(struct isolation_group *group,
> +					       struct device_driver *drv);
> +extern bool isolation_group_manager_driver(struct isolation_group *group,
> +					   struct device_driver *drv);
> +extern bool isolation_group_manager_locked(struct isolation_group *group);
> +
> +#define to_isolation_group(dev)	((dev)->isolation_group)
> +
> +#else /* CONFIG_ISOLATION_GROUPS */
> +
> +static inline struct isolation_group *isolation_create_group(void)
> +{
> +	return NULL;
> +}
> +
> +static inline int isolation_free_group(struct isolation_group *group)
> +{
> +	return 0;
> +}
> +
> +static inline int isolation_group_add_dev(struct isolation_group *group,
> +					  struct device *dev)
> +{
> +	return 0;
> +}
> +
> +static inline int isolation_group_del_dev(struct device *dev)
> +{
> +	return 0;
> +}
> +
> +static int isolation_register_notifier(struct bus_type *bus,
> +				       struct notifier_block *nb)
> +{
> +	return 0;
> +}
> +
> +static int isolation_unregister_notifier(struct bus_type *bus,
> +					 struct notifier_block *nb)
> +{
> +	return 0;
> +}
> +
> +static int isolation_group_set_iommu_ops(struct isolation_group *group,
> +					 struct iommu_ops *iommu_ops)
> +{
> +	return -ENOSYS;
> +}
> +
> +static int isolation_group_init_iommu_domain(struct isolation_group *group)
> +{
> +	return -ENOSYS;
> +}
> +
> +static void isolation_group_free_iommu_domain(struct isolation_group *group)
> +{
> +}
> +
> +static int isolation_group_iommu_domain_add_group(
> +	struct isolation_group *groupa, struct isolation_group *groupb)
> +{
> +	return -ENOSYS;
> +}
> +
> +static int isolation_group_init_manager(struct isolation_group *group)
> +{
> +	return -ENOSYS;
> +}
> +
> +static void isolation_group_free_manager(struct isolation_group *group)
> +{
> +}
> +
> +static int isolation_group_manager_add_driver(struct isolation_group *group,
> +					      struct device_driver *drv)
> +{
> +	return -ENOSYS;
> +}
> +
> +static void isolation_group_manager_del_driver(struct isolation_group *group,
> +					       struct device_driver *drv)
> +{
> +}
> +
> +static bool isolation_group_manager_locked(struct isolation_group *group)
> +{
> +	return false;
> +}
> +
> +#define to_isolation_group(dev)	(NULL)
> +
> +#define isolation_group_managed(group) (false)
> +
> +#define isolation_group_manager_driver(group, drv) (false)
> +
> +#endif /* CONFIG_ISOLATION_GROUPS */
> +
> +#endif /* _LINUX_ISOLATION_H */
>
Alex Williamson - March 13, 2012, 4:49 p.m.
On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote:
> On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote:
> > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > ---
> > 
> >  drivers/base/Kconfig      |   10 +
> >  drivers/base/Makefile     |    1 
> >  drivers/base/base.h       |    5 
> >  drivers/base/isolation.c  |  798 +++++++++++++++++++++++++++++++++++++++++++++
> >  include/linux/device.h    |    4 
> >  include/linux/isolation.h |  138 ++++++++
> >  6 files changed, 956 insertions(+), 0 deletions(-)
> >  create mode 100644 drivers/base/isolation.c
> >  create mode 100644 include/linux/isolation.h
> > 
> > diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
> > index 7be9f79..e98a5f3 100644
> > --- a/drivers/base/Kconfig
> > +++ b/drivers/base/Kconfig
> > @@ -189,4 +189,14 @@ config DMA_SHARED_BUFFER
> >  	  APIs extension; the file's descriptor can then be passed on to other
> >  	  driver.
> >  
> > +config ISOLATION_GROUPS
> > +	bool "Enable Isolation Group API"
> > +	default n
> > +	depends on EXPERIMENTAL && IOMMU_API
> > +	help
> > +	  This option enables grouping of devices into Isolation Groups
> > +	  which may be used by other subsystems to perform quirks across
> > +	  sets of devices as well as userspace drivers for guaranteeing
> > +	  devices are isolated from the rest of the system.
> > +
> >  endmenu
> > diff --git a/drivers/base/Makefile b/drivers/base/Makefile
> > index 610f999..047b5f9 100644
> > --- a/drivers/base/Makefile
> > +++ b/drivers/base/Makefile
> > @@ -19,6 +19,7 @@ obj-$(CONFIG_MODULES)	+= module.o
> >  endif
> >  obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o
> >  obj-$(CONFIG_REGMAP)	+= regmap/
> > +obj-$(CONFIG_ISOLATION_GROUPS)	+= isolation.o
> >  
> >  ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
> >  
> > diff --git a/drivers/base/base.h b/drivers/base/base.h
> > index b858dfd..376758a 100644
> > --- a/drivers/base/base.h
> > +++ b/drivers/base/base.h
> > @@ -1,4 +1,5 @@
> >  #include <linux/notifier.h>
> > +#include <linux/isolation.h>
> >  
> >  /**
> >   * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure.
> > @@ -108,6 +109,10 @@ extern int driver_probe_device(struct device_driver *drv, struct device *dev);
> >  static inline int driver_match_device(struct device_driver *drv,
> >  				      struct device *dev)
> >  {
> > +	if (isolation_group_managed(to_isolation_group(dev)) &&
> > +	    !isolation_group_manager_driver(to_isolation_group(dev), drv))
> > +		return 0;
> > +
> >  	return drv->bus->match ? drv->bus->match(dev, drv) : 1;
> >  }
> >  
> > diff --git a/drivers/base/isolation.c b/drivers/base/isolation.c
> > new file mode 100644
> > index 0000000..c01365c
> > --- /dev/null
> > +++ b/drivers/base/isolation.c
> > @@ -0,0 +1,798 @@
> > +/*
> > + * Isolation group interface
> > + *
> > + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
> > + * Author: Alex Williamson <alex.williamson@redhat.com>
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + *
> > + */
> > +
> > +#include <linux/device.h>
> > +#include <linux/iommu.h>
> > +#include <linux/isolation.h>
> > +#include <linux/list.h>
> > +#include <linux/mutex.h>
> > +#include <linux/notifier.h>
> > +#include <linux/slab.h>
> > +#include <linux/uuid.h>
> > +
> > +static struct kset *isolation_kset;
> > +/* XXX add more complete locking, maybe rcu */
> > +static DEFINE_MUTEX(isolation_lock);
> > +static LIST_HEAD(isolation_groups);
> > +static LIST_HEAD(isolation_notifiers);
> > +
> > +/* Keep these private */
> > +struct isolation_manager_driver {
> > +	struct device_driver *drv;
> > +	struct list_head list;
> > +};
> > +
> > +struct isolation_manager {
> > +	struct list_head drivers;
> > +	/* Likely need managers to register some callbacks */
> > +};
> > +
> > +struct isolation_group {
> > +	struct list_head list;
> > +	struct list_head devices;
> > +	struct kobject kobj;
> > +	struct kobject *devices_kobj;
> > +	struct iommu_domain *iommu_domain;
> > +	struct iommu_ops *iommu_ops;
> > +	struct isolation_manager *manager;
> > +	uuid_le uuid;
> > +};
> > +
> > +struct isolation_device {
> > +	struct device *dev;
> > +	struct list_head list;
> > +};
> > +
> > +struct isolation_notifier {
> > +	struct bus_type *bus;
> > +	struct notifier_block nb;
> > +	struct blocking_notifier_head notifier;
> > +	struct list_head list;
> > +	int refcnt;
> > +};
> > +
> > +struct iso_attribute {
> > +	struct attribute attr;
> > +	ssize_t (*show)(struct isolation_group *group, char *buf);
> > +	ssize_t (*store)(struct isolation_group *group,
> > +			 const char *buf, size_t count);
> > +};
> > +
> > +#define to_iso_attr(_attr) container_of(_attr, struct iso_attribute, attr)
> > +#define to_iso_group(_kobj) container_of(_kobj, struct isolation_group, kobj)
> > +
> > +static ssize_t iso_attr_show(struct kobject *kobj, struct attribute *attr,
> > +			     char *buf)
> > +{
> > +	struct iso_attribute *iso_attr = to_iso_attr(attr);
> > +	struct isolation_group *group = to_iso_group(kobj);
> > +	ssize_t ret = -EIO;
> > +
> > +	if (iso_attr->show)
> > +		ret = iso_attr->show(group, buf);
> > +	return ret;
> > +}
> > +
> > +static ssize_t iso_attr_store(struct kobject *kobj, struct attribute *attr,
> > +			      const char *buf, size_t count)
> > +{
> > +	struct iso_attribute *iso_attr = to_iso_attr(attr);
> > +	struct isolation_group *group = to_iso_group(kobj);
> > +	ssize_t ret = -EIO;
> > +
> > +	if (iso_attr->store)
> > +		ret = iso_attr->store(group, buf, count);
> > +	return ret;
> > +}
> > +
> > +static void iso_release(struct kobject *kobj)
> > +{
> > +	struct isolation_group *group = to_iso_group(kobj);
> > +	kfree(group);
> > +}
> > +
> > +static const struct sysfs_ops iso_sysfs_ops = {
> > +	.show = iso_attr_show,
> > +	.store = iso_attr_store,
> > +};
> > +
> > +static struct kobj_type iso_ktype = {
> > +	.sysfs_ops = &iso_sysfs_ops,
> > +	.release = iso_release,
> > +};
> > +
> > +/*
> > + * Create an isolation group.  Isolation group "providers" register a
> > + * notifier for their bus(es) and call this to create a new isolation
> > + * group.
> > + */
> > +struct isolation_group *isolation_create_group(void)
> > +{
> > +	struct isolation_group *group, *tmp;
> > +	int ret;
> > +
> > +	if (unlikely(!isolation_kset))
> > +		return ERR_PTR(-ENODEV);
> > +
> > +	group = kzalloc(sizeof(*group), GFP_KERNEL);
> > +	if (!group)
> > +		return ERR_PTR(-ENOMEM);
> > +
> > +	mutex_lock(&isolation_lock);
> > +
> > +	/*
> > +	 * Use a UUID for group identification simply because it seems wrong
> > +	 * to expose it as a kernel pointer (%p).  Neither is user friendly.
> > +	 * Mostly only expect userspace to do anything with this.
> > +	 */
> 
> So, my plan was to require the isolation provider to give a unique
> name - it can construct something that's actually meaningful (and with
> luck, stable across boots).  For Power we'd use the PE number, for
> VT-d, I was thinking the PCI device address would do it (of the "base"
> device for the non-separable bridge and broken multifunction cases).

Naming always seems to end in "that's not unique" or "that doesn't apply
to us", so I figured I'd just avoid the problem by using random numbers.
We can allow providers to specify a name, but that still presents
challenges to uniqueness and consistency if we intend to generically
allow heterogeneous providers on a system.  In the VFIO use case I
expect userspace will get the name from readlink on the isolation_group
sysfs entry and open a vfio provided device file of the same name.  So
in the end, it doesn't matter what the name is.

> > +new_uuid:
> > +	uuid_le_gen(&group->uuid);
> > +
> > +	/* Unlikely, but nothing prevents duplicates */
> > +	list_for_each_entry(tmp, &isolation_groups, list)
> > +		if (memcmp(&group->uuid, &tmp->uuid, sizeof(group->uuid)) == 0)
> > +			goto new_uuid;
> > +
> > +	INIT_LIST_HEAD(&group->devices);
> > +	group->kobj.kset = isolation_kset;
> > +
> > +	ret = kobject_init_and_add(&group->kobj, &iso_ktype, NULL,
> > +				   "%pUl", &group->uuid);
> 
> Um.. isn't this setting the kobject name to the address of the uuid
> plus "Ul", not the content of the uuid?

We have kernel extensions for %p, %pUl is a little endian UUID.  It
prints in the right format, but I'll have to check if I'm getting the
actual UUID or a UUID-ified pointer address.

> > +	if (ret) {
> > +		kfree(group);
> > +		mutex_unlock(&isolation_lock);
> > +		return ERR_PTR(ret);
> > +	}
> > +
> > +	/* Add a subdirectory to save links for devices withing the group. */
> > +	group->devices_kobj = kobject_create_and_add("devices", &group->kobj);
> > +	if (!group->devices_kobj) {
> > +		kobject_put(&group->kobj);
> > +		mutex_unlock(&isolation_lock);
> > +		return ERR_PTR(-ENOMEM);
> > +	}
> > +
> > +	list_add(&group->list, &isolation_groups);
> > +
> > +	mutex_unlock(&isolation_lock);
> > +
> > +	return group;
> > +}
> > +
> > +/*
> > + * Free isolation group.  XXX need to cleanup/reject based on manager status.
> > + */
> > +int isolation_free_group(struct isolation_group *group)
> > +{
> > +	mutex_lock(&isolation_lock);
> > +
> > +	if (!list_empty(&group->devices)) {
> > +		mutex_unlock(&isolation_lock);
> > +		return -EBUSY;
> > +	}
> > +
> > +	list_del(&group->list);
> > +	kobject_put(group->devices_kobj);
> > +	kobject_put(&group->kobj);
> > +
> > +	mutex_unlock(&isolation_lock);
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Add a device to an isolation group.  Isolation groups start empty and
> > + * must be told about the devices they contain.  Expect this to be called
> > + * from isolation group providers via notifier.
> > + */
> 
> Doesn't necessarily have to be from a notifier, particularly if the
> provider is integrated into host bridge code.

Sure, a provider could do this on it's own if it wants.  This just
provides some infrastructure for a common path.  Also note that this
helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider.  Yet
to be seen whether that can reasonably be the case once isolation groups
are added to streaming DMA paths.

> > +int isolation_group_add_dev(struct isolation_group *group, struct device *dev)
> > +{
> > +	struct isolation_device *device;
> > +	int ret = 0;
> > +
> > +	mutex_lock(&isolation_lock);
> > +
> > +	if (dev->isolation_group) {
> > +		ret = -EBUSY;
> > +		goto out;
> 
> This should probably be a BUG_ON() - the isolation provider shouldn't
> be putting the same device into two different groups.

Yeah, probably.

> > +	}
> > +
> > +	device = kzalloc(sizeof(*device), GFP_KERNEL);
> > +	if (!device) {
> > +		ret = -ENOMEM;
> > +		goto out;
> > +	}
> > +
> > +	device->dev = dev;
> > +
> > +	/* Cross link the device in sysfs */
> > +	ret = sysfs_create_link(&dev->kobj, &group->kobj,
> > +				"isolation_group");
> > +	if (ret) {
> > +		kfree(device);
> > +		goto out;
> > +	}
> > +	
> > +	ret = sysfs_create_link(group->devices_kobj, &dev->kobj,
> > +				kobject_name(&dev->kobj));
> 
> So, a problem both here and in my version is what to name the device
> links.  Because they could be potentially be quite widely scattered,
> I'm not sure that kobject_name() is guaranteed to be sufficiently
> unique.

Even if the name is not, we're pointing to a unique sysfs location.  I
expect the primary user of this will be when userspace translates a
device to a group (via the isolation_group link below), then tries to
walk all the devices in the group to determine effected host devices and
manager driver status.  So it would probably be traversing the link back
rather than relying solely on the name of the link.  Right?

> > +	if (ret) {
> > +		sysfs_remove_link(&dev->kobj, "isolation_group");
> > +		kfree(device);
> > +		goto out;
> > +	}
> > +
> > +	list_add(&device->list, &group->devices);
> > +
> > +	dev->isolation_group = group;
> > +
> > +	if (group->iommu_domain) {
> > +		ret = group->iommu_ops->attach_dev(group->iommu_domain, dev);
> > +		if (ret) {
> > +			/* XXX fail device? */
> > +		}
> 
> So, I presume the idea is that this is a stop-gap until iommu_ops is
> converted to be in terms of groups rather than indivdual devices?

Yes, I thought we could just back it by the iommu api for now so we can
implement managers, but eventually domain_init and attach_dev would
probably be a single callback in some kind of group aware iommu api.

> > +	}
> > +
> > +	/* XXX signal manager? */
> 
> Uh, what did you have in mind here?

Another notifier?  Maybe just a callback struct registered when a
manager is added to a group.  On one hand, maybe it's not necessary
because adding a device to a managed group already restricts driver
matching, but on the other, the manager may want to be informed about
new devices and try to actively bind a driver to it.  For instance, if
assigning an entire PE to a guest, which might include empty slots,
would the manager want to be informed about the addition of a device so
it could mirror the addition to the guest assigned the PE?

> > +out:
> > +	mutex_unlock(&isolation_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Remove a device from an isolation group, likely because the device
> > + * went away.
> > + */
> > +int isolation_group_del_dev(struct device *dev)
> > +{
> > +	struct isolation_group *group = dev->isolation_group;
> > +	struct isolation_device *device;
> > +
> > +	if (!group)
> > +		return -ENODEV;
> > +
> > +	mutex_lock(&isolation_lock);
> > +
> > +	list_for_each_entry(device, &group->devices, list) {
> > +		if (device->dev == dev) {
> > +			/* XXX signal manager? */
> > +
> > +			if (group->iommu_domain)
> > +				group->iommu_ops->detach_dev(
> > +					group->iommu_domain, dev);
> > +			list_del(&device->list);
> > +			kfree(device);
> > +			dev->isolation_group = NULL;
> > +			sysfs_remove_link(group->devices_kobj,
> > +					  kobject_name(&dev->kobj));
> > +			sysfs_remove_link(&dev->kobj, "isolation_group");
> > +			break;
> > +		}
> > +	}
> > +			
> > +	mutex_unlock(&isolation_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Filter and call out to our own notifier chains
> > + */
> 
> So, I haven't quite worked out what this isolation notifier
> infrastructure gives you, as opposed to having isolation providers
> directly register bus notifiers.

For now, it nicely separates CONFIG_ISOLATION code so I don't litter
intel-iommu with #ifdefs.  It also embraces that devices are always
being added and removed and supports that with very little change to
existing code paths.

> > +static int isolation_bus_notify(struct notifier_block *nb,
> > +				unsigned long action, void *data)
> > +{
> > +	struct isolation_notifier *notifier =
> > +		container_of(nb, struct isolation_notifier, nb);
> > +	struct device *dev = data;
> > +
> > +	switch (action) {
> > +	case BUS_NOTIFY_ADD_DEVICE:
> > +		if (!dev->isolation_group)
> > +			blocking_notifier_call_chain(&notifier->notifier,
> > +					ISOLATION_NOTIFY_ADD_DEVICE, dev);
> > +		break;
> > +	case BUS_NOTIFY_DEL_DEVICE:
> > +		if (dev->isolation_group)
> > +			blocking_notifier_call_chain(&notifier->notifier,
> > +					ISOLATION_NOTIFY_DEL_DEVICE, dev);
> > +		break;
> > +	}
> > +
> > +	return NOTIFY_DONE;
> > +}
> > +
> > +/*
> > + * Wrapper for manual playback of BUS_NOTIFY_ADD_DEVICE when we add
> > + * a new notifier.
> > + */
> > +static int isolation_do_add_notify(struct device *dev, void *data)
> > +{
> > +	struct notifier_block *nb = data;
> > +
> > +	if (!dev->isolation_group)
> > +		nb->notifier_call(nb, ISOLATION_NOTIFY_ADD_DEVICE, dev);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Register a notifier.  This is for isolation group providers.  It's
> > + * possible we could have multiple isolation group providers for the same
> > + * bus type,
> 
> So the bit above doesn't seem to connect to the bit below.  We can
> indeed have multiple isolation providers for one bus type, but the
> below seems to be covering the (also possible, but different) case of
> one isolation provider covering multiple bus types.

It covers both.  If a provider covers multiple buses it will call this
function once for each bus.  Each new bus creates a new struct
isolation_notifier and does a bus_register_notifier (using
isolation_bus_notify).  The provider notifier block is added to our own
notifier call chain for that struct isolation_notifier.  This way we
only ever register a single notifier per bus, but support multiple
providers for the bus.

> > so we create a struct isolation_notifier per bus_type, each
> > + * with a notifier block.  Providers are therefore welcome to register
> > + * as many buses as they can handle.
> > + */
> > +int isolation_register_notifier(struct bus_type *bus, struct notifier_block *nb)
> > +{
> > +	struct isolation_notifier *notifier;
> > +	bool found = false;
> > +	int ret;
> > +
> > +	mutex_lock(&isolation_lock);
> > +	list_for_each_entry(notifier, &isolation_notifiers, list) {
> > +		if (notifier->bus == bus) {
> > +			found = true;
> > +			break;
> > +		}
> > +	}
> > +
> > +	if (!found) {
> > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > +		if (!notifier) {
> > +			mutex_unlock(&isolation_lock);
> > +			return -ENOMEM;	
> > +		}
> > +		notifier->bus = bus;
> > +		notifier->nb.notifier_call = isolation_bus_notify;
> > +		BLOCKING_INIT_NOTIFIER_HEAD(&notifier->notifier);
> > +		bus_register_notifier(bus, &notifier->nb);
> > +		list_add(&notifier->list, &isolation_notifiers);
> > +	}
> > +
> > +	ret = blocking_notifier_chain_register(&notifier->notifier, nb);
> > +	if (ret) {
> > +		if (notifier->refcnt == 0) {
> > +			bus_unregister_notifier(bus, &notifier->nb);
> > +			list_del(&notifier->list);
> > +			kfree(notifier);
> > +		}
> > +		mutex_unlock(&isolation_lock);
> > +		return ret;
> > +	}
> > +	notifier->refcnt++;
> > +
> > +	mutex_unlock(&isolation_lock);
> > +
> > +	bus_for_each_dev(bus, NULL, nb, isolation_do_add_notify);
> > +
> > +	return 0;
> > +}
> 
> So, somewhere, I think we need a fallback path, but I'm not sure
> exactly where.  If an isolation provider doesn't explicitly put a
> device into a group, the device should go into the group of its parent
> bridge.  This covers the case of a bus with IOMMU which has below it a
> bridge to a different type of DMA capable bus (which the IOMMU isn't
> explicitly aware of).  DMAs from devices on the subordinate bus can be
> translated by the top-level IOMMU (assuming it sees them as coming
> from the bridge), but they all need to be treated as one group.

Why would the top level IOMMU provider not set the isolation group in
this case.  I agree there's a gap here, but it's fuzzy when and how it
can occur and if it matters (devices without an isolation group can't be
used by managers, and apparently don't make use of any of the services
of a provider...).

> > +/*
> > + * Unregister...
> > + */
> > +int isolation_unregister_notifier(struct bus_type *bus,
> > +				  struct notifier_block *nb)
> > +{
> > +	struct isolation_notifier *notifier;
> > +	bool found = false;
> > +	int ret;
> > +
> > +	mutex_lock(&isolation_lock);
> > +	list_for_each_entry(notifier, &isolation_notifiers, list) {
> > +		if (notifier->bus == bus) {
> > +			found = true;
> > +			break;
> > +		}
> > +	}
> > +
> > +	if (!found) {
> > +		mutex_unlock(&isolation_lock);
> > +		return -ENODEV;
> > +	}
> > +
> > +	ret = blocking_notifier_chain_unregister(&notifier->notifier, nb);
> > +	if (ret) {
> > +		mutex_unlock(&isolation_lock);
> > +		return ret;
> > +	}
> > +
> > +	/* Free at nobody left in the notifier block */
> > +	if (--notifier->refcnt == 0) {
> > +		bus_unregister_notifier(bus, &notifier->nb);
> > +		list_del(&notifier->list);
> > +		kfree(notifier);
> > +	}
> > +
> > +	mutex_unlock(&isolation_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Set iommu_ops on an isolation group.  Hopefully all DMA will eventually
> > + * be done through isolation groups, for now, we just provide another
> > + * interface to the iommu api.
> > + */
> > +int isolation_group_set_iommu_ops(struct isolation_group *group,
> > +				  struct iommu_ops *iommu_ops)
> > +{
> > +	mutex_lock(&isolation_lock);
> > +
> > +	if (group->iommu_ops) {
> > +		mutex_unlock(&isolation_lock);
> > +		return -EBUSY;
> > +	}
> > +
> > +	group->iommu_ops = iommu_ops;
> > +
> > +	mutex_unlock(&isolation_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Attach all the devices for a group to the specified iommu domain.
> > + */
> > +static int __isolation_group_domain_attach_devs(struct iommu_domain *domain,
> > +					        struct list_head *devices)
> > +{
> > +	struct isolation_device *device;
> > +	struct device *dev;
> > +	int ret = 0;
> > +
> > +	list_for_each_entry(device, devices, list) {
> > +		dev = device->dev;
> > +
> > +		ret = domain->ops->attach_dev(domain, dev);
> > +		if (ret) {
> > +			list_for_each_entry_continue_reverse(device,
> > +							     devices, list) {
> > +				dev = device->dev;
> > +				domain->ops->detach_dev(domain, dev);
> > +			}
> > +			break;
> > +		}
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> > +/*
> > + * And detach...
> > + */
> > +static void __isolation_group_domain_detach_devs(struct iommu_domain *domain,
> > +						 struct list_head *devices)
> > +{
> > +	struct isolation_device *device;
> > +	struct device *dev;
> > +
> > +	list_for_each_entry(device, devices, list) {
> > +		dev = device->dev;
> > +
> > +		domain->ops->detach_dev(domain, dev);
> > +	}
> > +}
> > +
> > +/*
> > + * Initialize the iommu domain for an isolation group.  This is to ease the
> > + * transition to using isolation groups and expected to be used by current
> > + * users of the iommu api for now.
> > + */
> > +int isolation_group_init_iommu_domain(struct isolation_group *group)
> > +{
> > +	int ret;
> > +
> > +	mutex_lock(&isolation_lock);
> > +
> > +	if (!group->iommu_ops || list_empty(&group->devices)) {
> > +		mutex_unlock(&isolation_lock);
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (group->iommu_domain) {
> > +		mutex_unlock(&isolation_lock);
> > +		return -EBUSY;
> > +	}
> > +
> > +	group->iommu_domain = kzalloc(sizeof(struct iommu_domain), GFP_KERNEL);
> > +	if (!group->iommu_domain) {
> > +		mutex_unlock(&isolation_lock);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	group->iommu_domain->ops = group->iommu_ops;
> > +
> > +	ret = group->iommu_ops->domain_init(group->iommu_domain);
> > +	if (ret) {
> > +		kfree(group->iommu_domain);
> > +		group->iommu_domain = NULL;
> > +		mutex_unlock(&isolation_lock);
> > +		return ret;
> > +	}
> > +
> > +	/* Automatically attach all the devices in the group. */
> > +	ret = __isolation_group_domain_attach_devs(group->iommu_domain,
> > +						   &group->devices);
> > +	if (ret) {
> > +		group->iommu_ops->domain_destroy(group->iommu_domain);
> > +		kfree(group->iommu_domain);
> > +		group->iommu_domain = NULL;
> > +		mutex_unlock(&isolation_lock);
> > +		return ret;
> > +	}
> > +		
> > +	mutex_unlock(&isolation_lock);
> > +	return 0;
> > +}
> > +
> > +/*
> > + * And free...
> > + * Note just below, we add the ability to add another group to an iommu
> > + * domain, so we don't always destroy and free the domain here.
> > + */
> > +void isolation_group_free_iommu_domain(struct isolation_group *group)
> > +{
> > +	struct isolation_group *tmp;
> > +	bool inuse = false;
> > +
> > +	if (!group->iommu_domain || !group->iommu_ops)
> > +		return;
> > +
> > +	mutex_lock(&isolation_lock);
> > +
> > +	__isolation_group_domain_detach_devs(group->iommu_domain,
> > +					     &group->devices);
> > +
> > +	list_for_each_entry(tmp, &isolation_groups, list) {
> > +		if (tmp == group)
> > +			continue;
> > +		if (tmp->iommu_domain == group->iommu_domain) {
> > +			inuse = true;
> > +			break;
> > +		}
> > +	}
> > +
> > +	if (!inuse) {
> > +		group->iommu_ops->domain_destroy(group->iommu_domain);
> > +		kfree(group->iommu_domain);
> > +	}
> > +
> > +	group->iommu_domain = NULL;
> > +
> > +	mutex_unlock(&isolation_lock);
> > +}
> > +
> > +/*
> > + * This handles the VFIO "merge" interface, allowing us to manage multiple
> > + * isolation groups with a single domain.  We just rely on attach_dev to
> > + * tell us whether this is ok.
> > + */
> > +int isolation_group_iommu_domain_add_group(struct isolation_group *groupa,
> > +					   struct isolation_group *groupb)
> > +{
> > +	int ret;
> > +
> > +	if (!groupa->iommu_domain ||
> > +	    groupb->iommu_domain || list_empty(&groupb->devices))
> > +		return -EINVAL;
> > +
> > +	if (groupa->iommu_ops != groupb->iommu_ops)
> > +		return -EIO;
> > +
> > +	mutex_lock(&isolation_lock);
> > +
> > +	ret = __isolation_group_domain_attach_devs(groupa->iommu_domain,
> > +						   &groupb->devices);
> > +	if (ret) {
> > +		mutex_unlock(&isolation_lock);
> > +		return ret;
> > +	}
> > +
> > +	groupb->iommu_domain = groupa->iommu_domain;
> > +
> > +	mutex_unlock(&isolation_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * XXX page mapping/unmapping and more iommu api wrappers
> > + */
> > +
> > +/*
> > + * Do we have a group manager?  Group managers restrict what drivers are
> > + * allowed to attach to devices.  A group is "locked" when all of the devices
> > + * for a group belong to group manager drivers (or no driver at all).  At
> > + * that point, the group manager can initialize the iommu.  vfio is an example
> > + * of a group manager and vfio-pci is an exanple of a driver that a group
> > + * manager may add as a "managed" driver.  Note that we don't gate iommu
> > + * manipulation on being managed because we want to use it for regular DMA
> > + * at some point as well.
> > + */
> > +bool isolation_group_managed(struct isolation_group *group)
> > +{
> > +	return group->manager != NULL;
> > +}
> > +
> > +/*
> > + * Initialize the group manager struct.  At this point the isolation group
> > + * becomes "managed".
> > + */
> 
> This assumes a separate manager struct for each group.  I would have
> thought the VFIO merge case would be more obviously represented by
> having a single manager struct for all the groups in the VFIO instance
> (== iommu domain).

Given the objections to merging groups, I thought perhaps it would be
safer to explicitly make it pertain to the iommu domain here.  My vision
of managers and merging is still solidifying, but the manager init seems
like a first step to claiming a group while attempting to share
resources comes a while later.  I don't see much overhead on either the
grouping code or the manager code for keeping these separate.  Do you?

> > +int isolation_group_init_manager(struct isolation_group *group)
> > +{
> > +	mutex_lock(&isolation_lock);
> > +
> > +	if (group->manager) {
> > +		mutex_unlock(&isolation_lock);
> > +		return -EBUSY;
> > +	}
> > +
> > +	group->manager = kzalloc(sizeof(struct isolation_manager), GFP_KERNEL);
> > +	if (!group->manager) {
> > +		mutex_unlock(&isolation_lock);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	mutex_unlock(&isolation_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * And free... cleanup registerd managed drivers too.
> > + */
> > +void isolation_group_free_manager(struct isolation_group *group)
> > +{
> > +	struct isolation_manager_driver *driver, *tmp;
> > +
> > +	if (!group->manager)
> > +		return;
> > +
> > +	mutex_lock(&isolation_lock);
> > +
> > +	list_for_each_entry_safe(driver, tmp, &group->manager->drivers, list) {
> > +		list_del(&driver->list);
> > +		kfree(driver);
> > +	}
> > +		
> > +	kfree(group->manager);
> > +	group->manager = NULL;
> > +	mutex_unlock(&isolation_lock);
> > +}
> > +
> > +/*
> > + * Add a managed driver.  Drivers added here are the only ones that will
> > + * be allowed to attach to a managed isolation group.
> > + */
> > +int isolation_group_manager_add_driver(struct isolation_group *group,
> > +				       struct device_driver *drv)
> > +{
> > +	struct isolation_manager_driver *driver;
> > +
> > +	if (!group->manager)
> > +		return -EINVAL;
> > +
> > +	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
> > +	if (!driver)
> > +		return -ENOMEM;
> > +
> > +	driver->drv = drv;
> > +
> > +	mutex_lock(&isolation_lock);
> > +
> > +	list_add(&driver->list, &group->manager->drivers);
> > +
> > +	mutex_unlock(&isolation_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +/*
> > + * And remove a driver.  Don't really expect to need this much.
> > + */
> > +void isolation_group_manager_del_driver(struct isolation_group *group,
> > +				        struct device_driver *drv)
> > +{
> > +	struct isolation_manager_driver *driver;
> > +
> > +	if (!group->manager)
> > +		return;
> > +
> > +	mutex_lock(&isolation_lock);
> > +
> > +	list_for_each_entry(driver, &group->manager->drivers, list) {
> > +		if (driver->drv == drv) {
> > +			list_del(&driver->list);
> > +			kfree(driver);
> > +			break;
> > +		}
> > +	}
> > +
> > +	mutex_unlock(&isolation_lock);
> > +}
> > +
> > +/*
> > + * Test whether a driver is a "managed" driver for the group.  This allows
> > + * is to preempt normal driver matching and only let our drivers in.
> > + */
> > +bool isolation_group_manager_driver(struct isolation_group *group,
> > +				    struct device_driver *drv)
> > +{
> > +	struct isolation_manager_driver *driver;
> > +	bool found = false;
> > +
> > +	if (!group->manager)
> > +		return found;
> > +
> > +	mutex_lock(&isolation_lock);
> > +
> > +	list_for_each_entry(driver, &group->manager->drivers, list) {
> > +		if (driver->drv == drv) {
> > +			found = true;
> > +			break;
> > +		}
> > +	}
> > +
> > +	mutex_unlock(&isolation_lock);
> > +
> > +	return found;
> > +}
> > +
> > +/*
> > + * Does the group manager have control of all of the devices in the group?
> > + * We consider driver-less devices to be ok here as they don't do DMA and
> > + * don't present any interfaces to subvert the rest of the group.
> > + */
> > +bool isolation_group_manager_locked(struct isolation_group *group)
> > +{
> > +	struct isolation_device *device;
> > +	struct isolation_manager_driver *driver;
> > +	bool found, locked = true;
> > +
> > +	if (!group->manager)
> > +		return false;
> > +
> > +	mutex_lock(&isolation_lock);
> > +
> > +	list_for_each_entry(device, &group->devices, list) {
> > +		found = false;
> > +
> > +		if (!device->dev->driver)
> > +			continue;
> > +
> 
> You could simplify this using isolation_group_manager_driver(),
> couldn't you?

Yeah, probably.

> > +		list_for_each_entry(driver, &group->manager->drivers, list) {
> > +			if (device->dev->driver == driver->drv) {
> > +				found = true;
> > +				break;
> > +			}
> > +		}
> > +
> > +		if (!found) {
> > +			locked = false;
> > +			break;
> > +		}
> > +	}
> > +
> > +	mutex_unlock(&isolation_lock);
> > +
> > +	return locked;
> > +}
> > +
> > +static int __init isolation_init(void)
> > +{
> > +	isolation_kset = kset_create_and_add("isolation", NULL, NULL);
> > +	
> > +	WARN(!isolation_kset, "Failed to initialize isolation group kset\n");
> > +
> > +	return isolation_kset ? 0 : -1;
> 
> I'd be tempted to just BUG() here if you can't add the kset - I can't
> see any reason it would fail unless you're so short of RAM that you
> have bigger problems.  Making this a fatal fail would save having to
> double check if the kset is around in the later paths.

Yep.  Thanks for the comments,

Alex
David Gibson - March 14, 2012, 9:58 a.m.
On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote:
> On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote:
> > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote:
> > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
> > > ---
> > > 
> > >  drivers/base/Kconfig      |   10 +
> > >  drivers/base/Makefile     |    1 
> > >  drivers/base/base.h       |    5 
> > >  drivers/base/isolation.c  |  798 +++++++++++++++++++++++++++++++++++++++++++++
> > >  include/linux/device.h    |    4 
> > >  include/linux/isolation.h |  138 ++++++++
> > >  6 files changed, 956 insertions(+), 0 deletions(-)
> > >  create mode 100644 drivers/base/isolation.c
> > >  create mode 100644 include/linux/isolation.h
> > > 
> > > diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
> > > index 7be9f79..e98a5f3 100644
> > > --- a/drivers/base/Kconfig
> > > +++ b/drivers/base/Kconfig
> > > @@ -189,4 +189,14 @@ config DMA_SHARED_BUFFER
> > >  	  APIs extension; the file's descriptor can then be passed on to other
> > >  	  driver.
> > >  
> > > +config ISOLATION_GROUPS
> > > +	bool "Enable Isolation Group API"
> > > +	default n
> > > +	depends on EXPERIMENTAL && IOMMU_API
> > > +	help
> > > +	  This option enables grouping of devices into Isolation Groups
> > > +	  which may be used by other subsystems to perform quirks across
> > > +	  sets of devices as well as userspace drivers for guaranteeing
> > > +	  devices are isolated from the rest of the system.
> > > +
> > >  endmenu
> > > diff --git a/drivers/base/Makefile b/drivers/base/Makefile
> > > index 610f999..047b5f9 100644
> > > --- a/drivers/base/Makefile
> > > +++ b/drivers/base/Makefile
> > > @@ -19,6 +19,7 @@ obj-$(CONFIG_MODULES)	+= module.o
> > >  endif
> > >  obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o
> > >  obj-$(CONFIG_REGMAP)	+= regmap/
> > > +obj-$(CONFIG_ISOLATION_GROUPS)	+= isolation.o
> > >  
> > >  ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
> > >  
> > > diff --git a/drivers/base/base.h b/drivers/base/base.h
> > > index b858dfd..376758a 100644
> > > --- a/drivers/base/base.h
> > > +++ b/drivers/base/base.h
> > > @@ -1,4 +1,5 @@
> > >  #include <linux/notifier.h>
> > > +#include <linux/isolation.h>
> > >  
> > >  /**
> > >   * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure.
> > > @@ -108,6 +109,10 @@ extern int driver_probe_device(struct device_driver *drv, struct device *dev);
> > >  static inline int driver_match_device(struct device_driver *drv,
> > >  				      struct device *dev)
> > >  {
> > > +	if (isolation_group_managed(to_isolation_group(dev)) &&
> > > +	    !isolation_group_manager_driver(to_isolation_group(dev), drv))
> > > +		return 0;
> > > +
> > >  	return drv->bus->match ? drv->bus->match(dev, drv) : 1;
> > >  }
> > >  
> > > diff --git a/drivers/base/isolation.c b/drivers/base/isolation.c
> > > new file mode 100644
> > > index 0000000..c01365c
> > > --- /dev/null
> > > +++ b/drivers/base/isolation.c
> > > @@ -0,0 +1,798 @@
> > > +/*
> > > + * Isolation group interface
> > > + *
> > > + * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
> > > + * Author: Alex Williamson <alex.williamson@redhat.com>
> > > + *
> > > + * This program is free software; you can redistribute it and/or modify
> > > + * it under the terms of the GNU General Public License version 2 as
> > > + * published by the Free Software Foundation.
> > > + *
> > > + */
> > > +
> > > +#include <linux/device.h>
> > > +#include <linux/iommu.h>
> > > +#include <linux/isolation.h>
> > > +#include <linux/list.h>
> > > +#include <linux/mutex.h>
> > > +#include <linux/notifier.h>
> > > +#include <linux/slab.h>
> > > +#include <linux/uuid.h>
> > > +
> > > +static struct kset *isolation_kset;
> > > +/* XXX add more complete locking, maybe rcu */
> > > +static DEFINE_MUTEX(isolation_lock);
> > > +static LIST_HEAD(isolation_groups);
> > > +static LIST_HEAD(isolation_notifiers);
> > > +
> > > +/* Keep these private */
> > > +struct isolation_manager_driver {
> > > +	struct device_driver *drv;
> > > +	struct list_head list;
> > > +};
> > > +
> > > +struct isolation_manager {
> > > +	struct list_head drivers;
> > > +	/* Likely need managers to register some callbacks */
> > > +};
> > > +
> > > +struct isolation_group {
> > > +	struct list_head list;
> > > +	struct list_head devices;
> > > +	struct kobject kobj;
> > > +	struct kobject *devices_kobj;
> > > +	struct iommu_domain *iommu_domain;
> > > +	struct iommu_ops *iommu_ops;
> > > +	struct isolation_manager *manager;
> > > +	uuid_le uuid;
> > > +};
> > > +
> > > +struct isolation_device {
> > > +	struct device *dev;
> > > +	struct list_head list;
> > > +};
> > > +
> > > +struct isolation_notifier {
> > > +	struct bus_type *bus;
> > > +	struct notifier_block nb;
> > > +	struct blocking_notifier_head notifier;
> > > +	struct list_head list;
> > > +	int refcnt;
> > > +};
> > > +
> > > +struct iso_attribute {
> > > +	struct attribute attr;
> > > +	ssize_t (*show)(struct isolation_group *group, char *buf);
> > > +	ssize_t (*store)(struct isolation_group *group,
> > > +			 const char *buf, size_t count);
> > > +};
> > > +
> > > +#define to_iso_attr(_attr) container_of(_attr, struct iso_attribute, attr)
> > > +#define to_iso_group(_kobj) container_of(_kobj, struct isolation_group, kobj)
> > > +
> > > +static ssize_t iso_attr_show(struct kobject *kobj, struct attribute *attr,
> > > +			     char *buf)
> > > +{
> > > +	struct iso_attribute *iso_attr = to_iso_attr(attr);
> > > +	struct isolation_group *group = to_iso_group(kobj);
> > > +	ssize_t ret = -EIO;
> > > +
> > > +	if (iso_attr->show)
> > > +		ret = iso_attr->show(group, buf);
> > > +	return ret;
> > > +}
> > > +
> > > +static ssize_t iso_attr_store(struct kobject *kobj, struct attribute *attr,
> > > +			      const char *buf, size_t count)
> > > +{
> > > +	struct iso_attribute *iso_attr = to_iso_attr(attr);
> > > +	struct isolation_group *group = to_iso_group(kobj);
> > > +	ssize_t ret = -EIO;
> > > +
> > > +	if (iso_attr->store)
> > > +		ret = iso_attr->store(group, buf, count);
> > > +	return ret;
> > > +}
> > > +
> > > +static void iso_release(struct kobject *kobj)
> > > +{
> > > +	struct isolation_group *group = to_iso_group(kobj);
> > > +	kfree(group);
> > > +}
> > > +
> > > +static const struct sysfs_ops iso_sysfs_ops = {
> > > +	.show = iso_attr_show,
> > > +	.store = iso_attr_store,
> > > +};
> > > +
> > > +static struct kobj_type iso_ktype = {
> > > +	.sysfs_ops = &iso_sysfs_ops,
> > > +	.release = iso_release,
> > > +};
> > > +
> > > +/*
> > > + * Create an isolation group.  Isolation group "providers" register a
> > > + * notifier for their bus(es) and call this to create a new isolation
> > > + * group.
> > > + */
> > > +struct isolation_group *isolation_create_group(void)
> > > +{
> > > +	struct isolation_group *group, *tmp;
> > > +	int ret;
> > > +
> > > +	if (unlikely(!isolation_kset))
> > > +		return ERR_PTR(-ENODEV);
> > > +
> > > +	group = kzalloc(sizeof(*group), GFP_KERNEL);
> > > +	if (!group)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	/*
> > > +	 * Use a UUID for group identification simply because it seems wrong
> > > +	 * to expose it as a kernel pointer (%p).  Neither is user friendly.
> > > +	 * Mostly only expect userspace to do anything with this.
> > > +	 */
> > 
> > So, my plan was to require the isolation provider to give a unique
> > name - it can construct something that's actually meaningful (and with
> > luck, stable across boots).  For Power we'd use the PE number, for
> > VT-d, I was thinking the PCI device address would do it (of the "base"
> > device for the non-separable bridge and broken multifunction cases).
> 
> Naming always seems to end in "that's not unique" or "that doesn't apply
> to us", so I figured I'd just avoid the problem by using random numbers.
> We can allow providers to specify a name, but that still presents
> challenges to uniqueness and consistency if we intend to generically
> allow heterogeneous providers on a system.  In the VFIO use case I
> expect userspace will get the name from readlink on the isolation_group
> sysfs entry and open a vfio provided device file of the same name.  So
> in the end, it doesn't matter what the name is.

Hrm, maybe.

> > > +new_uuid:
> > > +	uuid_le_gen(&group->uuid);
> > > +
> > > +	/* Unlikely, but nothing prevents duplicates */
> > > +	list_for_each_entry(tmp, &isolation_groups, list)
> > > +		if (memcmp(&group->uuid, &tmp->uuid, sizeof(group->uuid)) == 0)
> > > +			goto new_uuid;
> > > +
> > > +	INIT_LIST_HEAD(&group->devices);
> > > +	group->kobj.kset = isolation_kset;
> > > +
> > > +	ret = kobject_init_and_add(&group->kobj, &iso_ktype, NULL,
> > > +				   "%pUl", &group->uuid);
> > 
> > Um.. isn't this setting the kobject name to the address of the uuid
> > plus "Ul", not the content of the uuid?
> 
> We have kernel extensions for %p, %pUl is a little endian UUID.  It
> prints in the right format, but I'll have to check if I'm getting the
> actual UUID or a UUID-ified pointer address.

Heh, and the modifiers bizarrely go after the 'p'.  Wow, ok.

> > > +	if (ret) {
> > > +		kfree(group);
> > > +		mutex_unlock(&isolation_lock);
> > > +		return ERR_PTR(ret);
> > > +	}
> > > +
> > > +	/* Add a subdirectory to save links for devices withing the group. */
> > > +	group->devices_kobj = kobject_create_and_add("devices", &group->kobj);
> > > +	if (!group->devices_kobj) {
> > > +		kobject_put(&group->kobj);
> > > +		mutex_unlock(&isolation_lock);
> > > +		return ERR_PTR(-ENOMEM);
> > > +	}
> > > +
> > > +	list_add(&group->list, &isolation_groups);
> > > +
> > > +	mutex_unlock(&isolation_lock);
> > > +
> > > +	return group;
> > > +}
> > > +
> > > +/*
> > > + * Free isolation group.  XXX need to cleanup/reject based on manager status.
> > > + */
> > > +int isolation_free_group(struct isolation_group *group)
> > > +{
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	if (!list_empty(&group->devices)) {
> > > +		mutex_unlock(&isolation_lock);
> > > +		return -EBUSY;
> > > +	}
> > > +
> > > +	list_del(&group->list);
> > > +	kobject_put(group->devices_kobj);
> > > +	kobject_put(&group->kobj);
> > > +
> > > +	mutex_unlock(&isolation_lock);
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * Add a device to an isolation group.  Isolation groups start empty and
> > > + * must be told about the devices they contain.  Expect this to be called
> > > + * from isolation group providers via notifier.
> > > + */
> > 
> > Doesn't necessarily have to be from a notifier, particularly if the
> > provider is integrated into host bridge code.
> 
> Sure, a provider could do this on it's own if it wants.  This just
> provides some infrastructure for a common path.  Also note that this
> helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider.  Yet
> to be seen whether that can reasonably be the case once isolation groups
> are added to streaming DMA paths.

Right, but other than the #ifdef safety, which could be achieved more
simply, I'm not seeing what benefit the infrastructure provides over
directly calling the bus notifier function.  The infrastructure groups
the notifiers by bus type internally, but AFAICT exactly one bus
notifier call would become exactly one isolation notifier call, and
the notifier callback itself would be almost identical.

> > > +int isolation_group_add_dev(struct isolation_group *group, struct device *dev)
> > > +{
> > > +	struct isolation_device *device;
> > > +	int ret = 0;
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	if (dev->isolation_group) {
> > > +		ret = -EBUSY;
> > > +		goto out;
> > 
> > This should probably be a BUG_ON() - the isolation provider shouldn't
> > be putting the same device into two different groups.
> 
> Yeah, probably.
> 
> > > +	}
> > > +
> > > +	device = kzalloc(sizeof(*device), GFP_KERNEL);
> > > +	if (!device) {
> > > +		ret = -ENOMEM;
> > > +		goto out;
> > > +	}
> > > +
> > > +	device->dev = dev;
> > > +
> > > +	/* Cross link the device in sysfs */
> > > +	ret = sysfs_create_link(&dev->kobj, &group->kobj,
> > > +				"isolation_group");
> > > +	if (ret) {
> > > +		kfree(device);
> > > +		goto out;
> > > +	}
> > > +	
> > > +	ret = sysfs_create_link(group->devices_kobj, &dev->kobj,
> > > +				kobject_name(&dev->kobj));
> > 
> > So, a problem both here and in my version is what to name the device
> > links.  Because they could be potentially be quite widely scattered,
> > I'm not sure that kobject_name() is guaranteed to be sufficiently
> > unique.
> 
> Even if the name is not, we're pointing to a unique sysfs location.  I
> expect the primary user of this will be when userspace translates a
> device to a group (via the isolation_group link below), then tries to
> walk all the devices in the group to determine effected host devices and
> manager driver status.  So it would probably be traversing the link back
> rather than relying solely on the name of the link.  Right?

Yes, but if we get a duplicate kobject_name() we could get duplicate
link names within the same directory so we're still in trouble.  We
could replace the names with just an index number.

> > > +	if (ret) {
> > > +		sysfs_remove_link(&dev->kobj, "isolation_group");
> > > +		kfree(device);
> > > +		goto out;
> > > +	}
> > > +
> > > +	list_add(&device->list, &group->devices);
> > > +
> > > +	dev->isolation_group = group;
> > > +
> > > +	if (group->iommu_domain) {
> > > +		ret = group->iommu_ops->attach_dev(group->iommu_domain, dev);
> > > +		if (ret) {
> > > +			/* XXX fail device? */
> > > +		}
> > 
> > So, I presume the idea is that this is a stop-gap until iommu_ops is
> > converted to be in terms of groups rather than indivdual devices?
> 
> Yes, I thought we could just back it by the iommu api for now so we can
> implement managers, but eventually domain_init and attach_dev would
> probably be a single callback in some kind of group aware iommu api.

Makes sense.

> > > +	}
> > > +
> > > +	/* XXX signal manager? */
> > 
> > Uh, what did you have in mind here?
> 
> Another notifier?  Maybe just a callback struct registered when a
> manager is added to a group.  On one hand, maybe it's not necessary
> because adding a device to a managed group already restricts driver
> matching, but on the other, the manager may want to be informed about
> new devices and try to actively bind a driver to it.  For instance, if
> assigning an entire PE to a guest, which might include empty slots,
> would the manager want to be informed about the addition of a device so
> it could mirror the addition to the guest assigned the PE?

Oh, right, I was misinterpreting the comment.  Instead of reading it
as "notify the isolation group manager that something is happening" I
read it as "do something to manage when some process is given a Unix
signal".  Hence my complete confusion.

> > > +out:
> > > +	mutex_unlock(&isolation_lock);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * Remove a device from an isolation group, likely because the device
> > > + * went away.
> > > + */
> > > +int isolation_group_del_dev(struct device *dev)
> > > +{
> > > +	struct isolation_group *group = dev->isolation_group;
> > > +	struct isolation_device *device;
> > > +
> > > +	if (!group)
> > > +		return -ENODEV;
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	list_for_each_entry(device, &group->devices, list) {
> > > +		if (device->dev == dev) {
> > > +			/* XXX signal manager? */
> > > +
> > > +			if (group->iommu_domain)
> > > +				group->iommu_ops->detach_dev(
> > > +					group->iommu_domain, dev);
> > > +			list_del(&device->list);
> > > +			kfree(device);
> > > +			dev->isolation_group = NULL;
> > > +			sysfs_remove_link(group->devices_kobj,
> > > +					  kobject_name(&dev->kobj));
> > > +			sysfs_remove_link(&dev->kobj, "isolation_group");
> > > +			break;
> > > +		}
> > > +	}
> > > +			
> > > +	mutex_unlock(&isolation_lock);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * Filter and call out to our own notifier chains
> > > + */
> > 
> > So, I haven't quite worked out what this isolation notifier
> > infrastructure gives you, as opposed to having isolation providers
> > directly register bus notifiers.
> 
> For now, it nicely separates CONFIG_ISOLATION code so I don't litter
> intel-iommu with #ifdefs.  It also embraces that devices are always
> being added and removed and supports that with very little change to
> existing code paths.

Well, I get the first bit, but it's a long way from the minimum needed
to de-#ifdef.  I still haven't seen what it's giving the provider that
wouldn't be just as simple with direct bus_notifier calls.

> > > +static int isolation_bus_notify(struct notifier_block *nb,
> > > +				unsigned long action, void *data)
> > > +{
> > > +	struct isolation_notifier *notifier =
> > > +		container_of(nb, struct isolation_notifier, nb);
> > > +	struct device *dev = data;
> > > +
> > > +	switch (action) {
> > > +	case BUS_NOTIFY_ADD_DEVICE:
> > > +		if (!dev->isolation_group)
> > > +			blocking_notifier_call_chain(&notifier->notifier,
> > > +					ISOLATION_NOTIFY_ADD_DEVICE, dev);
> > > +		break;
> > > +	case BUS_NOTIFY_DEL_DEVICE:
> > > +		if (dev->isolation_group)
> > > +			blocking_notifier_call_chain(&notifier->notifier,
> > > +					ISOLATION_NOTIFY_DEL_DEVICE, dev);
> > > +		break;
> > > +	}
> > > +
> > > +	return NOTIFY_DONE;
> > > +}
> > > +
> > > +/*
> > > + * Wrapper for manual playback of BUS_NOTIFY_ADD_DEVICE when we add
> > > + * a new notifier.
> > > + */
> > > +static int isolation_do_add_notify(struct device *dev, void *data)
> > > +{
> > > +	struct notifier_block *nb = data;
> > > +
> > > +	if (!dev->isolation_group)
> > > +		nb->notifier_call(nb, ISOLATION_NOTIFY_ADD_DEVICE, dev);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * Register a notifier.  This is for isolation group providers.  It's
> > > + * possible we could have multiple isolation group providers for the same
> > > + * bus type,
> > 
> > So the bit above doesn't seem to connect to the bit below.  We can
> > indeed have multiple isolation providers for one bus type, but the
> > below seems to be covering the (also possible, but different) case of
> > one isolation provider covering multiple bus types.
> 
> It covers both.  If a provider covers multiple buses it will call this
> function once for each bus.  Each new bus creates a new struct
> isolation_notifier and does a bus_register_notifier (using
> isolation_bus_notify).  The provider notifier block is added to our own
> notifier call chain for that struct isolation_notifier.  This way we
> only ever register a single notifier per bus, but support multiple
> providers for the bus.

Um, right, but what would be the problem with registering multiple
notifiers per bus.

> > > so we create a struct isolation_notifier per bus_type, each
> > > + * with a notifier block.  Providers are therefore welcome to register
> > > + * as many buses as they can handle.
> > > + */
> > > +int isolation_register_notifier(struct bus_type *bus, struct notifier_block *nb)
> > > +{
> > > +	struct isolation_notifier *notifier;
> > > +	bool found = false;
> > > +	int ret;
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +	list_for_each_entry(notifier, &isolation_notifiers, list) {
> > > +		if (notifier->bus == bus) {
> > > +			found = true;
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	if (!found) {
> > > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > > +		if (!notifier) {
> > > +			mutex_unlock(&isolation_lock);
> > > +			return -ENOMEM;	
> > > +		}
> > > +		notifier->bus = bus;
> > > +		notifier->nb.notifier_call = isolation_bus_notify;
> > > +		BLOCKING_INIT_NOTIFIER_HEAD(&notifier->notifier);
> > > +		bus_register_notifier(bus, &notifier->nb);
> > > +		list_add(&notifier->list, &isolation_notifiers);
> > > +	}
> > > +
> > > +	ret = blocking_notifier_chain_register(&notifier->notifier, nb);
> > > +	if (ret) {
> > > +		if (notifier->refcnt == 0) {
> > > +			bus_unregister_notifier(bus, &notifier->nb);
> > > +			list_del(&notifier->list);
> > > +			kfree(notifier);
> > > +		}
> > > +		mutex_unlock(&isolation_lock);
> > > +		return ret;
> > > +	}
> > > +	notifier->refcnt++;
> > > +
> > > +	mutex_unlock(&isolation_lock);
> > > +
> > > +	bus_for_each_dev(bus, NULL, nb, isolation_do_add_notify);
> > > +
> > > +	return 0;
> > > +}
> > 
> > So, somewhere, I think we need a fallback path, but I'm not sure
> > exactly where.  If an isolation provider doesn't explicitly put a
> > device into a group, the device should go into the group of its parent
> > bridge.  This covers the case of a bus with IOMMU which has below it a
> > bridge to a different type of DMA capable bus (which the IOMMU isn't
> > explicitly aware of).  DMAs from devices on the subordinate bus can be
> > translated by the top-level IOMMU (assuming it sees them as coming
> > from the bridge), but they all need to be treated as one group.
> 
> Why would the top level IOMMU provider not set the isolation group in
> this case.

Because it knows nothing about the subordinate bus.  For example
imagine a VT-d system, with a wacky PCI card into which you plug some
other type of DMA capable device.  The PCI card is acting as a bridge
from PCI to this, let's call it FooBus.  Obviously the VT-d code won't
have a FooBus notifier, since it knows nothing about FooBus.  But the
FooBus devices still need to end up in the group of the PCI bridge
device, since their DMA operations will appear as coming from the PCI
bridge card to the IOMMU, and can be isolated from the rest of the
system (but not each other) on that basis.

>  I agree there's a gap here, but it's fuzzy when and how it
> can occur and if it matters (devices without an isolation group can't be
> used by managers, and apparently don't make use of any of the services
> of a provider...).

> > > +/*
> > > + * Unregister...
> > > + */
> > > +int isolation_unregister_notifier(struct bus_type *bus,
> > > +				  struct notifier_block *nb)
> > > +{
> > > +	struct isolation_notifier *notifier;
> > > +	bool found = false;
> > > +	int ret;
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +	list_for_each_entry(notifier, &isolation_notifiers, list) {
> > > +		if (notifier->bus == bus) {
> > > +			found = true;
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	if (!found) {
> > > +		mutex_unlock(&isolation_lock);
> > > +		return -ENODEV;
> > > +	}
> > > +
> > > +	ret = blocking_notifier_chain_unregister(&notifier->notifier, nb);
> > > +	if (ret) {
> > > +		mutex_unlock(&isolation_lock);
> > > +		return ret;
> > > +	}
> > > +
> > > +	/* Free at nobody left in the notifier block */
> > > +	if (--notifier->refcnt == 0) {
> > > +		bus_unregister_notifier(bus, &notifier->nb);
> > > +		list_del(&notifier->list);
> > > +		kfree(notifier);
> > > +	}
> > > +
> > > +	mutex_unlock(&isolation_lock);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * Set iommu_ops on an isolation group.  Hopefully all DMA will eventually
> > > + * be done through isolation groups, for now, we just provide another
> > > + * interface to the iommu api.
> > > + */
> > > +int isolation_group_set_iommu_ops(struct isolation_group *group,
> > > +				  struct iommu_ops *iommu_ops)
> > > +{
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	if (group->iommu_ops) {
> > > +		mutex_unlock(&isolation_lock);
> > > +		return -EBUSY;
> > > +	}
> > > +
> > > +	group->iommu_ops = iommu_ops;
> > > +
> > > +	mutex_unlock(&isolation_lock);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * Attach all the devices for a group to the specified iommu domain.
> > > + */
> > > +static int __isolation_group_domain_attach_devs(struct iommu_domain *domain,
> > > +					        struct list_head *devices)
> > > +{
> > > +	struct isolation_device *device;
> > > +	struct device *dev;
> > > +	int ret = 0;
> > > +
> > > +	list_for_each_entry(device, devices, list) {
> > > +		dev = device->dev;
> > > +
> > > +		ret = domain->ops->attach_dev(domain, dev);
> > > +		if (ret) {
> > > +			list_for_each_entry_continue_reverse(device,
> > > +							     devices, list) {
> > > +				dev = device->dev;
> > > +				domain->ops->detach_dev(domain, dev);
> > > +			}
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	return ret;
> > > +}
> > > +
> > > +/*
> > > + * And detach...
> > > + */
> > > +static void __isolation_group_domain_detach_devs(struct iommu_domain *domain,
> > > +						 struct list_head *devices)
> > > +{
> > > +	struct isolation_device *device;
> > > +	struct device *dev;
> > > +
> > > +	list_for_each_entry(device, devices, list) {
> > > +		dev = device->dev;
> > > +
> > > +		domain->ops->detach_dev(domain, dev);
> > > +	}
> > > +}
> > > +
> > > +/*
> > > + * Initialize the iommu domain for an isolation group.  This is to ease the
> > > + * transition to using isolation groups and expected to be used by current
> > > + * users of the iommu api for now.
> > > + */
> > > +int isolation_group_init_iommu_domain(struct isolation_group *group)
> > > +{
> > > +	int ret;
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	if (!group->iommu_ops || list_empty(&group->devices)) {
> > > +		mutex_unlock(&isolation_lock);
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	if (group->iommu_domain) {
> > > +		mutex_unlock(&isolation_lock);
> > > +		return -EBUSY;
> > > +	}
> > > +
> > > +	group->iommu_domain = kzalloc(sizeof(struct iommu_domain), GFP_KERNEL);
> > > +	if (!group->iommu_domain) {
> > > +		mutex_unlock(&isolation_lock);
> > > +		return -ENOMEM;
> > > +	}
> > > +
> > > +	group->iommu_domain->ops = group->iommu_ops;
> > > +
> > > +	ret = group->iommu_ops->domain_init(group->iommu_domain);
> > > +	if (ret) {
> > > +		kfree(group->iommu_domain);
> > > +		group->iommu_domain = NULL;
> > > +		mutex_unlock(&isolation_lock);
> > > +		return ret;
> > > +	}
> > > +
> > > +	/* Automatically attach all the devices in the group. */
> > > +	ret = __isolation_group_domain_attach_devs(group->iommu_domain,
> > > +						   &group->devices);
> > > +	if (ret) {
> > > +		group->iommu_ops->domain_destroy(group->iommu_domain);
> > > +		kfree(group->iommu_domain);
> > > +		group->iommu_domain = NULL;
> > > +		mutex_unlock(&isolation_lock);
> > > +		return ret;
> > > +	}
> > > +		
> > > +	mutex_unlock(&isolation_lock);
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * And free...
> > > + * Note just below, we add the ability to add another group to an iommu
> > > + * domain, so we don't always destroy and free the domain here.
> > > + */
> > > +void isolation_group_free_iommu_domain(struct isolation_group *group)
> > > +{
> > > +	struct isolation_group *tmp;
> > > +	bool inuse = false;
> > > +
> > > +	if (!group->iommu_domain || !group->iommu_ops)
> > > +		return;
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	__isolation_group_domain_detach_devs(group->iommu_domain,
> > > +					     &group->devices);
> > > +
> > > +	list_for_each_entry(tmp, &isolation_groups, list) {
> > > +		if (tmp == group)
> > > +			continue;
> > > +		if (tmp->iommu_domain == group->iommu_domain) {
> > > +			inuse = true;
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	if (!inuse) {
> > > +		group->iommu_ops->domain_destroy(group->iommu_domain);
> > > +		kfree(group->iommu_domain);
> > > +	}
> > > +
> > > +	group->iommu_domain = NULL;
> > > +
> > > +	mutex_unlock(&isolation_lock);
> > > +}
> > > +
> > > +/*
> > > + * This handles the VFIO "merge" interface, allowing us to manage multiple
> > > + * isolation groups with a single domain.  We just rely on attach_dev to
> > > + * tell us whether this is ok.
> > > + */
> > > +int isolation_group_iommu_domain_add_group(struct isolation_group *groupa,
> > > +					   struct isolation_group *groupb)
> > > +{
> > > +	int ret;
> > > +
> > > +	if (!groupa->iommu_domain ||
> > > +	    groupb->iommu_domain || list_empty(&groupb->devices))
> > > +		return -EINVAL;
> > > +
> > > +	if (groupa->iommu_ops != groupb->iommu_ops)
> > > +		return -EIO;
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	ret = __isolation_group_domain_attach_devs(groupa->iommu_domain,
> > > +						   &groupb->devices);
> > > +	if (ret) {
> > > +		mutex_unlock(&isolation_lock);
> > > +		return ret;
> > > +	}
> > > +
> > > +	groupb->iommu_domain = groupa->iommu_domain;
> > > +
> > > +	mutex_unlock(&isolation_lock);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * XXX page mapping/unmapping and more iommu api wrappers
> > > + */
> > > +
> > > +/*
> > > + * Do we have a group manager?  Group managers restrict what drivers are
> > > + * allowed to attach to devices.  A group is "locked" when all of the devices
> > > + * for a group belong to group manager drivers (or no driver at all).  At
> > > + * that point, the group manager can initialize the iommu.  vfio is an example
> > > + * of a group manager and vfio-pci is an exanple of a driver that a group
> > > + * manager may add as a "managed" driver.  Note that we don't gate iommu
> > > + * manipulation on being managed because we want to use it for regular DMA
> > > + * at some point as well.
> > > + */
> > > +bool isolation_group_managed(struct isolation_group *group)
> > > +{
> > > +	return group->manager != NULL;
> > > +}
> > > +
> > > +/*
> > > + * Initialize the group manager struct.  At this point the isolation group
> > > + * becomes "managed".
> > > + */
> > 
> > This assumes a separate manager struct for each group.  I would have
> > thought the VFIO merge case would be more obviously represented by
> > having a single manager struct for all the groups in the VFIO instance
> > (== iommu domain).
> 
> Given the objections to merging groups, I thought perhaps it would be
> safer to explicitly make it pertain to the iommu domain here.

My problem with merging is about the interface to it starting with
just group handles.  I have no problem with a scheme where you start
an instance/context/domain and then attach multiple groups to it.

>  My vision
> of managers and merging is still solidifying, but the manager init seems
> like a first step to claiming a group while attempting to share
> resources comes a while later.  I don't see much overhead on either the
> grouping code or the manager code for keeping these separate.  Do you?

Hrm, not sure yet.

> > > +int isolation_group_init_manager(struct isolation_group *group)
> > > +{
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	if (group->manager) {
> > > +		mutex_unlock(&isolation_lock);
> > > +		return -EBUSY;
> > > +	}
> > > +
> > > +	group->manager = kzalloc(sizeof(struct isolation_manager), GFP_KERNEL);
> > > +	if (!group->manager) {
> > > +		mutex_unlock(&isolation_lock);
> > > +		return -ENOMEM;
> > > +	}
> > > +
> > > +	mutex_unlock(&isolation_lock);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * And free... cleanup registerd managed drivers too.
> > > + */
> > > +void isolation_group_free_manager(struct isolation_group *group)
> > > +{
> > > +	struct isolation_manager_driver *driver, *tmp;
> > > +
> > > +	if (!group->manager)
> > > +		return;
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	list_for_each_entry_safe(driver, tmp, &group->manager->drivers, list) {
> > > +		list_del(&driver->list);
> > > +		kfree(driver);
> > > +	}
> > > +		
> > > +	kfree(group->manager);
> > > +	group->manager = NULL;
> > > +	mutex_unlock(&isolation_lock);
> > > +}
> > > +
> > > +/*
> > > + * Add a managed driver.  Drivers added here are the only ones that will
> > > + * be allowed to attach to a managed isolation group.
> > > + */
> > > +int isolation_group_manager_add_driver(struct isolation_group *group,
> > > +				       struct device_driver *drv)
> > > +{
> > > +	struct isolation_manager_driver *driver;
> > > +
> > > +	if (!group->manager)
> > > +		return -EINVAL;
> > > +
> > > +	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
> > > +	if (!driver)
> > > +		return -ENOMEM;
> > > +
> > > +	driver->drv = drv;
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	list_add(&driver->list, &group->manager->drivers);
> > > +
> > > +	mutex_unlock(&isolation_lock);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * And remove a driver.  Don't really expect to need this much.
> > > + */
> > > +void isolation_group_manager_del_driver(struct isolation_group *group,
> > > +				        struct device_driver *drv)
> > > +{
> > > +	struct isolation_manager_driver *driver;
> > > +
> > > +	if (!group->manager)
> > > +		return;
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	list_for_each_entry(driver, &group->manager->drivers, list) {
> > > +		if (driver->drv == drv) {
> > > +			list_del(&driver->list);
> > > +			kfree(driver);
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	mutex_unlock(&isolation_lock);
> > > +}
> > > +
> > > +/*
> > > + * Test whether a driver is a "managed" driver for the group.  This allows
> > > + * is to preempt normal driver matching and only let our drivers in.
> > > + */
> > > +bool isolation_group_manager_driver(struct isolation_group *group,
> > > +				    struct device_driver *drv)
> > > +{
> > > +	struct isolation_manager_driver *driver;
> > > +	bool found = false;
> > > +
> > > +	if (!group->manager)
> > > +		return found;
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	list_for_each_entry(driver, &group->manager->drivers, list) {
> > > +		if (driver->drv == drv) {
> > > +			found = true;
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	mutex_unlock(&isolation_lock);
> > > +
> > > +	return found;
> > > +}
> > > +
> > > +/*
> > > + * Does the group manager have control of all of the devices in the group?
> > > + * We consider driver-less devices to be ok here as they don't do DMA and
> > > + * don't present any interfaces to subvert the rest of the group.
> > > + */
> > > +bool isolation_group_manager_locked(struct isolation_group *group)
> > > +{
> > > +	struct isolation_device *device;
> > > +	struct isolation_manager_driver *driver;
> > > +	bool found, locked = true;
> > > +
> > > +	if (!group->manager)
> > > +		return false;
> > > +
> > > +	mutex_lock(&isolation_lock);
> > > +
> > > +	list_for_each_entry(device, &group->devices, list) {
> > > +		found = false;
> > > +
> > > +		if (!device->dev->driver)
> > > +			continue;
> > > +
> > 
> > You could simplify this using isolation_group_manager_driver(),
> > couldn't you?
> 
> Yeah, probably.
> 
> > > +		list_for_each_entry(driver, &group->manager->drivers, list) {
> > > +			if (device->dev->driver == driver->drv) {
> > > +				found = true;
> > > +				break;
> > > +			}
> > > +		}
> > > +
> > > +		if (!found) {
> > > +			locked = false;
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	mutex_unlock(&isolation_lock);
> > > +
> > > +	return locked;
> > > +}
> > > +
> > > +static int __init isolation_init(void)
> > > +{
> > > +	isolation_kset = kset_create_and_add("isolation", NULL, NULL);
> > > +	
> > > +	WARN(!isolation_kset, "Failed to initialize isolation group kset\n");
> > > +
> > > +	return isolation_kset ? 0 : -1;
> > 
> > I'd be tempted to just BUG() here if you can't add the kset - I can't
> > see any reason it would fail unless you're so short of RAM that you
> > have bigger problems.  Making this a fatal fail would save having to
> > double check if the kset is around in the later paths.
> 
> Yep.  Thanks for the comments,
> 
> Alex
> 
>
Alex Williamson - March 15, 2012, 8:15 p.m.
On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote:
> On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote:
> > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote:
> > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote:
> > > > +/*
> > > > + * Add a device to an isolation group.  Isolation groups start empty and
> > > > + * must be told about the devices they contain.  Expect this to be called
> > > > + * from isolation group providers via notifier.
> > > > + */
> > > 
> > > Doesn't necessarily have to be from a notifier, particularly if the
> > > provider is integrated into host bridge code.
> > 
> > Sure, a provider could do this on it's own if it wants.  This just
> > provides some infrastructure for a common path.  Also note that this
> > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider.  Yet
> > to be seen whether that can reasonably be the case once isolation groups
> > are added to streaming DMA paths.
> 
> Right, but other than the #ifdef safety, which could be achieved more
> simply, I'm not seeing what benefit the infrastructure provides over
> directly calling the bus notifier function.  The infrastructure groups
> the notifiers by bus type internally, but AFAICT exactly one bus
> notifier call would become exactly one isolation notifier call, and
> the notifier callback itself would be almost identical.

I guess I don't see this as a fundamental design point of the proposal,
it's just a convenient way to initialize groups as a side-band addition
until isolation groups become a more fundamental part of the iommu
infrastructure.  If you want to do that level of integration in your
provider, do it and make the callbacks w/o the notifier.  If nobody ends
up using them, we'll remove them.  Maybe it will just end up being a
bootstrap.  In the typical case, yes, one bus notifier is one isolation
notifier.  It does however also allow one bus notifier to become
multiple isolation notifiers, and includes some filtering that would
just be duplicated if every provider decided to implement their own bus
notifier.

> > > > +int isolation_group_add_dev(struct isolation_group *group, struct device *dev)
> > > > +{
> > > > +	struct isolation_device *device;
> > > > +	int ret = 0;
> > > > +
> > > > +	mutex_lock(&isolation_lock);
> > > > +
> > > > +	if (dev->isolation_group) {
> > > > +		ret = -EBUSY;
> > > > +		goto out;
> > > 
> > > This should probably be a BUG_ON() - the isolation provider shouldn't
> > > be putting the same device into two different groups.
> > 
> > Yeah, probably.
> > 
> > > > +	}
> > > > +
> > > > +	device = kzalloc(sizeof(*device), GFP_KERNEL);
> > > > +	if (!device) {
> > > > +		ret = -ENOMEM;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	device->dev = dev;
> > > > +
> > > > +	/* Cross link the device in sysfs */
> > > > +	ret = sysfs_create_link(&dev->kobj, &group->kobj,
> > > > +				"isolation_group");
> > > > +	if (ret) {
> > > > +		kfree(device);
> > > > +		goto out;
> > > > +	}
> > > > +	
> > > > +	ret = sysfs_create_link(group->devices_kobj, &dev->kobj,
> > > > +				kobject_name(&dev->kobj));
> > > 
> > > So, a problem both here and in my version is what to name the device
> > > links.  Because they could be potentially be quite widely scattered,
> > > I'm not sure that kobject_name() is guaranteed to be sufficiently
> > > unique.
> > 
> > Even if the name is not, we're pointing to a unique sysfs location.  I
> > expect the primary user of this will be when userspace translates a
> > device to a group (via the isolation_group link below), then tries to
> > walk all the devices in the group to determine effected host devices and
> > manager driver status.  So it would probably be traversing the link back
> > rather than relying solely on the name of the link.  Right?
> 
> Yes, but if we get a duplicate kobject_name() we could get duplicate
> link names within the same directory so we're still in trouble.  We
> could replace the names with just an index number.

This seems exceptionally unlikely, doesn't it?  Maybe
sysfs_create_link() will fail in such a case as we can append some
identifier to the name?

> > > > +	if (ret) {
> > > > +		sysfs_remove_link(&dev->kobj, "isolation_group");
> > > > +		kfree(device);
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	list_add(&device->list, &group->devices);
> > > > +
> > > > +	dev->isolation_group = group;
> > > > +
> > > > +	if (group->iommu_domain) {
> > > > +		ret = group->iommu_ops->attach_dev(group->iommu_domain, dev);
> > > > +		if (ret) {
> > > > +			/* XXX fail device? */
> > > > +		}
> > > 
> > > So, I presume the idea is that this is a stop-gap until iommu_ops is
> > > converted to be in terms of groups rather than indivdual devices?
> > 
> > Yes, I thought we could just back it by the iommu api for now so we can
> > implement managers, but eventually domain_init and attach_dev would
> > probably be a single callback in some kind of group aware iommu api.
> 
> Makes sense.
> 
> > > > +	}
> > > > +
> > > > +	/* XXX signal manager? */
> > > 
> > > Uh, what did you have in mind here?
> > 
> > Another notifier?  Maybe just a callback struct registered when a
> > manager is added to a group.  On one hand, maybe it's not necessary
> > because adding a device to a managed group already restricts driver
> > matching, but on the other, the manager may want to be informed about
> > new devices and try to actively bind a driver to it.  For instance, if
> > assigning an entire PE to a guest, which might include empty slots,
> > would the manager want to be informed about the addition of a device so
> > it could mirror the addition to the guest assigned the PE?
> 
> Oh, right, I was misinterpreting the comment.  Instead of reading it
> as "notify the isolation group manager that something is happening" I
> read it as "do something to manage when some process is given a Unix
> signal".  Hence my complete confusion.
> 
> > > > +out:
> > > > +	mutex_unlock(&isolation_lock);
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Remove a device from an isolation group, likely because the device
> > > > + * went away.
> > > > + */
> > > > +int isolation_group_del_dev(struct device *dev)
> > > > +{
> > > > +	struct isolation_group *group = dev->isolation_group;
> > > > +	struct isolation_device *device;
> > > > +
> > > > +	if (!group)
> > > > +		return -ENODEV;
> > > > +
> > > > +	mutex_lock(&isolation_lock);
> > > > +
> > > > +	list_for_each_entry(device, &group->devices, list) {
> > > > +		if (device->dev == dev) {
> > > > +			/* XXX signal manager? */
> > > > +
> > > > +			if (group->iommu_domain)
> > > > +				group->iommu_ops->detach_dev(
> > > > +					group->iommu_domain, dev);
> > > > +			list_del(&device->list);
> > > > +			kfree(device);
> > > > +			dev->isolation_group = NULL;
> > > > +			sysfs_remove_link(group->devices_kobj,
> > > > +					  kobject_name(&dev->kobj));
> > > > +			sysfs_remove_link(&dev->kobj, "isolation_group");
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +			
> > > > +	mutex_unlock(&isolation_lock);
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Filter and call out to our own notifier chains
> > > > + */
> > > 
> > > So, I haven't quite worked out what this isolation notifier
> > > infrastructure gives you, as opposed to having isolation providers
> > > directly register bus notifiers.
> > 
> > For now, it nicely separates CONFIG_ISOLATION code so I don't litter
> > intel-iommu with #ifdefs.  It also embraces that devices are always
> > being added and removed and supports that with very little change to
> > existing code paths.
> 
> Well, I get the first bit, but it's a long way from the minimum needed
> to de-#ifdef.  I still haven't seen what it's giving the provider that
> wouldn't be just as simple with direct bus_notifier calls.

I think this is covered above.

> > > > +static int isolation_bus_notify(struct notifier_block *nb,
> > > > +				unsigned long action, void *data)
> > > > +{
> > > > +	struct isolation_notifier *notifier =
> > > > +		container_of(nb, struct isolation_notifier, nb);
> > > > +	struct device *dev = data;
> > > > +
> > > > +	switch (action) {
> > > > +	case BUS_NOTIFY_ADD_DEVICE:
> > > > +		if (!dev->isolation_group)
> > > > +			blocking_notifier_call_chain(&notifier->notifier,
> > > > +					ISOLATION_NOTIFY_ADD_DEVICE, dev);
> > > > +		break;
> > > > +	case BUS_NOTIFY_DEL_DEVICE:
> > > > +		if (dev->isolation_group)
> > > > +			blocking_notifier_call_chain(&notifier->notifier,
> > > > +					ISOLATION_NOTIFY_DEL_DEVICE, dev);
> > > > +		break;
> > > > +	}
> > > > +
> > > > +	return NOTIFY_DONE;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Wrapper for manual playback of BUS_NOTIFY_ADD_DEVICE when we add
> > > > + * a new notifier.
> > > > + */
> > > > +static int isolation_do_add_notify(struct device *dev, void *data)
> > > > +{
> > > > +	struct notifier_block *nb = data;
> > > > +
> > > > +	if (!dev->isolation_group)
> > > > +		nb->notifier_call(nb, ISOLATION_NOTIFY_ADD_DEVICE, dev);
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/*
> > > > + * Register a notifier.  This is for isolation group providers.  It's
> > > > + * possible we could have multiple isolation group providers for the same
> > > > + * bus type,
> > > 
> > > So the bit above doesn't seem to connect to the bit below.  We can
> > > indeed have multiple isolation providers for one bus type, but the
> > > below seems to be covering the (also possible, but different) case of
> > > one isolation provider covering multiple bus types.
> > 
> > It covers both.  If a provider covers multiple buses it will call this
> > function once for each bus.  Each new bus creates a new struct
> > isolation_notifier and does a bus_register_notifier (using
> > isolation_bus_notify).  The provider notifier block is added to our own
> > notifier call chain for that struct isolation_notifier.  This way we
> > only ever register a single notifier per bus, but support multiple
> > providers for the bus.
> 
> Um, right, but what would be the problem with registering multiple
> notifiers per bus.

It's a minor optimization, but this way we can stop when one of the
providers decides it "owns" the device.  As noted, I'm not stuck to this
notifier and in the end, it doesn't gate creating groups using the
isolation functions directly.

> > > > so we create a struct isolation_notifier per bus_type, each
> > > > + * with a notifier block.  Providers are therefore welcome to register
> > > > + * as many buses as they can handle.
> > > > + */
> > > > +int isolation_register_notifier(struct bus_type *bus, struct notifier_block *nb)
> > > > +{
> > > > +	struct isolation_notifier *notifier;
> > > > +	bool found = false;
> > > > +	int ret;
> > > > +
> > > > +	mutex_lock(&isolation_lock);
> > > > +	list_for_each_entry(notifier, &isolation_notifiers, list) {
> > > > +		if (notifier->bus == bus) {
> > > > +			found = true;
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	if (!found) {
> > > > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > > > +		if (!notifier) {
> > > > +			mutex_unlock(&isolation_lock);
> > > > +			return -ENOMEM;	
> > > > +		}
> > > > +		notifier->bus = bus;
> > > > +		notifier->nb.notifier_call = isolation_bus_notify;
> > > > +		BLOCKING_INIT_NOTIFIER_HEAD(&notifier->notifier);
> > > > +		bus_register_notifier(bus, &notifier->nb);
> > > > +		list_add(&notifier->list, &isolation_notifiers);
> > > > +	}
> > > > +
> > > > +	ret = blocking_notifier_chain_register(&notifier->notifier, nb);
> > > > +	if (ret) {
> > > > +		if (notifier->refcnt == 0) {
> > > > +			bus_unregister_notifier(bus, &notifier->nb);
> > > > +			list_del(&notifier->list);
> > > > +			kfree(notifier);
> > > > +		}
> > > > +		mutex_unlock(&isolation_lock);
> > > > +		return ret;
> > > > +	}
> > > > +	notifier->refcnt++;
> > > > +
> > > > +	mutex_unlock(&isolation_lock);
> > > > +
> > > > +	bus_for_each_dev(bus, NULL, nb, isolation_do_add_notify);
> > > > +
> > > > +	return 0;
> > > > +}
> > > 
> > > So, somewhere, I think we need a fallback path, but I'm not sure
> > > exactly where.  If an isolation provider doesn't explicitly put a
> > > device into a group, the device should go into the group of its parent
> > > bridge.  This covers the case of a bus with IOMMU which has below it a
> > > bridge to a different type of DMA capable bus (which the IOMMU isn't
> > > explicitly aware of).  DMAs from devices on the subordinate bus can be
> > > translated by the top-level IOMMU (assuming it sees them as coming
> > > from the bridge), but they all need to be treated as one group.
> > 
> > Why would the top level IOMMU provider not set the isolation group in
> > this case.
> 
> Because it knows nothing about the subordinate bus.  For example
> imagine a VT-d system, with a wacky PCI card into which you plug some
> other type of DMA capable device.  The PCI card is acting as a bridge
> from PCI to this, let's call it FooBus.  Obviously the VT-d code won't
> have a FooBus notifier, since it knows nothing about FooBus.  But the
> FooBus devices still need to end up in the group of the PCI bridge
> device, since their DMA operations will appear as coming from the PCI
> bridge card to the IOMMU, and can be isolated from the rest of the
> system (but not each other) on that basis.

I guess I was imagining that it's ok to have devices without an
isolation group.  When that happens we can traverse up the bus to find a
higher level isolation group.  It would probably make sense to have some
kind of top-most isolation group that contains everything as it's an
easy proof that if you own everything, you're isolated.  Potentially
though, wackyPCIcard can also register isolation groups for each of it's
FooBus devices if it's able to provide that capability.  Thanks,

Alex
David Gibson - March 16, 2012, 3:45 a.m.
On Thu, Mar 15, 2012 at 02:15:01PM -0600, Alex Williamson wrote:
> On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote:
> > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote:
> > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote:
> > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote:
> > > > > +/*
> > > > > + * Add a device to an isolation group.  Isolation groups start empty and
> > > > > + * must be told about the devices they contain.  Expect this to be called
> > > > > + * from isolation group providers via notifier.
> > > > > + */
> > > > 
> > > > Doesn't necessarily have to be from a notifier, particularly if the
> > > > provider is integrated into host bridge code.
> > > 
> > > Sure, a provider could do this on it's own if it wants.  This just
> > > provides some infrastructure for a common path.  Also note that this
> > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider.  Yet
> > > to be seen whether that can reasonably be the case once isolation groups
> > > are added to streaming DMA paths.
> > 
> > Right, but other than the #ifdef safety, which could be achieved more
> > simply, I'm not seeing what benefit the infrastructure provides over
> > directly calling the bus notifier function.  The infrastructure groups
> > the notifiers by bus type internally, but AFAICT exactly one bus
> > notifier call would become exactly one isolation notifier call, and
> > the notifier callback itself would be almost identical.
> 
> I guess I don't see this as a fundamental design point of the proposal,
> it's just a convenient way to initialize groups as a side-band addition
> until isolation groups become a more fundamental part of the iommu
> infrastructure.  If you want to do that level of integration in your
> provider, do it and make the callbacks w/o the notifier.  If nobody ends
> up using them, we'll remove them.  Maybe it will just end up being a
> bootstrap.  In the typical case, yes, one bus notifier is one isolation
> notifier.  It does however also allow one bus notifier to become
> multiple isolation notifiers, and includes some filtering that would
> just be duplicated if every provider decided to implement their own bus
> notifier.

Uh.. I didn't notice any filtering?  That's why I'm asking.

> > > > > +int isolation_group_add_dev(struct isolation_group *group, struct device *dev)
> > > > > +{
> > > > > +	struct isolation_device *device;
> > > > > +	int ret = 0;
> > > > > +
> > > > > +	mutex_lock(&isolation_lock);
> > > > > +
> > > > > +	if (dev->isolation_group) {
> > > > > +		ret = -EBUSY;
> > > > > +		goto out;
> > > > 
> > > > This should probably be a BUG_ON() - the isolation provider shouldn't
> > > > be putting the same device into two different groups.
> > > 
> > > Yeah, probably.
> > > 
> > > > > +	}
> > > > > +
> > > > > +	device = kzalloc(sizeof(*device), GFP_KERNEL);
> > > > > +	if (!device) {
> > > > > +		ret = -ENOMEM;
> > > > > +		goto out;
> > > > > +	}
> > > > > +
> > > > > +	device->dev = dev;
> > > > > +
> > > > > +	/* Cross link the device in sysfs */
> > > > > +	ret = sysfs_create_link(&dev->kobj, &group->kobj,
> > > > > +				"isolation_group");
> > > > > +	if (ret) {
> > > > > +		kfree(device);
> > > > > +		goto out;
> > > > > +	}
> > > > > +	
> > > > > +	ret = sysfs_create_link(group->devices_kobj, &dev->kobj,
> > > > > +				kobject_name(&dev->kobj));
> > > > 
> > > > So, a problem both here and in my version is what to name the device
> > > > links.  Because they could be potentially be quite widely scattered,
> > > > I'm not sure that kobject_name() is guaranteed to be sufficiently
> > > > unique.
> > > 
> > > Even if the name is not, we're pointing to a unique sysfs location.  I
> > > expect the primary user of this will be when userspace translates a
> > > device to a group (via the isolation_group link below), then tries to
> > > walk all the devices in the group to determine effected host devices and
> > > manager driver status.  So it would probably be traversing the link back
> > > rather than relying solely on the name of the link.  Right?
> > 
> > Yes, but if we get a duplicate kobject_name() we could get duplicate
> > link names within the same directory so we're still in trouble.  We
> > could replace the names with just an index number.
> 
> This seems exceptionally unlikely, doesn't it?

Hmm, I don't think I have enough data to judge it particularly
unlikely.  Just that it's possible.

>  Maybe
> sysfs_create_link() will fail in such a case as we can append some
> identifier to the name?

That could work.

> > > > > +	if (ret) {
> > > > > +		sysfs_remove_link(&dev->kobj, "isolation_group");
> > > > > +		kfree(device);
> > > > > +		goto out;
> > > > > +	}
> > > > > +
> > > > > +	list_add(&device->list, &group->devices);
> > > > > +
> > > > > +	dev->isolation_group = group;
> > > > > +
> > > > > +	if (group->iommu_domain) {
> > > > > +		ret = group->iommu_ops->attach_dev(group->iommu_domain, dev);
> > > > > +		if (ret) {
> > > > > +			/* XXX fail device? */
> > > > > +		}
> > > > 
> > > > So, I presume the idea is that this is a stop-gap until iommu_ops is
> > > > converted to be in terms of groups rather than indivdual devices?
> > > 
> > > Yes, I thought we could just back it by the iommu api for now so we can
> > > implement managers, but eventually domain_init and attach_dev would
> > > probably be a single callback in some kind of group aware iommu api.
> > 
> > Makes sense.
> > 
> > > > > +	}
> > > > > +
> > > > > +	/* XXX signal manager? */
> > > > 
> > > > Uh, what did you have in mind here?
> > > 
> > > Another notifier?  Maybe just a callback struct registered when a
> > > manager is added to a group.  On one hand, maybe it's not necessary
> > > because adding a device to a managed group already restricts driver
> > > matching, but on the other, the manager may want to be informed about
> > > new devices and try to actively bind a driver to it.  For instance, if
> > > assigning an entire PE to a guest, which might include empty slots,
> > > would the manager want to be informed about the addition of a device so
> > > it could mirror the addition to the guest assigned the PE?
> > 
> > Oh, right, I was misinterpreting the comment.  Instead of reading it
> > as "notify the isolation group manager that something is happening" I
> > read it as "do something to manage when some process is given a Unix
> > signal".  Hence my complete confusion.
> > 
> > > > > +out:
> > > > > +	mutex_unlock(&isolation_lock);
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Remove a device from an isolation group, likely because the device
> > > > > + * went away.
> > > > > + */
> > > > > +int isolation_group_del_dev(struct device *dev)
> > > > > +{
> > > > > +	struct isolation_group *group = dev->isolation_group;
> > > > > +	struct isolation_device *device;
> > > > > +
> > > > > +	if (!group)
> > > > > +		return -ENODEV;
> > > > > +
> > > > > +	mutex_lock(&isolation_lock);
> > > > > +
> > > > > +	list_for_each_entry(device, &group->devices, list) {
> > > > > +		if (device->dev == dev) {
> > > > > +			/* XXX signal manager? */
> > > > > +
> > > > > +			if (group->iommu_domain)
> > > > > +				group->iommu_ops->detach_dev(
> > > > > +					group->iommu_domain, dev);
> > > > > +			list_del(&device->list);
> > > > > +			kfree(device);
> > > > > +			dev->isolation_group = NULL;
> > > > > +			sysfs_remove_link(group->devices_kobj,
> > > > > +					  kobject_name(&dev->kobj));
> > > > > +			sysfs_remove_link(&dev->kobj, "isolation_group");
> > > > > +			break;
> > > > > +		}
> > > > > +	}
> > > > > +			
> > > > > +	mutex_unlock(&isolation_lock);
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Filter and call out to our own notifier chains
> > > > > + */
> > > > 
> > > > So, I haven't quite worked out what this isolation notifier
> > > > infrastructure gives you, as opposed to having isolation providers
> > > > directly register bus notifiers.
> > > 
> > > For now, it nicely separates CONFIG_ISOLATION code so I don't litter
> > > intel-iommu with #ifdefs.  It also embraces that devices are always
> > > being added and removed and supports that with very little change to
> > > existing code paths.
> > 
> > Well, I get the first bit, but it's a long way from the minimum needed
> > to de-#ifdef.  I still haven't seen what it's giving the provider that
> > wouldn't be just as simple with direct bus_notifier calls.
> 
> I think this is covered above.
> 
> > > > > +static int isolation_bus_notify(struct notifier_block *nb,
> > > > > +				unsigned long action, void *data)
> > > > > +{
> > > > > +	struct isolation_notifier *notifier =
> > > > > +		container_of(nb, struct isolation_notifier, nb);
> > > > > +	struct device *dev = data;
> > > > > +
> > > > > +	switch (action) {
> > > > > +	case BUS_NOTIFY_ADD_DEVICE:
> > > > > +		if (!dev->isolation_group)
> > > > > +			blocking_notifier_call_chain(&notifier->notifier,
> > > > > +					ISOLATION_NOTIFY_ADD_DEVICE, dev);
> > > > > +		break;
> > > > > +	case BUS_NOTIFY_DEL_DEVICE:
> > > > > +		if (dev->isolation_group)
> > > > > +			blocking_notifier_call_chain(&notifier->notifier,
> > > > > +					ISOLATION_NOTIFY_DEL_DEVICE, dev);
> > > > > +		break;
> > > > > +	}
> > > > > +
> > > > > +	return NOTIFY_DONE;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Wrapper for manual playback of BUS_NOTIFY_ADD_DEVICE when we add
> > > > > + * a new notifier.
> > > > > + */
> > > > > +static int isolation_do_add_notify(struct device *dev, void *data)
> > > > > +{
> > > > > +	struct notifier_block *nb = data;
> > > > > +
> > > > > +	if (!dev->isolation_group)
> > > > > +		nb->notifier_call(nb, ISOLATION_NOTIFY_ADD_DEVICE, dev);
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Register a notifier.  This is for isolation group providers.  It's
> > > > > + * possible we could have multiple isolation group providers for the same
> > > > > + * bus type,
> > > > 
> > > > So the bit above doesn't seem to connect to the bit below.  We can
> > > > indeed have multiple isolation providers for one bus type, but the
> > > > below seems to be covering the (also possible, but different) case of
> > > > one isolation provider covering multiple bus types.
> > > 
> > > It covers both.  If a provider covers multiple buses it will call this
> > > function once for each bus.  Each new bus creates a new struct
> > > isolation_notifier and does a bus_register_notifier (using
> > > isolation_bus_notify).  The provider notifier block is added to our own
> > > notifier call chain for that struct isolation_notifier.  This way we
> > > only ever register a single notifier per bus, but support multiple
> > > providers for the bus.
> > 
> > Um, right, but what would be the problem with registering multiple
> > notifiers per bus.
> 
> It's a minor optimization, but this way we can stop when one of the
> providers decides it "owns" the device.  As noted, I'm not stuck to this
> notifier and in the end, it doesn't gate creating groups using the
> isolation functions directly.

Hm, ok.

> > > > > so we create a struct isolation_notifier per bus_type, each
> > > > > + * with a notifier block.  Providers are therefore welcome to register
> > > > > + * as many buses as they can handle.
> > > > > + */
> > > > > +int isolation_register_notifier(struct bus_type *bus, struct notifier_block *nb)
> > > > > +{
> > > > > +	struct isolation_notifier *notifier;
> > > > > +	bool found = false;
> > > > > +	int ret;
> > > > > +
> > > > > +	mutex_lock(&isolation_lock);
> > > > > +	list_for_each_entry(notifier, &isolation_notifiers, list) {
> > > > > +		if (notifier->bus == bus) {
> > > > > +			found = true;
> > > > > +			break;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	if (!found) {
> > > > > +		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
> > > > > +		if (!notifier) {
> > > > > +			mutex_unlock(&isolation_lock);
> > > > > +			return -ENOMEM;	
> > > > > +		}
> > > > > +		notifier->bus = bus;
> > > > > +		notifier->nb.notifier_call = isolation_bus_notify;
> > > > > +		BLOCKING_INIT_NOTIFIER_HEAD(&notifier->notifier);
> > > > > +		bus_register_notifier(bus, &notifier->nb);
> > > > > +		list_add(&notifier->list, &isolation_notifiers);
> > > > > +	}
> > > > > +
> > > > > +	ret = blocking_notifier_chain_register(&notifier->notifier, nb);
> > > > > +	if (ret) {
> > > > > +		if (notifier->refcnt == 0) {
> > > > > +			bus_unregister_notifier(bus, &notifier->nb);
> > > > > +			list_del(&notifier->list);
> > > > > +			kfree(notifier);
> > > > > +		}
> > > > > +		mutex_unlock(&isolation_lock);
> > > > > +		return ret;
> > > > > +	}
> > > > > +	notifier->refcnt++;
> > > > > +
> > > > > +	mutex_unlock(&isolation_lock);
> > > > > +
> > > > > +	bus_for_each_dev(bus, NULL, nb, isolation_do_add_notify);
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > 
> > > > So, somewhere, I think we need a fallback path, but I'm not sure
> > > > exactly where.  If an isolation provider doesn't explicitly put a
> > > > device into a group, the device should go into the group of its parent
> > > > bridge.  This covers the case of a bus with IOMMU which has below it a
> > > > bridge to a different type of DMA capable bus (which the IOMMU isn't
> > > > explicitly aware of).  DMAs from devices on the subordinate bus can be
> > > > translated by the top-level IOMMU (assuming it sees them as coming
> > > > from the bridge), but they all need to be treated as one group.
> > > 
> > > Why would the top level IOMMU provider not set the isolation group in
> > > this case.
> > 
> > Because it knows nothing about the subordinate bus.  For example
> > imagine a VT-d system, with a wacky PCI card into which you plug some
> > other type of DMA capable device.  The PCI card is acting as a bridge
> > from PCI to this, let's call it FooBus.  Obviously the VT-d code won't
> > have a FooBus notifier, since it knows nothing about FooBus.  But the
> > FooBus devices still need to end up in the group of the PCI bridge
> > device, since their DMA operations will appear as coming from the PCI
> > bridge card to the IOMMU, and can be isolated from the rest of the
> > system (but not each other) on that basis.
> 
> I guess I was imagining that it's ok to have devices without an
> isolation group.

It is, but having NULL isolation group has a pretty specific meaning -
it means it's never safe to give that device to userspace, but it also
means that normal kernel driver operation of that device must not
interfere with anything in any iso group (otherwise we can never no
that those iso groups _are_ safe to hand out).  Likewise userspace
operation of any isolation group can't mess with no-group devices.

None of these conditions is true for the hypothetical Foobus case.
The bus as a whole could be safely given to userspace, the devices on
it *could* mess with an existing isolation group (suppose the group
consisted of a PCI segment with the FooBus bridge plus another PCI
card - FooBus DMAs would be bridged onto the PCI segment and might
target the other card's MMIO space).  And other grouped devices can
certainly mess with the FooBus devices (if userspace grabs the bridge
and manipulates its IOMMU mappings, that would clearly screw up any
kernel drivers doing DMA from FooBus devices behind it).

Oh.. and I just thought of an existing-in-the-field case with the same
problem.  I'm pretty sure for devices that can appear on PCI but also
have "dumb-bus" versions used on embedded systems, at least some of
the kernel drivers are structured so that there is a separate struct
device for the PCI "wrapper" and the device inside.  If the inner
driver is initiating DMA, as it usually would, it has to be in the
same iso group as it's PCI device parent.

>  When that happens we can traverse up the bus to find a
> higher level isolation group.

Well, that's one possible answer to my "where should the hook be
question": rather than an initialization hook, when we look up a
device's isolation group, if it doesn't say one explicitly, we try
it's bridge parent and so forth up the tree.  I wonder about the
overhead of having to walk all the way up the bus heirarchy before
returning NULL whenever we ask about the group of a device that
doesn't have one.

>  It would probably make sense to have some
> kind of top-most isolation group that contains everything as it's an
> easy proof that if you own everything, you're isolated.

Hrm, no.  Things that have no IOMMU above them will have ungated
access to system RAM, and so can never be safely isolated for
userspace purposes, even if userspace owned every _device_ in the
system (not that you could do that in practice, anyway).

>  Potentially
> though, wackyPCIcard can also register isolation groups for each of it's
> FooBus devices if it's able to provide that capability.  Thanks,

It could, but why should it.  It doesn't know anything about IOMMUs or
isolation, and it doesn't need to.  Even less so for PCI devices which
create subordinate non-PCI struct devices for internal reasons.
Alex Williamson - March 16, 2012, 7:31 p.m.
On Fri, 2012-03-16 at 14:45 +1100, David Gibson wrote:
> On Thu, Mar 15, 2012 at 02:15:01PM -0600, Alex Williamson wrote:
> > On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote:
> > > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote:
> > > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote:
> > > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote:
> > > > > > +/*
> > > > > > + * Add a device to an isolation group.  Isolation groups start empty and
> > > > > > + * must be told about the devices they contain.  Expect this to be called
> > > > > > + * from isolation group providers via notifier.
> > > > > > + */
> > > > > 
> > > > > Doesn't necessarily have to be from a notifier, particularly if the
> > > > > provider is integrated into host bridge code.
> > > > 
> > > > Sure, a provider could do this on it's own if it wants.  This just
> > > > provides some infrastructure for a common path.  Also note that this
> > > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider.  Yet
> > > > to be seen whether that can reasonably be the case once isolation groups
> > > > are added to streaming DMA paths.
> > > 
> > > Right, but other than the #ifdef safety, which could be achieved more
> > > simply, I'm not seeing what benefit the infrastructure provides over
> > > directly calling the bus notifier function.  The infrastructure groups
> > > the notifiers by bus type internally, but AFAICT exactly one bus
> > > notifier call would become exactly one isolation notifier call, and
> > > the notifier callback itself would be almost identical.
> > 
> > I guess I don't see this as a fundamental design point of the proposal,
> > it's just a convenient way to initialize groups as a side-band addition
> > until isolation groups become a more fundamental part of the iommu
> > infrastructure.  If you want to do that level of integration in your
> > provider, do it and make the callbacks w/o the notifier.  If nobody ends
> > up using them, we'll remove them.  Maybe it will just end up being a
> > bootstrap.  In the typical case, yes, one bus notifier is one isolation
> > notifier.  It does however also allow one bus notifier to become
> > multiple isolation notifiers, and includes some filtering that would
> > just be duplicated if every provider decided to implement their own bus
> > notifier.
> 
> Uh.. I didn't notice any filtering?  That's why I'm asking.

Not much, but a little:

+       switch (action) {
+       case BUS_NOTIFY_ADD_DEVICE:
+               if (!dev->isolation_group)
+                       blocking_notifier_call_chain(&notifier->notifier,
+                                       ISOLATION_NOTIFY_ADD_DEVICE, dev);
+               break;
+       case BUS_NOTIFY_DEL_DEVICE:
+               if (dev->isolation_group)
+                       blocking_notifier_call_chain(&notifier->notifier,
+                                       ISOLATION_NOTIFY_DEL_DEVICE, dev);
+               break;
+       }

...
> > > > > So, somewhere, I think we need a fallback path, but I'm not sure
> > > > > exactly where.  If an isolation provider doesn't explicitly put a
> > > > > device into a group, the device should go into the group of its parent
> > > > > bridge.  This covers the case of a bus with IOMMU which has below it a
> > > > > bridge to a different type of DMA capable bus (which the IOMMU isn't
> > > > > explicitly aware of).  DMAs from devices on the subordinate bus can be
> > > > > translated by the top-level IOMMU (assuming it sees them as coming
> > > > > from the bridge), but they all need to be treated as one group.
> > > > 
> > > > Why would the top level IOMMU provider not set the isolation group in
> > > > this case.
> > > 
> > > Because it knows nothing about the subordinate bus.  For example
> > > imagine a VT-d system, with a wacky PCI card into which you plug some
> > > other type of DMA capable device.  The PCI card is acting as a bridge
> > > from PCI to this, let's call it FooBus.  Obviously the VT-d code won't
> > > have a FooBus notifier, since it knows nothing about FooBus.  But the
> > > FooBus devices still need to end up in the group of the PCI bridge
> > > device, since their DMA operations will appear as coming from the PCI
> > > bridge card to the IOMMU, and can be isolated from the rest of the
> > > system (but not each other) on that basis.
> > 
> > I guess I was imagining that it's ok to have devices without an
> > isolation group.
> 
> It is, but having NULL isolation group has a pretty specific meaning -
> it means it's never safe to give that device to userspace, but it also
> means that normal kernel driver operation of that device must not
> interfere with anything in any iso group (otherwise we can never no
> that those iso groups _are_ safe to hand out).  Likewise userspace
> operation of any isolation group can't mess with no-group devices.

This is where wanting to use isolation groups as the working unit for an
iommu ops layer and also wanting to use iommu ops to replace dma ops
seem to collide a bit.  Do we want two interfaces for dma, one group
based and one for non-isolated devices?  Isolation providers like
intel-iommu would always use one, non-isolation capable dma paths, like
swiotlb or non-isolation hardware iommus, would use another.  And how do
we factor in the degree of isolation to avoid imposing policy in the
kernel?  MSI isolation is an example.  We should allow userspace to set
a policy of whether lack of MSI protection is an acceptable risk.  Does
that mean we can have isolation groups that provide no isolation and
sysfs files indicating capabilities?  Perhaps certain classes of groups
don't even allow manager binding?

> None of these conditions is true for the hypothetical Foobus case.
> The bus as a whole could be safely given to userspace, the devices on
> it *could* mess with an existing isolation group (suppose the group
> consisted of a PCI segment with the FooBus bridge plus another PCI
> card - FooBus DMAs would be bridged onto the PCI segment and might
> target the other card's MMIO space).  And other grouped devices can
> certainly mess with the FooBus devices (if userspace grabs the bridge
> and manipulates its IOMMU mappings, that would clearly screw up any
> kernel drivers doing DMA from FooBus devices behind it).

---segment-----+---------------+
               |               |
        [whackyPCIcard]   [other device]
               |
               +---FooBus---+-------+
                            |       |
                       [FooDev1] [FooDev2]

This is another example of the quality of the isolation group and what
factors we incorporate in judging that.  If the bridge/switch port
generating the segment does not support or enable PCI ACS then the IOMMU
may be able to differentiate whackyPCIcard from the other device but not
prevent peer-to-peer traffic between the two (or between FooDevs and the
other device - same difference).  This would suggest that we might have
an isolation group at the root of the segment for which the provider can
guarantee isolation (includes everything on and below the bus), and we
might also have isolation groups at whackyPCIcard and the other device
that have a difference quality of isolation.  /me pauses for rant about
superior hardware... ;)

> Oh.. and I just thought of an existing-in-the-field case with the same
> problem.  I'm pretty sure for devices that can appear on PCI but also
> have "dumb-bus" versions used on embedded systems, at least some of
> the kernel drivers are structured so that there is a separate struct
> device for the PCI "wrapper" and the device inside.  If the inner
> driver is initiating DMA, as it usually would, it has to be in the
> same iso group as it's PCI device parent.

I don't know that we can ever generically identify such devices, but
maybe this introduces the idea that we need to be able to blacklist
certain devices as multi-homed and tainting any isolation group that
contains them.

> >  When that happens we can traverse up the bus to find a
> > higher level isolation group.
> 
> Well, that's one possible answer to my "where should the hook be
> question": rather than an initialization hook, when we look up a
> device's isolation group, if it doesn't say one explicitly, we try
> it's bridge parent and so forth up the tree.  I wonder about the
> overhead of having to walk all the way up the bus heirarchy before
> returning NULL whenever we ask about the group of a device that
> doesn't have one.

Yep, that could definitely be a concern for streaming dma.

> >  It would probably make sense to have some
> > kind of top-most isolation group that contains everything as it's an
> > easy proof that if you own everything, you're isolated.
> 
> Hrm, no.  Things that have no IOMMU above them will have ungated
> access to system RAM, and so can never be safely isolated for
> userspace purposes, even if userspace owned every _device_ in the
> system (not that you could do that in practice, anyway).

RAM is a good point, there are "non-devices" to worry about.
Potentially that top level group doesn't allow managers.

> >  Potentially
> > though, wackyPCIcard can also register isolation groups for each of it's
> > FooBus devices if it's able to provide that capability.  Thanks,
> 
> It could, but why should it.  It doesn't know anything about IOMMUs or
> isolation, and it doesn't need to.  Even less so for PCI devices which
> create subordinate non-PCI struct devices for internal reasons.

Sorry I wasn't clear.  If wackyPCIcard happens to include an onboard
IOMMU of some sort that's capable of isolation, it might be able to
register groups for each of the devices below it.  Therefore we could
end up with a scenario like above that the segment may not have ACS and
therefore not be able to isolate wackyPCIcard from otherdevice, but
wackyPCIcard can isolate FooDev1 from FooDev2 and otherdevice.  I think
that means we potentially need to check all devices downstream of an
isolation group to allow a manager to lock it, as well as checking the
path upstream to make sure it isn't used above...  Are you sure we're
ready for this kind of infrastructure?  Thanks,

Alex
David Gibson - March 17, 2012, 4:57 a.m.
On Fri, Mar 16, 2012 at 01:31:18PM -0600, Alex Williamson wrote:
> On Fri, 2012-03-16 at 14:45 +1100, David Gibson wrote:
> > On Thu, Mar 15, 2012 at 02:15:01PM -0600, Alex Williamson wrote:
> > > On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote:
> > > > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote:
> > > > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote:
> > > > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote:
> > > > > > > +/*
> > > > > > > + * Add a device to an isolation group.  Isolation groups start empty and
> > > > > > > + * must be told about the devices they contain.  Expect this to be called
> > > > > > > + * from isolation group providers via notifier.
> > > > > > > + */
> > > > > > 
> > > > > > Doesn't necessarily have to be from a notifier, particularly if the
> > > > > > provider is integrated into host bridge code.
> > > > > 
> > > > > Sure, a provider could do this on it's own if it wants.  This just
> > > > > provides some infrastructure for a common path.  Also note that this
> > > > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider.  Yet
> > > > > to be seen whether that can reasonably be the case once isolation groups
> > > > > are added to streaming DMA paths.
> > > > 
> > > > Right, but other than the #ifdef safety, which could be achieved more
> > > > simply, I'm not seeing what benefit the infrastructure provides over
> > > > directly calling the bus notifier function.  The infrastructure groups
> > > > the notifiers by bus type internally, but AFAICT exactly one bus
> > > > notifier call would become exactly one isolation notifier call, and
> > > > the notifier callback itself would be almost identical.
> > > 
> > > I guess I don't see this as a fundamental design point of the proposal,
> > > it's just a convenient way to initialize groups as a side-band addition
> > > until isolation groups become a more fundamental part of the iommu
> > > infrastructure.  If you want to do that level of integration in your
> > > provider, do it and make the callbacks w/o the notifier.  If nobody ends
> > > up using them, we'll remove them.  Maybe it will just end up being a
> > > bootstrap.  In the typical case, yes, one bus notifier is one isolation
> > > notifier.  It does however also allow one bus notifier to become
> > > multiple isolation notifiers, and includes some filtering that would
> > > just be duplicated if every provider decided to implement their own bus
> > > notifier.
> > 
> > Uh.. I didn't notice any filtering?  That's why I'm asking.
> 
> Not much, but a little:
> 
> +       switch (action) {
> +       case BUS_NOTIFY_ADD_DEVICE:
> +               if (!dev->isolation_group)
> +                       blocking_notifier_call_chain(&notifier->notifier,
> +                                       ISOLATION_NOTIFY_ADD_DEVICE, dev);
> +               break;
> +       case BUS_NOTIFY_DEL_DEVICE:
> +               if (dev->isolation_group)
> +                       blocking_notifier_call_chain(&notifier->notifier,
> +                                       ISOLATION_NOTIFY_DEL_DEVICE, dev);
> +               break;
> +       }


Ah, I see, fair enough.

A couple of tangential observations.  First, I suspect using
BUS_NOTIFY_DEL_DEVICE is a very roundabout way of handling hot-unplug,
it might be better to have an unplug callback in the group instead.

Second, I don't think aborting the call chain early for hot-plug is
actually a good idea.  I can't see a clear guarantee on the order, so
individual providers couldn't rely on that short-cut behaviour.  Which
means that if two providers would have attempted to claim the same
device, something is seriously wrong and we should probably report
that.

> ...
> > > > > > So, somewhere, I think we need a fallback path, but I'm not sure
> > > > > > exactly where.  If an isolation provider doesn't explicitly put a
> > > > > > device into a group, the device should go into the group of its parent
> > > > > > bridge.  This covers the case of a bus with IOMMU which has below it a
> > > > > > bridge to a different type of DMA capable bus (which the IOMMU isn't
> > > > > > explicitly aware of).  DMAs from devices on the subordinate bus can be
> > > > > > translated by the top-level IOMMU (assuming it sees them as coming
> > > > > > from the bridge), but they all need to be treated as one group.
> > > > > 
> > > > > Why would the top level IOMMU provider not set the isolation group in
> > > > > this case.
> > > > 
> > > > Because it knows nothing about the subordinate bus.  For example
> > > > imagine a VT-d system, with a wacky PCI card into which you plug some
> > > > other type of DMA capable device.  The PCI card is acting as a bridge
> > > > from PCI to this, let's call it FooBus.  Obviously the VT-d code won't
> > > > have a FooBus notifier, since it knows nothing about FooBus.  But the
> > > > FooBus devices still need to end up in the group of the PCI bridge
> > > > device, since their DMA operations will appear as coming from the PCI
> > > > bridge card to the IOMMU, and can be isolated from the rest of the
> > > > system (but not each other) on that basis.
> > > 
> > > I guess I was imagining that it's ok to have devices without an
> > > isolation group.
> > 
> > It is, but having NULL isolation group has a pretty specific meaning -
> > it means it's never safe to give that device to userspace, but it also
> > means that normal kernel driver operation of that device must not
> > interfere with anything in any iso group (otherwise we can never no
> > that those iso groups _are_ safe to hand out).  Likewise userspace
> > operation of any isolation group can't mess with no-group devices.
> 
> This is where wanting to use isolation groups as the working unit for an
> iommu ops layer and also wanting to use iommu ops to replace dma ops
> seem to collide a bit.  Do we want two interfaces for dma, one group
> based and one for non-isolated devices?

Well, I don't know enough about what dwmw2 had planned to answer
that.  I was assuming the in-kernel dma reply would remain device/bus
focussed, and how it looked up and used the groups was an internal
matter.  I would expect it would probably need a fallback path for "no
group" for transition purposes, even if it wasn't planned to keep that
forever.

>  Isolation providers like
> intel-iommu would always use one, non-isolation capable dma paths, like
> swiotlb or non-isolation hardware iommus, would use another.  And how do
> we factor in the degree of isolation to avoid imposing policy in the
> kernel?  MSI isolation is an example.  We should allow userspace to set
> a policy of whether lack of MSI protection is an acceptable risk.  Does
> that mean we can have isolation groups that provide no isolation and
> sysfs files indicating capabilities?  Perhaps certain classes of groups
> don't even allow manager binding?

Ugh.  I thought about flagging groups with some sort of bitmap of
isolation strength properties, but I think that way lies madness.  We
might want a global policy bitmask though which expresses which
constraints are acceptable risks.  The providers would then have to
provide groups which are as strong as requested, or none at all.

> > None of these conditions is true for the hypothetical Foobus case.
> > The bus as a whole could be safely given to userspace, the devices on
> > it *could* mess with an existing isolation group (suppose the group
> > consisted of a PCI segment with the FooBus bridge plus another PCI
> > card - FooBus DMAs would be bridged onto the PCI segment and might
> > target the other card's MMIO space).  And other grouped devices can
> > certainly mess with the FooBus devices (if userspace grabs the bridge
> > and manipulates its IOMMU mappings, that would clearly screw up any
> > kernel drivers doing DMA from FooBus devices behind it).
> 
> ---segment-----+---------------+
>                |               |
>         [whackyPCIcard]   [other device]
>                |
>                +---FooBus---+-------+
>                             |       |
>                        [FooDev1] [FooDev2]
> 
> This is another example of the quality of the isolation group and what
> factors we incorporate in judging that.  If the bridge/switch port
> generating the segment does not support or enable PCI ACS then the IOMMU
> may be able to differentiate whackyPCIcard from the other device but not
> prevent peer-to-peer traffic between the two (or between FooDevs and the
> other device - same difference).  This would suggest that we might have
> an isolation group at the root of the segment for which the provider can
> guarantee isolation (includes everything on and below the bus), and we
> might also have isolation groups at whackyPCIcard and the other device
> that have a difference quality of isolation.  /me pauses for rant about
> superior hardware... ;)

Nested isolation groups? Please no.

An isolation group at the bridge encompassing that segment was exactly
what I had in mind, but my point is that all the FooBus devices *also*
need to be in that group, even though the isolation provider knows
nothing about FooBus.

> > Oh.. and I just thought of an existing-in-the-field case with the same
> > problem.  I'm pretty sure for devices that can appear on PCI but also
> > have "dumb-bus" versions used on embedded systems, at least some of
> > the kernel drivers are structured so that there is a separate struct
> > device for the PCI "wrapper" and the device inside.  If the inner
> > driver is initiating DMA, as it usually would, it has to be in the
> > same iso group as it's PCI device parent.
> 
> I don't know that we can ever generically identify such devices, but
> maybe this introduces the idea that we need to be able to blacklist
> certain devices as multi-homed and tainting any isolation group that
> contains them.

Sorry, I was unclear.  These are not multi-homed devices, just
multiple-variant devices, any given instance will either be PCI or
not.  But to handle the two variants easily, the drivers have nested
struct devices.  It's essentially a pure software artifact we're
dealing with here, but nonetheless we need to get the group-ownership
of those struct devices correct, whether they're synthetic or not.

> > >  When that happens we can traverse up the bus to find a
> > > higher level isolation group.
> > 
> > Well, that's one possible answer to my "where should the hook be
> > question": rather than an initialization hook, when we look up a
> > device's isolation group, if it doesn't say one explicitly, we try
> > it's bridge parent and so forth up the tree.  I wonder about the
> > overhead of having to walk all the way up the bus heirarchy before
> > returning NULL whenever we ask about the group of a device that
> > doesn't have one.
> 
> Yep, that could definitely be a concern for streaming dma.

Right, which is why I'm suggesting handling at init time instead.
We'd need something that runs in the generic hot-add path, after bus
notifiers, which would set the device's group to the same as its
parent's if a provider hasn't already given it one.

> > >  It would probably make sense to have some
> > > kind of top-most isolation group that contains everything as it's an
> > > easy proof that if you own everything, you're isolated.
> > 
> > Hrm, no.  Things that have no IOMMU above them will have ungated
> > access to system RAM, and so can never be safely isolated for
> > userspace purposes, even if userspace owned every _device_ in the
> > system (not that you could do that in practice, anyway).
> 
> RAM is a good point, there are "non-devices" to worry about.
> Potentially that top level group doesn't allow managers.

I don't really see the point of a "top-level" group.  Groups are
exclusive, not heirarchical, and I don't see that there are any paths
really simplified by having a (not manageable) "leftovers" group.

> > >  Potentially
> > > though, wackyPCIcard can also register isolation groups for each of it's
> > > FooBus devices if it's able to provide that capability.  Thanks,
> > 
> > It could, but why should it.  It doesn't know anything about IOMMUs or
> > isolation, and it doesn't need to.  Even less so for PCI devices which
> > create subordinate non-PCI struct devices for internal reasons.
> 
> Sorry I wasn't clear.  If wackyPCIcard happens to include an onboard
> IOMMU of some sort that's capable of isolation, it might be able to
> register groups for each of the devices below it.  Therefore we could
> end up with a scenario like above that the segment may not have ACS and
> therefore not be able to isolate wackyPCIcard from otherdevice, but
> wackyPCIcard can isolate FooDev1 from FooDev2 and otherdevice.  I think
> that means we potentially need to check all devices downstream of an
> isolation group to allow a manager to lock it, as well as checking the
> path upstream to make sure it isn't used above...  Are you sure we're
> ready for this kind of infrastructure?  Thanks,

Hrm, at this stage I think I'm prepared to assume that IOMMUs are
either strictly independent (e.g. covering different PCI domains) or
are at least vaguely aware of each other (i.e. platform conventions
cover the combination).  Truly nested IOMMUs, that are entirely
unaware of each other is a problem for another day (by which I mean
probably never).

That said, again, groups have to be exclusive, not heirarchical, so
in the above case, I think we'd have a few possible ways to go:

	1) The FooBus isolation provider could refuse to initialize if
the FooBus bridge is sharing an iso group with something else before
it probes the FooBus.
	2) The FooBus isolation provider could be in the FooBus bridge
driver - meaning that if someone tried to grab the PCI group including
the FooBridge, the provider driver would be booted out, causing the
FooBus groups to cease to exist (meaning the PCI group manager would
never get lock while someone was already using the FooGroups).  In
this case, it gets a bit complex.  When the FooBus isolation provider
is active, the FooBus devices would be in their own groups, not the
group of the FooBridge and its sibling.  When the FooBus isolation
provider is removed, it would have to configure the FooBus IOMMU to a
passthrough mode, and revert the FooBus devices to the parent's
group.  Hm.  Complicated.
Alex Williamson - March 21, 2012, 9:12 p.m.
On Sat, 2012-03-17 at 15:57 +1100, David Gibson wrote:
> On Fri, Mar 16, 2012 at 01:31:18PM -0600, Alex Williamson wrote:
> > On Fri, 2012-03-16 at 14:45 +1100, David Gibson wrote:
> > > On Thu, Mar 15, 2012 at 02:15:01PM -0600, Alex Williamson wrote:
> > > > On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote:
> > > > > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote:
> > > > > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote:
> > > > > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote:
> > > > > > > > +/*
> > > > > > > > + * Add a device to an isolation group.  Isolation groups start empty and
> > > > > > > > + * must be told about the devices they contain.  Expect this to be called
> > > > > > > > + * from isolation group providers via notifier.
> > > > > > > > + */
> > > > > > > 
> > > > > > > Doesn't necessarily have to be from a notifier, particularly if the
> > > > > > > provider is integrated into host bridge code.
> > > > > > 
> > > > > > Sure, a provider could do this on it's own if it wants.  This just
> > > > > > provides some infrastructure for a common path.  Also note that this
> > > > > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider.  Yet
> > > > > > to be seen whether that can reasonably be the case once isolation groups
> > > > > > are added to streaming DMA paths.
> > > > > 
> > > > > Right, but other than the #ifdef safety, which could be achieved more
> > > > > simply, I'm not seeing what benefit the infrastructure provides over
> > > > > directly calling the bus notifier function.  The infrastructure groups
> > > > > the notifiers by bus type internally, but AFAICT exactly one bus
> > > > > notifier call would become exactly one isolation notifier call, and
> > > > > the notifier callback itself would be almost identical.
> > > > 
> > > > I guess I don't see this as a fundamental design point of the proposal,
> > > > it's just a convenient way to initialize groups as a side-band addition
> > > > until isolation groups become a more fundamental part of the iommu
> > > > infrastructure.  If you want to do that level of integration in your
> > > > provider, do it and make the callbacks w/o the notifier.  If nobody ends
> > > > up using them, we'll remove them.  Maybe it will just end up being a
> > > > bootstrap.  In the typical case, yes, one bus notifier is one isolation
> > > > notifier.  It does however also allow one bus notifier to become
> > > > multiple isolation notifiers, and includes some filtering that would
> > > > just be duplicated if every provider decided to implement their own bus
> > > > notifier.
> > > 
> > > Uh.. I didn't notice any filtering?  That's why I'm asking.
> > 
> > Not much, but a little:
> > 
> > +       switch (action) {
> > +       case BUS_NOTIFY_ADD_DEVICE:
> > +               if (!dev->isolation_group)
> > +                       blocking_notifier_call_chain(&notifier->notifier,
> > +                                       ISOLATION_NOTIFY_ADD_DEVICE, dev);
> > +               break;
> > +       case BUS_NOTIFY_DEL_DEVICE:
> > +               if (dev->isolation_group)
> > +                       blocking_notifier_call_chain(&notifier->notifier,
> > +                                       ISOLATION_NOTIFY_DEL_DEVICE, dev);
> > +               break;
> > +       }
> 
> 
> Ah, I see, fair enough.
> 
> A couple of tangential observations.  First, I suspect using
> BUS_NOTIFY_DEL_DEVICE is a very roundabout way of handling hot-unplug,
> it might be better to have an unplug callback in the group instead.

There's really one already.  Assuming the device is attached to a group
driver, the .remove entry point on the driver will get called first.  It
doesn't get to return error, but it can block.  The DEL_DEVICE will only
happen after the group driver has relinquished the device, or at least
returned from remove().  For a device with no driver, the group would
only learn about it through the notifier, but it probably doesn't need
anything more direct.

> Second, I don't think aborting the call chain early for hot-plug is
> actually a good idea.  I can't see a clear guarantee on the order, so
> individual providers couldn't rely on that short-cut behaviour.  Which
> means that if two providers would have attempted to claim the same
> device, something is seriously wrong and we should probably report
> that.

Yeah, that seems like a reasonable safety check.

> > ...
> > > > > > > So, somewhere, I think we need a fallback path, but I'm not sure
> > > > > > > exactly where.  If an isolation provider doesn't explicitly put a
> > > > > > > device into a group, the device should go into the group of its parent
> > > > > > > bridge.  This covers the case of a bus with IOMMU which has below it a
> > > > > > > bridge to a different type of DMA capable bus (which the IOMMU isn't
> > > > > > > explicitly aware of).  DMAs from devices on the subordinate bus can be
> > > > > > > translated by the top-level IOMMU (assuming it sees them as coming
> > > > > > > from the bridge), but they all need to be treated as one group.
> > > > > > 
> > > > > > Why would the top level IOMMU provider not set the isolation group in
> > > > > > this case.
> > > > > 
> > > > > Because it knows nothing about the subordinate bus.  For example
> > > > > imagine a VT-d system, with a wacky PCI card into which you plug some
> > > > > other type of DMA capable device.  The PCI card is acting as a bridge
> > > > > from PCI to this, let's call it FooBus.  Obviously the VT-d code won't
> > > > > have a FooBus notifier, since it knows nothing about FooBus.  But the
> > > > > FooBus devices still need to end up in the group of the PCI bridge
> > > > > device, since their DMA operations will appear as coming from the PCI
> > > > > bridge card to the IOMMU, and can be isolated from the rest of the
> > > > > system (but not each other) on that basis.
> > > > 
> > > > I guess I was imagining that it's ok to have devices without an
> > > > isolation group.
> > > 
> > > It is, but having NULL isolation group has a pretty specific meaning -
> > > it means it's never safe to give that device to userspace, but it also
> > > means that normal kernel driver operation of that device must not
> > > interfere with anything in any iso group (otherwise we can never no
> > > that those iso groups _are_ safe to hand out).  Likewise userspace
> > > operation of any isolation group can't mess with no-group devices.
> > 
> > This is where wanting to use isolation groups as the working unit for an
> > iommu ops layer and also wanting to use iommu ops to replace dma ops
> > seem to collide a bit.  Do we want two interfaces for dma, one group
> > based and one for non-isolated devices?
> 
> Well, I don't know enough about what dwmw2 had planned to answer
> that.  I was assuming the in-kernel dma reply would remain device/bus
> focussed, and how it looked up and used the groups was an internal
> matter.  I would expect it would probably need a fallback path for "no
> group" for transition purposes, even if it wasn't planned to keep that
> forever.

Hmm, I think the value of a core isolation group layer starts to fall
apart if an iommu driver can't count on a group being present for any
device that might do dma.  Some ideas below.

> >  Isolation providers like
> > intel-iommu would always use one, non-isolation capable dma paths, like
> > swiotlb or non-isolation hardware iommus, would use another.  And how do
> > we factor in the degree of isolation to avoid imposing policy in the
> > kernel?  MSI isolation is an example.  We should allow userspace to set
> > a policy of whether lack of MSI protection is an acceptable risk.  Does
> > that mean we can have isolation groups that provide no isolation and
> > sysfs files indicating capabilities?  Perhaps certain classes of groups
> > don't even allow manager binding?
> 
> Ugh.  I thought about flagging groups with some sort of bitmap of
> isolation strength properties, but I think that way lies madness.  We
> might want a global policy bitmask though which expresses which
> constraints are acceptable risks.  The providers would then have to
> provide groups which are as strong as requested, or none at all.

That's effectively how the current iommu_device_group() interface works;
identify isolate-able groups, or none at all.  I don't think we get that
luxury if we push isolation groups down into the device layer though.
If we want to use them for dma_ops and to solve various device quirks,
they can't only be present on some configurations.  I think you're right
on the global policy though, we just need to apply it differently.  I'm
thinking we need something like "isolation_allow=" allowing options of
"nomsi" and "nop2p" (just for starters).  When a provider creates a
group, they provide a set of flags for the group.  A manager is not
allowed to bind to the group if the global policy doesn't match the
group flags.  We'll need some support functions, maybe even a sysfs
file, so userspace can know in advance and vfio can avoid creating
entries for unusable groups.

> > > None of these conditions is true for the hypothetical Foobus case.
> > > The bus as a whole could be safely given to userspace, the devices on
> > > it *could* mess with an existing isolation group (suppose the group
> > > consisted of a PCI segment with the FooBus bridge plus another PCI
> > > card - FooBus DMAs would be bridged onto the PCI segment and might
> > > target the other card's MMIO space).  And other grouped devices can
> > > certainly mess with the FooBus devices (if userspace grabs the bridge
> > > and manipulates its IOMMU mappings, that would clearly screw up any
> > > kernel drivers doing DMA from FooBus devices behind it).
> > 
> > ---segment-----+---------------+
> >                |               |
> >         [whackyPCIcard]   [other device]
> >                |
> >                +---FooBus---+-------+
> >                             |       |
> >                        [FooDev1] [FooDev2]
> > 
> > This is another example of the quality of the isolation group and what
> > factors we incorporate in judging that.  If the bridge/switch port
> > generating the segment does not support or enable PCI ACS then the IOMMU
> > may be able to differentiate whackyPCIcard from the other device but not
> > prevent peer-to-peer traffic between the two (or between FooDevs and the
> > other device - same difference).  This would suggest that we might have
> > an isolation group at the root of the segment for which the provider can
> > guarantee isolation (includes everything on and below the bus), and we
> > might also have isolation groups at whackyPCIcard and the other device
> > that have a difference quality of isolation.  /me pauses for rant about
> > superior hardware... ;)
> 
> Nested isolation groups? Please no.
> 
> An isolation group at the bridge encompassing that segment was exactly
> what I had in mind, but my point is that all the FooBus devices *also*
> need to be in that group, even though the isolation provider knows
> nothing about FooBus.

If we're able to strictly say "no isolation = no group" and ignore
FooBus for a moment, yes, the group at the bridge makes sense.  But as I
said, I don't think we get that luxury.  There will be different
"acceptable" levels of isolation and we'll need to support both the
model of single group encompassing the segment as well as separate
isolation groups for each device, whackyPCICard/other.

A question then is how do we support both?  Perhaps it's a provider
option whether to only create fully isolated group, which makes it
traverse up to the segment.  The default might be to make separate
groups at whackyPCICard and other, each with the nop2p flag.  It gets a
lot more complicated, but maybe we support both so if system policy
prevents a manager from binding to a sub-group due to nop2p, it can walk
up a level.  I suspect dma_ops always wants the closer group to avoid
traversing levels.

Back to FooBus, dma_ops only knows how to provide dma for certain types
of devices.  intel-iommu only knows about struct pci_dev.  So if FooDev
needed to do dma, it would need some kind of intermediary to do the
mapping via whackyPCICard.  So from that perspective, we could get away
w/o including FooBus devices in the group.  Of course if we want a
manager to attach to the group at whackyPCICard (ignoring nop2p), we
need to put requirements on the state of the FooDevs.  I think that
might actually give some credibility to nested groups, hierarchical
really, ie if we need to walk all the devices below a group, why not
allow groups below a group?  I don't like it either, but I'm not sure
it's avoidable.

> > > Oh.. and I just thought of an existing-in-the-field case with the same
> > > problem.  I'm pretty sure for devices that can appear on PCI but also
> > > have "dumb-bus" versions used on embedded systems, at least some of
> > > the kernel drivers are structured so that there is a separate struct
> > > device for the PCI "wrapper" and the device inside.  If the inner
> > > driver is initiating DMA, as it usually would, it has to be in the
> > > same iso group as it's PCI device parent.
> > 
> > I don't know that we can ever generically identify such devices, but
> > maybe this introduces the idea that we need to be able to blacklist
> > certain devices as multi-homed and tainting any isolation group that
> > contains them.
> 
> Sorry, I was unclear.  These are not multi-homed devices, just
> multiple-variant devices, any given instance will either be PCI or
> not.  But to handle the two variants easily, the drivers have nested
> struct devices.  It's essentially a pure software artifact we're
> dealing with here, but nonetheless we need to get the group-ownership
> of those struct devices correct, whether they're synthetic or not.

So long as the ADD_DEVICE happens with the currently running variant,
which seems like it has to be the case, I'm not sure how this matters.
I'm trying to be very careful to only use struct device for groups.

> > > >  When that happens we can traverse up the bus to find a
> > > > higher level isolation group.
> > > 
> > > Well, that's one possible answer to my "where should the hook be
> > > question": rather than an initialization hook, when we look up a
> > > device's isolation group, if it doesn't say one explicitly, we try
> > > it's bridge parent and so forth up the tree.  I wonder about the
> > > overhead of having to walk all the way up the bus heirarchy before
> > > returning NULL whenever we ask about the group of a device that
> > > doesn't have one.
> > 
> > Yep, that could definitely be a concern for streaming dma.
> 
> Right, which is why I'm suggesting handling at init time instead.
> We'd need something that runs in the generic hot-add path, after bus
> notifiers, which would set the device's group to the same as its
> parent's if a provider hasn't already given it one.

I don't think that works.  Back to the FooBus example, if the isolation
group becomes a fundamental part of the dma_ops path, we may end up with
groups at each FooDev (or maybe one for FooBus) generated by the
intermediary that sets up dma using whackyPCICard.

> > > >  It would probably make sense to have some
> > > > kind of top-most isolation group that contains everything as it's an
> > > > easy proof that if you own everything, you're isolated.
> > > 
> > > Hrm, no.  Things that have no IOMMU above them will have ungated
> > > access to system RAM, and so can never be safely isolated for
> > > userspace purposes, even if userspace owned every _device_ in the
> > > system (not that you could do that in practice, anyway).
> > 
> > RAM is a good point, there are "non-devices" to worry about.
> > Potentially that top level group doesn't allow managers.
> 
> I don't really see the point of a "top-level" group.  Groups are
> exclusive, not heirarchical, and I don't see that there are any paths
> really simplified by having a (not manageable) "leftovers" group.

Yeah, a top level group may not make sense, but if we plan for dma_ops,
we need non-manageable groups, which I think is just the degenerate case
of isolation quality.

> > > >  Potentially
> > > > though, wackyPCIcard can also register isolation groups for each of it's
> > > > FooBus devices if it's able to provide that capability.  Thanks,
> > > 
> > > It could, but why should it.  It doesn't know anything about IOMMUs or
> > > isolation, and it doesn't need to.  Even less so for PCI devices which
> > > create subordinate non-PCI struct devices for internal reasons.
> > 
> > Sorry I wasn't clear.  If wackyPCIcard happens to include an onboard
> > IOMMU of some sort that's capable of isolation, it might be able to
> > register groups for each of the devices below it.  Therefore we could
> > end up with a scenario like above that the segment may not have ACS and
> > therefore not be able to isolate wackyPCIcard from otherdevice, but
> > wackyPCIcard can isolate FooDev1 from FooDev2 and otherdevice.  I think
> > that means we potentially need to check all devices downstream of an
> > isolation group to allow a manager to lock it, as well as checking the
> > path upstream to make sure it isn't used above...  Are you sure we're
> > ready for this kind of infrastructure?  Thanks,
> 
> Hrm, at this stage I think I'm prepared to assume that IOMMUs are
> either strictly independent (e.g. covering different PCI domains) or
> are at least vaguely aware of each other (i.e. platform conventions
> cover the combination).  Truly nested IOMMUs, that are entirely
> unaware of each other is a problem for another day (by which I mean
> probably never).

Ok, but even if there's no iommu onboard whackyPCICard, it can still be
possible to create groups on FooBus just to facilitate dma_ops and
handle quirks.

> That said, again, groups have to be exclusive, not heirarchical, so
> in the above case, I think we'd have a few possible ways to go:

Ownership absolutely has to be exclusive, but it looks like groups
themselves are hierarchical.

> 	1) The FooBus isolation provider could refuse to initialize if
> the FooBus bridge is sharing an iso group with something else before
> it probes the FooBus.

Sharing, as in a parent group is managed?  Yes, something special needs
to happen in that case, I'd probably investigate having it initialized
in a managed state versus not initialized at all.

> 	2) The FooBus isolation provider could be in the FooBus bridge
> driver - meaning that if someone tried to grab the PCI group including
> the FooBridge, the provider driver would be booted out, causing the
> FooBus groups to cease to exist (meaning the PCI group manager would
> never get lock while someone was already using the FooGroups).  In

Right, I don't know that they need to cease to exist, but if a manager
is attached anywhere above or below the group it wants to lock, it has
to be blocked.

> this case, it gets a bit complex.  When the FooBus isolation provider
> is active, the FooBus devices would be in their own groups, not the
> group of the FooBridge and its sibling.  When the FooBus isolation
> provider is removed, it would have to configure the FooBus IOMMU to a
> passthrough mode, and revert the FooBus devices to the parent's
> group.  Hm.  Complicated.

Yep.  I think we're arriving at the same point.  Groups are
hierarchical, but ownership via a manager cannot be nested.  So to
manage a group, we need to walk the entire tree of devices below each
device checking that none of the groups are managed and all the devices
are using the right driver, then walk up from the group to verify no
group of a parent device is managed.  Thanks,

Alex
David Gibson - March 27, 2012, 5:14 a.m.
On Wed, Mar 21, 2012 at 03:12:58PM -0600, Alex Williamson wrote:
> On Sat, 2012-03-17 at 15:57 +1100, David Gibson wrote:
> > On Fri, Mar 16, 2012 at 01:31:18PM -0600, Alex Williamson wrote:
> > > On Fri, 2012-03-16 at 14:45 +1100, David Gibson wrote:
> > > > On Thu, Mar 15, 2012 at 02:15:01PM -0600, Alex Williamson wrote:
> > > > > On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote:
> > > > > > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote:
> > > > > > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote:
> > > > > > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote:
> > > > > > > > > +/*
> > > > > > > > > + * Add a device to an isolation group.  Isolation groups start empty and
> > > > > > > > > + * must be told about the devices they contain.  Expect this to be called
> > > > > > > > > + * from isolation group providers via notifier.
> > > > > > > > > + */
> > > > > > > > 
> > > > > > > > Doesn't necessarily have to be from a notifier, particularly if the
> > > > > > > > provider is integrated into host bridge code.
> > > > > > > 
> > > > > > > Sure, a provider could do this on it's own if it wants.  This just
> > > > > > > provides some infrastructure for a common path.  Also note that this
> > > > > > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider.  Yet
> > > > > > > to be seen whether that can reasonably be the case once isolation groups
> > > > > > > are added to streaming DMA paths.
> > > > > > 
> > > > > > Right, but other than the #ifdef safety, which could be achieved more
> > > > > > simply, I'm not seeing what benefit the infrastructure provides over
> > > > > > directly calling the bus notifier function.  The infrastructure groups
> > > > > > the notifiers by bus type internally, but AFAICT exactly one bus
> > > > > > notifier call would become exactly one isolation notifier call, and
> > > > > > the notifier callback itself would be almost identical.
> > > > > 
> > > > > I guess I don't see this as a fundamental design point of the proposal,
> > > > > it's just a convenient way to initialize groups as a side-band addition
> > > > > until isolation groups become a more fundamental part of the iommu
> > > > > infrastructure.  If you want to do that level of integration in your
> > > > > provider, do it and make the callbacks w/o the notifier.  If nobody ends
> > > > > up using them, we'll remove them.  Maybe it will just end up being a
> > > > > bootstrap.  In the typical case, yes, one bus notifier is one isolation
> > > > > notifier.  It does however also allow one bus notifier to become
> > > > > multiple isolation notifiers, and includes some filtering that would
> > > > > just be duplicated if every provider decided to implement their own bus
> > > > > notifier.
> > > > 
> > > > Uh.. I didn't notice any filtering?  That's why I'm asking.
> > > 
> > > Not much, but a little:
> > > 
> > > +       switch (action) {
> > > +       case BUS_NOTIFY_ADD_DEVICE:
> > > +               if (!dev->isolation_group)
> > > +                       blocking_notifier_call_chain(&notifier->notifier,
> > > +                                       ISOLATION_NOTIFY_ADD_DEVICE, dev);
> > > +               break;
> > > +       case BUS_NOTIFY_DEL_DEVICE:
> > > +               if (dev->isolation_group)
> > > +                       blocking_notifier_call_chain(&notifier->notifier,
> > > +                                       ISOLATION_NOTIFY_DEL_DEVICE, dev);
> > > +               break;
> > > +       }
T> > 
> > 
> > Ah, I see, fair enough.
> > 
> > A couple of tangential observations.  First, I suspect using
> > BUS_NOTIFY_DEL_DEVICE is a very roundabout way of handling hot-unplug,
> > it might be better to have an unplug callback in the group instead.
> 
> There's really one already.  Assuming the device is attached to a group
> driver, the .remove entry point on the driver will get called first.  It
> doesn't get to return error, but it can block.

Hrm.  Assuming the isolation provider has installed a driver for the
relevant bus type.  And as we're discussing elsewhere, I think we have
situations where the groups will get members on (subordinate) busses
which the isolation provider doesn't have explicit knowledge of.

> The DEL_DEVICE will only
> happen after the group driver has relinquished the device, or at least
> returned from remove().  For a device with no driver, the group would
> only learn about it through the notifier, but it probably doesn't need
> anything more direct.

DEL_DEVICE is certainly sufficient, it just seems a bit unnecessarily
awkward.

> > Second, I don't think aborting the call chain early for hot-plug is
> > actually a good idea.  I can't see a clear guarantee on the order, so
> > individual providers couldn't rely on that short-cut behaviour.  Which
> > means that if two providers would have attempted to claim the same
> > device, something is seriously wrong and we should probably report
> > that.
> 
> Yeah, that seems like a reasonable safety check.
> 
> > > > > > > > So, somewhere, I think we need a fallback path, but I'm not sure
> > > > > > > > exactly where.  If an isolation provider doesn't explicitly put a
> > > > > > > > device into a group, the device should go into the group of its parent
> > > > > > > > bridge.  This covers the case of a bus with IOMMU which has below it a
> > > > > > > > bridge to a different type of DMA capable bus (which the IOMMU isn't
> > > > > > > > explicitly aware of).  DMAs from devices on the subordinate bus can be
> > > > > > > > translated by the top-level IOMMU (assuming it sees them as coming
> > > > > > > > from the bridge), but they all need to be treated as one group.
> > > > > > > 
> > > > > > > Why would the top level IOMMU provider not set the isolation group in
> > > > > > > this case.
> > > > > > 
> > > > > > Because it knows nothing about the subordinate bus.  For example
> > > > > > imagine a VT-d system, with a wacky PCI card into which you plug some
> > > > > > other type of DMA capable device.  The PCI card is acting as a bridge
> > > > > > from PCI to this, let's call it FooBus.  Obviously the VT-d code won't
> > > > > > have a FooBus notifier, since it knows nothing about FooBus.  But the
> > > > > > FooBus devices still need to end up in the group of the PCI bridge
> > > > > > device, since their DMA operations will appear as coming from the PCI
> > > > > > bridge card to the IOMMU, and can be isolated from the rest of the
> > > > > > system (but not each other) on that basis.
> > > > > 
> > > > > I guess I was imagining that it's ok to have devices without an
> > > > > isolation group.
> > > > 
> > > > It is, but having NULL isolation group has a pretty specific meaning -
> > > > it means it's never safe to give that device to userspace, but it also
> > > > means that normal kernel driver operation of that device must not
> > > > interfere with anything in any iso group (otherwise we can never no
> > > > that those iso groups _are_ safe to hand out).  Likewise userspace
> > > > operation of any isolation group can't mess with no-group devices.
> > > 
> > > This is where wanting to use isolation groups as the working unit for an
> > > iommu ops layer and also wanting to use iommu ops to replace dma ops
> > > seem to collide a bit.  Do we want two interfaces for dma, one group
> > > based and one for non-isolated devices?
> > 
> > Well, I don't know enough about what dwmw2 had planned to answer
> > that.  I was assuming the in-kernel dma reply would remain device/bus
> > focussed, and how it looked up and used the groups was an internal
> > matter.  I would expect it would probably need a fallback path for "no
> > group" for transition purposes, even if it wasn't planned to keep that
> > forever.
> 
> Hmm, I think the value of a core isolation group layer starts to fall
> apart if an iommu driver can't count on a group being present for any
> device that might do dma.  Some ideas below.

Ah, yes, to clarify: I think anything subject to IOMMU translation
should have a group (which might need a no-manager flag).  However,
for devices with no IOMMU, or with an IOMMU we can switch permanently
into bypass mode, I think it's reasonable to leave them without group.

> > >  Isolation providers like
> > > intel-iommu would always use one, non-isolation capable dma paths, like
> > > swiotlb or non-isolation hardware iommus, would use another.  And how do
> > > we factor in the degree of isolation to avoid imposing policy in the
> > > kernel?  MSI isolation is an example.  We should allow userspace to set
> > > a policy of whether lack of MSI protection is an acceptable risk.  Does
> > > that mean we can have isolation groups that provide no isolation and
> > > sysfs files indicating capabilities?  Perhaps certain classes of groups
> > > don't even allow manager binding?
> > 
> > Ugh.  I thought about flagging groups with some sort of bitmap of
> > isolation strength properties, but I think that way lies madness.  We
> > might want a global policy bitmask though which expresses which
> > constraints are acceptable risks.  The providers would then have to
> > provide groups which are as strong as requested, or none at all.
> 
> That's effectively how the current iommu_device_group() interface works;
> identify isolate-able groups, or none at all.  I don't think we get that
> luxury if we push isolation groups down into the device layer though.
> If we want to use them for dma_ops and to solve various device quirks,
> they can't only be present on some configurations.  I think you're right
> on the global policy though, we just need to apply it differently.  I'm
> thinking we need something like "isolation_allow=" allowing options of
> "nomsi" and "nop2p" (just for starters).  When a provider creates a
> group, they provide a set of flags for the group.  A manager is not
> allowed to bind to the group if the global policy doesn't match the
> group flags.  We'll need some support functions, maybe even a sysfs
> file, so userspace can know in advance and vfio can avoid creating
> entries for unusable groups.

Just banning managers for groups of insufficient strength isn't quite
enough, because that doesn't allow for consolidation of several
poorly-isolated groups into one strongly isolated groups which may be
possible on some hardware configurations.

> > > > None of these conditions is true for the hypothetical Foobus case.
> > > > The bus as a whole could be safely given to userspace, the devices on
> > > > it *could* mess with an existing isolation group (suppose the group
> > > > consisted of a PCI segment with the FooBus bridge plus another PCI
> > > > card - FooBus DMAs would be bridged onto the PCI segment and might
> > > > target the other card's MMIO space).  And other grouped devices can
> > > > certainly mess with the FooBus devices (if userspace grabs the bridge
> > > > and manipulates its IOMMU mappings, that would clearly screw up any
> > > > kernel drivers doing DMA from FooBus devices behind it).
> > > 
> > > ---segment-----+---------------+
> > >                |               |
> > >         [whackyPCIcard]   [other device]
> > >                |
> > >                +---FooBus---+-------+
> > >                             |       |
> > >                        [FooDev1] [FooDev2]
> > > 
> > > This is another example of the quality of the isolation group and what
> > > factors we incorporate in judging that.  If the bridge/switch port
> > > generating the segment does not support or enable PCI ACS then the IOMMU
> > > may be able to differentiate whackyPCIcard from the other device but not
> > > prevent peer-to-peer traffic between the two (or between FooDevs and the
> > > other device - same difference).  This would suggest that we might have
> > > an isolation group at the root of the segment for which the provider can
> > > guarantee isolation (includes everything on and below the bus), and we
> > > might also have isolation groups at whackyPCIcard and the other device
> > > that have a difference quality of isolation.  /me pauses for rant about
> > > superior hardware... ;)
> > 
> > Nested isolation groups? Please no.
> > 
> > An isolation group at the bridge encompassing that segment was exactly
> > what I had in mind, but my point is that all the FooBus devices *also*
> > need to be in that group, even though the isolation provider knows
> > nothing about FooBus.
> 
> If we're able to strictly say "no isolation = no group" and ignore
> FooBus for a moment, yes, the group at the bridge makes sense.  But as I
> said, I don't think we get that luxury.  There will be different
> "acceptable" levels of isolation and we'll need to support both the
> model of single group encompassing the segment as well as separate
> isolation groups for each device, whackyPCICard/other.

Well, sure.  But in that case, the FooBus devices still need to live
in the same group as their bridge card, since they're subject to the
same translations.

> A question then is how do we support both?  Perhaps it's a provider
> option whether to only create fully isolated group, which makes it
> traverse up to the segment.  The default might be to make separate
> groups at whackyPCICard and other, each with the nop2p flag.  It gets a
> lot more complicated, but maybe we support both so if system policy
> prevents a manager from binding to a sub-group due to nop2p, it can walk
> up a level.  I suspect dma_ops always wants the closer group to avoid
> traversing levels.

Yeah.  I really wish I knew more about what dwmw2 had in mind.

> Back to FooBus, dma_ops only knows how to provide dma for certain types
> of devices.  intel-iommu only knows about struct pci_dev.  So if FooDev
> needed to do dma, it would need some kind of intermediary to do the
> mapping via whackyPCICard.  So from that perspective, we could get away
> w/o including FooBus devices in the group.

That seems very fragile  Logically the FooBus devices should be in the
same group.  Yes, the FooBus dma hooks will need to know how to find
the bridge PCI card and thereby manipulate it's IOMMU mappings, but
they still belong in the same isolation group.

>  Of course if we want a
> manager to attach to the group at whackyPCICard (ignoring nop2p), we
> need to put requirements on the state of the FooDevs.  I think that
> might actually give some credibility to nested groups, hierarchical
> really, ie if we need to walk all the devices below a group, why not
> allow groups below a group?  I don't like it either, but I'm not sure
> it's avoidable.

Urgh.  Nested groups, a group parent must always have a strictly
stronger set of isolation properties than subgroups.  I suppose that
could work, it's really setting off my over-engineering alarms,
though.

> > > > Oh.. and I just thought of an existing-in-the-field case with the same
> > > > problem.  I'm pretty sure for devices that can appear on PCI but also
> > > > have "dumb-bus" versions used on embedded systems, at least some of
> > > > the kernel drivers are structured so that there is a separate struct
> > > > device for the PCI "wrapper" and the device inside.  If the inner
> > > > driver is initiating DMA, as it usually would, it has to be in the
> > > > same iso group as it's PCI device parent.
> > > 
> > > I don't know that we can ever generically identify such devices, but
> > > maybe this introduces the idea that we need to be able to blacklist
> > > certain devices as multi-homed and tainting any isolation group that
> > > contains them.
> > 
> > Sorry, I was unclear.  These are not multi-homed devices, just
> > multiple-variant devices, any given instance will either be PCI or
> > not.  But to handle the two variants easily, the drivers have nested
> > struct devices.  It's essentially a pure software artifact we're
> > dealing with here, but nonetheless we need to get the group-ownership
> > of those struct devices correct, whether they're synthetic or not.
> 
> So long as the ADD_DEVICE happens with the currently running variant,
> which seems like it has to be the case, I'm not sure how this matters.
> I'm trying to be very careful to only use struct device for groups.

Nope, I still haven't managed to convey the situation.  The *hardware*
comes in two varants, PCI and "dumb bus".  The core part of the driver
software binds (only) to a dumb bus struct device (usually a platform
device, actually).

To support the PCI variant, there is a "wrapper" driver.  This is a
PCI driver which does any PCI specific initialization, determines the
register addresses from config space then creates a dumb bus struct
device as a child of the pci_dev.  The core driver then attaches to
that dumb bus (e.g. platform) device.

My point is that in the second case, the dumb bus struct device needs
to be in the same group as its pci_dev parent.

> > > > >  When that happens we can traverse up the bus to find a
> > > > > higher level isolation group.
> > > > 
> > > > Well, that's one possible answer to my "where should the hook be
> > > > question": rather than an initialization hook, when we look up a
> > > > device's isolation group, if it doesn't say one explicitly, we try
> > > > it's bridge parent and so forth up the tree.  I wonder about the
> > > > overhead of having to walk all the way up the bus heirarchy before
> > > > returning NULL whenever we ask about the group of a device that
> > > > doesn't have one.
> > > 
> > > Yep, that could definitely be a concern for streaming dma.
> > 
> > Right, which is why I'm suggesting handling at init time instead.
> > We'd need something that runs in the generic hot-add path, after bus
> > notifiers, which would set the device's group to the same as its
> > parent's if a provider hasn't already given it one.
> 
> I don't think that works.  Back to the FooBus example, if the isolation
> group becomes a fundamental part of the dma_ops path, we may end up with
> groups at each FooDev (or maybe one for FooBus) generated by the
> intermediary that sets up dma using whackyPCICard.

That's why this would only do something *iff* nothing else has set a
group (like that intermediary).

> > > > >  It would probably make sense to have some
> > > > > kind of top-most isolation group that contains everything as it's an
> > > > > easy proof that if you own everything, you're isolated.
> > > > 
> > > > Hrm, no.  Things that have no IOMMU above them will have ungated
> > > > access to system RAM, and so can never be safely isolated for
> > > > userspace purposes, even if userspace owned every _device_ in the
> > > > system (not that you could do that in practice, anyway).
> > > 
> > > RAM is a good point, there are "non-devices" to worry about.
> > > Potentially that top level group doesn't allow managers.
> > 
> > I don't really see the point of a "top-level" group.  Groups are
> > exclusive, not heirarchical, and I don't see that there are any paths
> > really simplified by having a (not manageable) "leftovers" group.
> 
> Yeah, a top level group may not make sense, but if we plan for dma_ops,
> we need non-manageable groups, which I think is just the degenerate case
> of isolation quality.

Yeah, I'll buy that.  Pending more details on dwmw2's plans, anyway.

> > > > >  Potentially
> > > > > though, wackyPCIcard can also register isolation groups for each of it's
> > > > > FooBus devices if it's able to provide that capability.  Thanks,
> > > > 
> > > > It could, but why should it.  It doesn't know anything about IOMMUs or
> > > > isolation, and it doesn't need to.  Even less so for PCI devices which
> > > > create subordinate non-PCI struct devices for internal reasons.
> > > 
> > > Sorry I wasn't clear.  If wackyPCIcard happens to include an onboard
> > > IOMMU of some sort that's capable of isolation, it might be able to
> > > register groups for each of the devices below it.  Therefore we could
> > > end up with a scenario like above that the segment may not have ACS and
> > > therefore not be able to isolate wackyPCIcard from otherdevice, but
> > > wackyPCIcard can isolate FooDev1 from FooDev2 and otherdevice.  I think
> > > that means we potentially need to check all devices downstream of an
> > > isolation group to allow a manager to lock it, as well as checking the
> > > path upstream to make sure it isn't used above...  Are you sure we're
> > > ready for this kind of infrastructure?  Thanks,
> > 
> > Hrm, at this stage I think I'm prepared to assume that IOMMUs are
> > either strictly independent (e.g. covering different PCI domains) or
> > are at least vaguely aware of each other (i.e. platform conventions
> > cover the combination).  Truly nested IOMMUs, that are entirely
> > unaware of each other is a problem for another day (by which I mean
> > probably never).
> 
> Ok, but even if there's no iommu onboard whackyPCICard, it can still be
> possible to create groups on FooBus just to facilitate dma_ops and
> handle quirks.

No, you can't arbitrarily go creating groups - the group boundary
means something specific.  If dma_ops or something needs more info, it
will need that *in addition* to the group.

> > That said, again, groups have to be exclusive, not heirarchical, so
> > in the above case, I think we'd have a few possible ways to go:
> 
> Ownership absolutely has to be exclusive, but it looks like groups
> themselves are hierarchical.

Doesn't really make sense, since a group is defined as the minimum
isolatable parcel.  You can do something if as you walk down the tree
isolation level decreases, but it's certainly not feeling right.

> > 	1) The FooBus isolation provider could refuse to initialize if
> > the FooBus bridge is sharing an iso group with something else before
> > it probes the FooBus.
> 
> Sharing, as in a parent group is managed?

No, just as in a group exists which contains both the FooBus bridge
and something else.  Basically this option means that if you plugged
the FooBus bridge into a slot that wasn't sufficiently isolated, it
would simply refuse to work.

>  Yes, something special needs
> to happen in that case, I'd probably investigate having it initialized
> in a managed state versus not initialized at all.
> 
> > 	2) The FooBus isolation provider could be in the FooBus bridge
> > driver - meaning that if someone tried to grab the PCI group including
> > the FooBridge, the provider driver would be booted out, causing the
> > FooBus groups to cease to exist (meaning the PCI group manager would
> > never get lock while someone was already using the FooGroups).  In
> 
> Right, I don't know that they need to cease to exist, but if a manager
> is attached anywhere above or below the group it wants to lock, it has
> to be blocked.
> 
> > this case, it gets a bit complex.  When the FooBus isolation provider
> > is active, the FooBus devices would be in their own groups, not the
> > group of the FooBridge and its sibling.  When the FooBus isolation
> > provider is removed, it would have to configure the FooBus IOMMU to a
> > passthrough mode, and revert the FooBus devices to the parent's
> > group.  Hm.  Complicated.
> 
> Yep.  I think we're arriving at the same point.  Groups are
> hierarchical, but ownership via a manager cannot be nested.  So to
> manage a group, we need to walk the entire tree of devices below each
> device checking that none of the groups are managed and all the devices
> are using the right driver, then walk up from the group to verify no
> group of a parent device is managed.  Thanks,

Blargh.  I really, really hope we can come up with a simpler model
than that.
Alex Williamson - March 27, 2012, 7:34 p.m.
On Tue, 2012-03-27 at 16:14 +1100, David Gibson wrote:
> On Wed, Mar 21, 2012 at 03:12:58PM -0600, Alex Williamson wrote:
> > On Sat, 2012-03-17 at 15:57 +1100, David Gibson wrote:
> > > On Fri, Mar 16, 2012 at 01:31:18PM -0600, Alex Williamson wrote:
> > > > On Fri, 2012-03-16 at 14:45 +1100, David Gibson wrote:
> > > > > On Thu, Mar 15, 2012 at 02:15:01PM -0600, Alex Williamson wrote:
> > > > > > On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote:
> > > > > > > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote:
> > > > > > > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote:
> > > > > > > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote:
> > > > > > > > > > +/*
> > > > > > > > > > + * Add a device to an isolation group.  Isolation groups start empty and
> > > > > > > > > > + * must be told about the devices they contain.  Expect this to be called
> > > > > > > > > > + * from isolation group providers via notifier.
> > > > > > > > > > + */
> > > > > > > > > 
> > > > > > > > > Doesn't necessarily have to be from a notifier, particularly if the
> > > > > > > > > provider is integrated into host bridge code.
> > > > > > > > 
> > > > > > > > Sure, a provider could do this on it's own if it wants.  This just
> > > > > > > > provides some infrastructure for a common path.  Also note that this
> > > > > > > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider.  Yet
> > > > > > > > to be seen whether that can reasonably be the case once isolation groups
> > > > > > > > are added to streaming DMA paths.
> > > > > > > 
> > > > > > > Right, but other than the #ifdef safety, which could be achieved more
> > > > > > > simply, I'm not seeing what benefit the infrastructure provides over
> > > > > > > directly calling the bus notifier function.  The infrastructure groups
> > > > > > > the notifiers by bus type internally, but AFAICT exactly one bus
> > > > > > > notifier call would become exactly one isolation notifier call, and
> > > > > > > the notifier callback itself would be almost identical.
> > > > > > 
> > > > > > I guess I don't see this as a fundamental design point of the proposal,
> > > > > > it's just a convenient way to initialize groups as a side-band addition
> > > > > > until isolation groups become a more fundamental part of the iommu
> > > > > > infrastructure.  If you want to do that level of integration in your
> > > > > > provider, do it and make the callbacks w/o the notifier.  If nobody ends
> > > > > > up using them, we'll remove them.  Maybe it will just end up being a
> > > > > > bootstrap.  In the typical case, yes, one bus notifier is one isolation
> > > > > > notifier.  It does however also allow one bus notifier to become
> > > > > > multiple isolation notifiers, and includes some filtering that would
> > > > > > just be duplicated if every provider decided to implement their own bus
> > > > > > notifier.
> > > > > 
> > > > > Uh.. I didn't notice any filtering?  That's why I'm asking.
> > > > 
> > > > Not much, but a little:
> > > > 
> > > > +       switch (action) {
> > > > +       case BUS_NOTIFY_ADD_DEVICE:
> > > > +               if (!dev->isolation_group)
> > > > +                       blocking_notifier_call_chain(&notifier->notifier,
> > > > +                                       ISOLATION_NOTIFY_ADD_DEVICE, dev);
> > > > +               break;
> > > > +       case BUS_NOTIFY_DEL_DEVICE:
> > > > +               if (dev->isolation_group)
> > > > +                       blocking_notifier_call_chain(&notifier->notifier,
> > > > +                                       ISOLATION_NOTIFY_DEL_DEVICE, dev);
> > > > +               break;
> > > > +       }
> T> > 
> > > 
> > > Ah, I see, fair enough.
> > > 
> > > A couple of tangential observations.  First, I suspect using
> > > BUS_NOTIFY_DEL_DEVICE is a very roundabout way of handling hot-unplug,
> > > it might be better to have an unplug callback in the group instead.
> > 
> > There's really one already.  Assuming the device is attached to a group
> > driver, the .remove entry point on the driver will get called first.  It
> > doesn't get to return error, but it can block.
> 
> Hrm.  Assuming the isolation provider has installed a driver for the
> relevant bus type.  And as we're discussing elsewhere, I think we have
> situations where the groups will get members on (subordinate) busses
> which the isolation provider doesn't have explicit knowledge of.
> 
> > The DEL_DEVICE will only
> > happen after the group driver has relinquished the device, or at least
> > returned from remove().  For a device with no driver, the group would
> > only learn about it through the notifier, but it probably doesn't need
> > anything more direct.
> 
> DEL_DEVICE is certainly sufficient, it just seems a bit unnecessarily
> awkward.
> 
> > > Second, I don't think aborting the call chain early for hot-plug is
> > > actually a good idea.  I can't see a clear guarantee on the order, so
> > > individual providers couldn't rely on that short-cut behaviour.  Which
> > > means that if two providers would have attempted to claim the same
> > > device, something is seriously wrong and we should probably report
> > > that.
> > 
> > Yeah, that seems like a reasonable safety check.
> > 
> > > > > > > > > So, somewhere, I think we need a fallback path, but I'm not sure
> > > > > > > > > exactly where.  If an isolation provider doesn't explicitly put a
> > > > > > > > > device into a group, the device should go into the group of its parent
> > > > > > > > > bridge.  This covers the case of a bus with IOMMU which has below it a
> > > > > > > > > bridge to a different type of DMA capable bus (which the IOMMU isn't
> > > > > > > > > explicitly aware of).  DMAs from devices on the subordinate bus can be
> > > > > > > > > translated by the top-level IOMMU (assuming it sees them as coming
> > > > > > > > > from the bridge), but they all need to be treated as one group.
> > > > > > > > 
> > > > > > > > Why would the top level IOMMU provider not set the isolation group in
> > > > > > > > this case.
> > > > > > > 
> > > > > > > Because it knows nothing about the subordinate bus.  For example
> > > > > > > imagine a VT-d system, with a wacky PCI card into which you plug some
> > > > > > > other type of DMA capable device.  The PCI card is acting as a bridge
> > > > > > > from PCI to this, let's call it FooBus.  Obviously the VT-d code won't
> > > > > > > have a FooBus notifier, since it knows nothing about FooBus.  But the
> > > > > > > FooBus devices still need to end up in the group of the PCI bridge
> > > > > > > device, since their DMA operations will appear as coming from the PCI
> > > > > > > bridge card to the IOMMU, and can be isolated from the rest of the
> > > > > > > system (but not each other) on that basis.
> > > > > > 
> > > > > > I guess I was imagining that it's ok to have devices without an
> > > > > > isolation group.
> > > > > 
> > > > > It is, but having NULL isolation group has a pretty specific meaning -
> > > > > it means it's never safe to give that device to userspace, but it also
> > > > > means that normal kernel driver operation of that device must not
> > > > > interfere with anything in any iso group (otherwise we can never no
> > > > > that those iso groups _are_ safe to hand out).  Likewise userspace
> > > > > operation of any isolation group can't mess with no-group devices.
> > > > 
> > > > This is where wanting to use isolation groups as the working unit for an
> > > > iommu ops layer and also wanting to use iommu ops to replace dma ops
> > > > seem to collide a bit.  Do we want two interfaces for dma, one group
> > > > based and one for non-isolated devices?
> > > 
> > > Well, I don't know enough about what dwmw2 had planned to answer
> > > that.  I was assuming the in-kernel dma reply would remain device/bus
> > > focussed, and how it looked up and used the groups was an internal
> > > matter.  I would expect it would probably need a fallback path for "no
> > > group" for transition purposes, even if it wasn't planned to keep that
> > > forever.
> > 
> > Hmm, I think the value of a core isolation group layer starts to fall
> > apart if an iommu driver can't count on a group being present for any
> > device that might do dma.  Some ideas below.
> 
> Ah, yes, to clarify: I think anything subject to IOMMU translation
> should have a group (which might need a no-manager flag).  However,
> for devices with no IOMMU, or with an IOMMU we can switch permanently
> into bypass mode, I think it's reasonable to leave them without group.
> 
> > > >  Isolation providers like
> > > > intel-iommu would always use one, non-isolation capable dma paths, like
> > > > swiotlb or non-isolation hardware iommus, would use another.  And how do
> > > > we factor in the degree of isolation to avoid imposing policy in the
> > > > kernel?  MSI isolation is an example.  We should allow userspace to set
> > > > a policy of whether lack of MSI protection is an acceptable risk.  Does
> > > > that mean we can have isolation groups that provide no isolation and
> > > > sysfs files indicating capabilities?  Perhaps certain classes of groups
> > > > don't even allow manager binding?
> > > 
> > > Ugh.  I thought about flagging groups with some sort of bitmap of
> > > isolation strength properties, but I think that way lies madness.  We
> > > might want a global policy bitmask though which expresses which
> > > constraints are acceptable risks.  The providers would then have to
> > > provide groups which are as strong as requested, or none at all.
> > 
> > That's effectively how the current iommu_device_group() interface works;
> > identify isolate-able groups, or none at all.  I don't think we get that
> > luxury if we push isolation groups down into the device layer though.
> > If we want to use them for dma_ops and to solve various device quirks,
> > they can't only be present on some configurations.  I think you're right
> > on the global policy though, we just need to apply it differently.  I'm
> > thinking we need something like "isolation_allow=" allowing options of
> > "nomsi" and "nop2p" (just for starters).  When a provider creates a
> > group, they provide a set of flags for the group.  A manager is not
> > allowed to bind to the group if the global policy doesn't match the
> > group flags.  We'll need some support functions, maybe even a sysfs
> > file, so userspace can know in advance and vfio can avoid creating
> > entries for unusable groups.
> 
> Just banning managers for groups of insufficient strength isn't quite
> enough, because that doesn't allow for consolidation of several
> poorly-isolated groups into one strongly isolated groups which may be
> possible on some hardware configurations.

This is the part where things start to completely fall apart.

> > > > > None of these conditions is true for the hypothetical Foobus case.
> > > > > The bus as a whole could be safely given to userspace, the devices on
> > > > > it *could* mess with an existing isolation group (suppose the group
> > > > > consisted of a PCI segment with the FooBus bridge plus another PCI
> > > > > card - FooBus DMAs would be bridged onto the PCI segment and might
> > > > > target the other card's MMIO space).  And other grouped devices can
> > > > > certainly mess with the FooBus devices (if userspace grabs the bridge
> > > > > and manipulates its IOMMU mappings, that would clearly screw up any
> > > > > kernel drivers doing DMA from FooBus devices behind it).
> > > > 
> > > > ---segment-----+---------------+
> > > >                |               |
> > > >         [whackyPCIcard]   [other device]
> > > >                |
> > > >                +---FooBus---+-------+
> > > >                             |       |
> > > >                        [FooDev1] [FooDev2]
> > > > 
> > > > This is another example of the quality of the isolation group and what
> > > > factors we incorporate in judging that.  If the bridge/switch port
> > > > generating the segment does not support or enable PCI ACS then the IOMMU
> > > > may be able to differentiate whackyPCIcard from the other device but not
> > > > prevent peer-to-peer traffic between the two (or between FooDevs and the
> > > > other device - same difference).  This would suggest that we might have
> > > > an isolation group at the root of the segment for which the provider can
> > > > guarantee isolation (includes everything on and below the bus), and we
> > > > might also have isolation groups at whackyPCIcard and the other device
> > > > that have a difference quality of isolation.  /me pauses for rant about
> > > > superior hardware... ;)
> > > 
> > > Nested isolation groups? Please no.
> > > 
> > > An isolation group at the bridge encompassing that segment was exactly
> > > what I had in mind, but my point is that all the FooBus devices *also*
> > > need to be in that group, even though the isolation provider knows
> > > nothing about FooBus.
> > 
> > If we're able to strictly say "no isolation = no group" and ignore
> > FooBus for a moment, yes, the group at the bridge makes sense.  But as I
> > said, I don't think we get that luxury.  There will be different
> > "acceptable" levels of isolation and we'll need to support both the
> > model of single group encompassing the segment as well as separate
> > isolation groups for each device, whackyPCICard/other.
> 
> Well, sure.  But in that case, the FooBus devices still need to live
> in the same group as their bridge card, since they're subject to the
> same translations.
> 
> > A question then is how do we support both?  Perhaps it's a provider
> > option whether to only create fully isolated group, which makes it
> > traverse up to the segment.  The default might be to make separate
> > groups at whackyPCICard and other, each with the nop2p flag.  It gets a
> > lot more complicated, but maybe we support both so if system policy
> > prevents a manager from binding to a sub-group due to nop2p, it can walk
> > up a level.  I suspect dma_ops always wants the closer group to avoid
> > traversing levels.
> 
> Yeah.  I really wish I knew more about what dwmw2 had in mind.
> 
> > Back to FooBus, dma_ops only knows how to provide dma for certain types
> > of devices.  intel-iommu only knows about struct pci_dev.  So if FooDev
> > needed to do dma, it would need some kind of intermediary to do the
> > mapping via whackyPCICard.  So from that perspective, we could get away
> > w/o including FooBus devices in the group.
> 
> That seems very fragile  Logically the FooBus devices should be in the
> same group.  Yes, the FooBus dma hooks will need to know how to find
> the bridge PCI card and thereby manipulate it's IOMMU mappings, but
> they still belong in the same isolation group.
> 
> >  Of course if we want a
> > manager to attach to the group at whackyPCICard (ignoring nop2p), we
> > need to put requirements on the state of the FooDevs.  I think that
> > might actually give some credibility to nested groups, hierarchical
> > really, ie if we need to walk all the devices below a group, why not
> > allow groups below a group?  I don't like it either, but I'm not sure
> > it's avoidable.
> 
> Urgh.  Nested groups, a group parent must always have a strictly
> stronger set of isolation properties than subgroups.  I suppose that
> could work, it's really setting off my over-engineering alarms,
> though.

This entire thing is over engineered, I think it's time to take a step
back.  We're combining iommu visibility with device quirks with
isolation strength and hierarchical groups with manager locking and
driver autoprobing... it's all over the place.

> > > > > Oh.. and I just thought of an existing-in-the-field case with the same
> > > > > problem.  I'm pretty sure for devices that can appear on PCI but also
> > > > > have "dumb-bus" versions used on embedded systems, at least some of
> > > > > the kernel drivers are structured so that there is a separate struct
> > > > > device for the PCI "wrapper" and the device inside.  If the inner
> > > > > driver is initiating DMA, as it usually would, it has to be in the
> > > > > same iso group as it's PCI device parent.
> > > > 
> > > > I don't know that we can ever generically identify such devices, but
> > > > maybe this introduces the idea that we need to be able to blacklist
> > > > certain devices as multi-homed and tainting any isolation group that
> > > > contains them.
> > > 
> > > Sorry, I was unclear.  These are not multi-homed devices, just
> > > multiple-variant devices, any given instance will either be PCI or
> > > not.  But to handle the two variants easily, the drivers have nested
> > > struct devices.  It's essentially a pure software artifact we're
> > > dealing with here, but nonetheless we need to get the group-ownership
> > > of those struct devices correct, whether they're synthetic or not.
> > 
> > So long as the ADD_DEVICE happens with the currently running variant,
> > which seems like it has to be the case, I'm not sure how this matters.
> > I'm trying to be very careful to only use struct device for groups.
> 
> Nope, I still haven't managed to convey the situation.  The *hardware*
> comes in two varants, PCI and "dumb bus".  The core part of the driver
> software binds (only) to a dumb bus struct device (usually a platform
> device, actually).
> 
> To support the PCI variant, there is a "wrapper" driver.  This is a
> PCI driver which does any PCI specific initialization, determines the
> register addresses from config space then creates a dumb bus struct
> device as a child of the pci_dev.  The core driver then attaches to
> that dumb bus (e.g. platform) device.
> 
> My point is that in the second case, the dumb bus struct device needs
> to be in the same group as its pci_dev parent.
> 
> > > > > >  When that happens we can traverse up the bus to find a
> > > > > > higher level isolation group.
> > > > > 
> > > > > Well, that's one possible answer to my "where should the hook be
> > > > > question": rather than an initialization hook, when we look up a
> > > > > device's isolation group, if it doesn't say one explicitly, we try
> > > > > it's bridge parent and so forth up the tree.  I wonder about the
> > > > > overhead of having to walk all the way up the bus heirarchy before
> > > > > returning NULL whenever we ask about the group of a device that
> > > > > doesn't have one.
> > > > 
> > > > Yep, that could definitely be a concern for streaming dma.
> > > 
> > > Right, which is why I'm suggesting handling at init time instead.
> > > We'd need something that runs in the generic hot-add path, after bus
> > > notifiers, which would set the device's group to the same as its
> > > parent's if a provider hasn't already given it one.
> > 
> > I don't think that works.  Back to the FooBus example, if the isolation
> > group becomes a fundamental part of the dma_ops path, we may end up with
> > groups at each FooDev (or maybe one for FooBus) generated by the
> > intermediary that sets up dma using whackyPCICard.
> 
> That's why this would only do something *iff* nothing else has set a
> group (like that intermediary).
> 
> > > > > >  It would probably make sense to have some
> > > > > > kind of top-most isolation group that contains everything as it's an
> > > > > > easy proof that if you own everything, you're isolated.
> > > > > 
> > > > > Hrm, no.  Things that have no IOMMU above them will have ungated
> > > > > access to system RAM, and so can never be safely isolated for
> > > > > userspace purposes, even if userspace owned every _device_ in the
> > > > > system (not that you could do that in practice, anyway).
> > > > 
> > > > RAM is a good point, there are "non-devices" to worry about.
> > > > Potentially that top level group doesn't allow managers.
> > > 
> > > I don't really see the point of a "top-level" group.  Groups are
> > > exclusive, not heirarchical, and I don't see that there are any paths
> > > really simplified by having a (not manageable) "leftovers" group.
> > 
> > Yeah, a top level group may not make sense, but if we plan for dma_ops,
> > we need non-manageable groups, which I think is just the degenerate case
> > of isolation quality.
> 
> Yeah, I'll buy that.  Pending more details on dwmw2's plans, anyway.
> 
> > > > > >  Potentially
> > > > > > though, wackyPCIcard can also register isolation groups for each of it's
> > > > > > FooBus devices if it's able to provide that capability.  Thanks,
> > > > > 
> > > > > It could, but why should it.  It doesn't know anything about IOMMUs or
> > > > > isolation, and it doesn't need to.  Even less so for PCI devices which
> > > > > create subordinate non-PCI struct devices for internal reasons.
> > > > 
> > > > Sorry I wasn't clear.  If wackyPCIcard happens to include an onboard
> > > > IOMMU of some sort that's capable of isolation, it might be able to
> > > > register groups for each of the devices below it.  Therefore we could
> > > > end up with a scenario like above that the segment may not have ACS and
> > > > therefore not be able to isolate wackyPCIcard from otherdevice, but
> > > > wackyPCIcard can isolate FooDev1 from FooDev2 and otherdevice.  I think
> > > > that means we potentially need to check all devices downstream of an
> > > > isolation group to allow a manager to lock it, as well as checking the
> > > > path upstream to make sure it isn't used above...  Are you sure we're
> > > > ready for this kind of infrastructure?  Thanks,
> > > 
> > > Hrm, at this stage I think I'm prepared to assume that IOMMUs are
> > > either strictly independent (e.g. covering different PCI domains) or
> > > are at least vaguely aware of each other (i.e. platform conventions
> > > cover the combination).  Truly nested IOMMUs, that are entirely
> > > unaware of each other is a problem for another day (by which I mean
> > > probably never).
> > 
> > Ok, but even if there's no iommu onboard whackyPCICard, it can still be
> > possible to create groups on FooBus just to facilitate dma_ops and
> > handle quirks.
> 
> No, you can't arbitrarily go creating groups - the group boundary
> means something specific.  If dma_ops or something needs more info, it
> will need that *in addition* to the group.
> 
> > > That said, again, groups have to be exclusive, not heirarchical, so
> > > in the above case, I think we'd have a few possible ways to go:
> > 
> > Ownership absolutely has to be exclusive, but it looks like groups
> > themselves are hierarchical.
> 
> Doesn't really make sense, since a group is defined as the minimum
> isolatable parcel.  You can do something if as you walk down the tree
> isolation level decreases, but it's certainly not feeling right.
> 
> > > 	1) The FooBus isolation provider could refuse to initialize if
> > > the FooBus bridge is sharing an iso group with something else before
> > > it probes the FooBus.
> > 
> > Sharing, as in a parent group is managed?
> 
> No, just as in a group exists which contains both the FooBus bridge
> and something else.  Basically this option means that if you plugged
> the FooBus bridge into a slot that wasn't sufficiently isolated, it
> would simply refuse to work.
> 
> >  Yes, something special needs
> > to happen in that case, I'd probably investigate having it initialized
> > in a managed state versus not initialized at all.
> > 
> > > 	2) The FooBus isolation provider could be in the FooBus bridge
> > > driver - meaning that if someone tried to grab the PCI group including
> > > the FooBridge, the provider driver would be booted out, causing the
> > > FooBus groups to cease to exist (meaning the PCI group manager would
> > > never get lock while someone was already using the FooGroups).  In
> > 
> > Right, I don't know that they need to cease to exist, but if a manager
> > is attached anywhere above or below the group it wants to lock, it has
> > to be blocked.
> > 
> > > this case, it gets a bit complex.  When the FooBus isolation provider
> > > is active, the FooBus devices would be in their own groups, not the
> > > group of the FooBridge and its sibling.  When the FooBus isolation
> > > provider is removed, it would have to configure the FooBus IOMMU to a
> > > passthrough mode, and revert the FooBus devices to the parent's
> > > group.  Hm.  Complicated.
> > 
> > Yep.  I think we're arriving at the same point.  Groups are
> > hierarchical, but ownership via a manager cannot be nested.  So to
> > manage a group, we need to walk the entire tree of devices below each
> > device checking that none of the groups are managed and all the devices
> > are using the right driver, then walk up from the group to verify no
> > group of a parent device is managed.  Thanks,
> 
> Blargh.  I really, really hope we can come up with a simpler model
> than that.

Yep, I'm pretty well at the end of this experiment.  Honestly, I think
isolation groups are the wrong approach.  We're trying to wrap too many
concepts together and it's completely unmanageable.  I cannot see adding
the complexity we're talking about here to the core device model for
such a niche usage.  I think we're better off going back to the
iommu_device_group() and building that out into something more complete,
starting with group based iommu ops and a dma quirk infrastructure.
From there we can add some basic facilities to toggle driver autoprobe,
maybe setup notifications for the group, and hopefully include a way to
share iommu mappings between groups.  Anything much beyond that we
should probably leave for something like the vfio driver.  Thanks,

Alex
David Gibson - March 30, 2012, 4:19 a.m.
On Tue, Mar 27, 2012 at 01:34:43PM -0600, Alex Williamson wrote:
[snip]
> > > > this case, it gets a bit complex.  When the FooBus isolation provider
> > > > is active, the FooBus devices would be in their own groups, not the
> > > > group of the FooBridge and its sibling.  When the FooBus isolation
> > > > provider is removed, it would have to configure the FooBus IOMMU to a
> > > > passthrough mode, and revert the FooBus devices to the parent's
> > > > group.  Hm.  Complicated.
> > > 
> > > Yep.  I think we're arriving at the same point.  Groups are
> > > hierarchical, but ownership via a manager cannot be nested.  So to
> > > manage a group, we need to walk the entire tree of devices below each
> > > device checking that none of the groups are managed and all the devices
> > > are using the right driver, then walk up from the group to verify no
> > > group of a parent device is managed.  Thanks,
> > 
> > Blargh.  I really, really hope we can come up with a simpler model
> > than that.
> 
> Yep, I'm pretty well at the end of this experiment.  Honestly, I think
> isolation groups are the wrong approach.  We're trying to wrap too many
> concepts together and it's completely unmanageable.  I cannot see adding
> the complexity we're talking about here to the core device model for
> such a niche usage.  I think we're better off going back to the
> iommu_device_group() and building that out into something more complete,
> starting with group based iommu ops and a dma quirk infrastructure.
> >From there we can add some basic facilities to toggle driver autoprobe,
> maybe setup notifications for the group, and hopefully include a way to
> share iommu mappings between groups.  Anything much beyond that we
> should probably leave for something like the vfio driver.  Thanks,

Yes, well, I was hoping for a simpler model that didn't involve simply
sweeping all the issues under a rug.

Patch

diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 7be9f79..e98a5f3 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -189,4 +189,14 @@  config DMA_SHARED_BUFFER
 	  APIs extension; the file's descriptor can then be passed on to other
 	  driver.
 
+config ISOLATION_GROUPS
+	bool "Enable Isolation Group API"
+	default n
+	depends on EXPERIMENTAL && IOMMU_API
+	help
+	  This option enables grouping of devices into Isolation Groups
+	  which may be used by other subsystems to perform quirks across
+	  sets of devices as well as userspace drivers for guaranteeing
+	  devices are isolated from the rest of the system.
+
 endmenu
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 610f999..047b5f9 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -19,6 +19,7 @@  obj-$(CONFIG_MODULES)	+= module.o
 endif
 obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o
 obj-$(CONFIG_REGMAP)	+= regmap/
+obj-$(CONFIG_ISOLATION_GROUPS)	+= isolation.o
 
 ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG
 
diff --git a/drivers/base/base.h b/drivers/base/base.h
index b858dfd..376758a 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -1,4 +1,5 @@ 
 #include <linux/notifier.h>
+#include <linux/isolation.h>
 
 /**
  * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure.
@@ -108,6 +109,10 @@  extern int driver_probe_device(struct device_driver *drv, struct device *dev);
 static inline int driver_match_device(struct device_driver *drv,
 				      struct device *dev)
 {
+	if (isolation_group_managed(to_isolation_group(dev)) &&
+	    !isolation_group_manager_driver(to_isolation_group(dev), drv))
+		return 0;
+
 	return drv->bus->match ? drv->bus->match(dev, drv) : 1;
 }
 
diff --git a/drivers/base/isolation.c b/drivers/base/isolation.c
new file mode 100644
index 0000000..c01365c
--- /dev/null
+++ b/drivers/base/isolation.c
@@ -0,0 +1,798 @@ 
+/*
+ * Isolation group interface
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
+ * Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/isolation.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+
+static struct kset *isolation_kset;
+/* XXX add more complete locking, maybe rcu */
+static DEFINE_MUTEX(isolation_lock);
+static LIST_HEAD(isolation_groups);
+static LIST_HEAD(isolation_notifiers);
+
+/* Keep these private */
+struct isolation_manager_driver {
+	struct device_driver *drv;
+	struct list_head list;
+};
+
+struct isolation_manager {
+	struct list_head drivers;
+	/* Likely need managers to register some callbacks */
+};
+
+struct isolation_group {
+	struct list_head list;
+	struct list_head devices;
+	struct kobject kobj;
+	struct kobject *devices_kobj;
+	struct iommu_domain *iommu_domain;
+	struct iommu_ops *iommu_ops;
+	struct isolation_manager *manager;
+	uuid_le uuid;
+};
+
+struct isolation_device {
+	struct device *dev;
+	struct list_head list;
+};
+
+struct isolation_notifier {
+	struct bus_type *bus;
+	struct notifier_block nb;
+	struct blocking_notifier_head notifier;
+	struct list_head list;
+	int refcnt;
+};
+
+struct iso_attribute {
+	struct attribute attr;
+	ssize_t (*show)(struct isolation_group *group, char *buf);
+	ssize_t (*store)(struct isolation_group *group,
+			 const char *buf, size_t count);
+};
+
+#define to_iso_attr(_attr) container_of(_attr, struct iso_attribute, attr)
+#define to_iso_group(_kobj) container_of(_kobj, struct isolation_group, kobj)
+
+static ssize_t iso_attr_show(struct kobject *kobj, struct attribute *attr,
+			     char *buf)
+{
+	struct iso_attribute *iso_attr = to_iso_attr(attr);
+	struct isolation_group *group = to_iso_group(kobj);
+	ssize_t ret = -EIO;
+
+	if (iso_attr->show)
+		ret = iso_attr->show(group, buf);
+	return ret;
+}
+
+static ssize_t iso_attr_store(struct kobject *kobj, struct attribute *attr,
+			      const char *buf, size_t count)
+{
+	struct iso_attribute *iso_attr = to_iso_attr(attr);
+	struct isolation_group *group = to_iso_group(kobj);
+	ssize_t ret = -EIO;
+
+	if (iso_attr->store)
+		ret = iso_attr->store(group, buf, count);
+	return ret;
+}
+
+static void iso_release(struct kobject *kobj)
+{
+	struct isolation_group *group = to_iso_group(kobj);
+	kfree(group);
+}
+
+static const struct sysfs_ops iso_sysfs_ops = {
+	.show = iso_attr_show,
+	.store = iso_attr_store,
+};
+
+static struct kobj_type iso_ktype = {
+	.sysfs_ops = &iso_sysfs_ops,
+	.release = iso_release,
+};
+
+/*
+ * Create an isolation group.  Isolation group "providers" register a
+ * notifier for their bus(es) and call this to create a new isolation
+ * group.
+ */
+struct isolation_group *isolation_create_group(void)
+{
+	struct isolation_group *group, *tmp;
+	int ret;
+
+	if (unlikely(!isolation_kset))
+		return ERR_PTR(-ENODEV);
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+	if (!group)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_lock(&isolation_lock);
+
+	/*
+	 * Use a UUID for group identification simply because it seems wrong
+	 * to expose it as a kernel pointer (%p).  Neither is user friendly.
+	 * Mostly only expect userspace to do anything with this.
+	 */
+new_uuid:
+	uuid_le_gen(&group->uuid);
+
+	/* Unlikely, but nothing prevents duplicates */
+	list_for_each_entry(tmp, &isolation_groups, list)
+		if (memcmp(&group->uuid, &tmp->uuid, sizeof(group->uuid)) == 0)
+			goto new_uuid;
+
+	INIT_LIST_HEAD(&group->devices);
+	group->kobj.kset = isolation_kset;
+
+	ret = kobject_init_and_add(&group->kobj, &iso_ktype, NULL,
+				   "%pUl", &group->uuid);
+	if (ret) {
+		kfree(group);
+		mutex_unlock(&isolation_lock);
+		return ERR_PTR(ret);
+	}
+
+	/* Add a subdirectory to save links for devices withing the group. */
+	group->devices_kobj = kobject_create_and_add("devices", &group->kobj);
+	if (!group->devices_kobj) {
+		kobject_put(&group->kobj);
+		mutex_unlock(&isolation_lock);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	list_add(&group->list, &isolation_groups);
+
+	mutex_unlock(&isolation_lock);
+
+	return group;
+}
+
+/*
+ * Free isolation group.  XXX need to cleanup/reject based on manager status.
+ */
+int isolation_free_group(struct isolation_group *group)
+{
+	mutex_lock(&isolation_lock);
+
+	if (!list_empty(&group->devices)) {
+		mutex_unlock(&isolation_lock);
+		return -EBUSY;
+	}
+
+	list_del(&group->list);
+	kobject_put(group->devices_kobj);
+	kobject_put(&group->kobj);
+
+	mutex_unlock(&isolation_lock);
+	return 0;
+}
+
+/*
+ * Add a device to an isolation group.  Isolation groups start empty and
+ * must be told about the devices they contain.  Expect this to be called
+ * from isolation group providers via notifier.
+ */
+int isolation_group_add_dev(struct isolation_group *group, struct device *dev)
+{
+	struct isolation_device *device;
+	int ret = 0;
+
+	mutex_lock(&isolation_lock);
+
+	if (dev->isolation_group) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	device = kzalloc(sizeof(*device), GFP_KERNEL);
+	if (!device) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	device->dev = dev;
+
+	/* Cross link the device in sysfs */
+	ret = sysfs_create_link(&dev->kobj, &group->kobj,
+				"isolation_group");
+	if (ret) {
+		kfree(device);
+		goto out;
+	}
+	
+	ret = sysfs_create_link(group->devices_kobj, &dev->kobj,
+				kobject_name(&dev->kobj));
+	if (ret) {
+		sysfs_remove_link(&dev->kobj, "isolation_group");
+		kfree(device);
+		goto out;
+	}
+
+	list_add(&device->list, &group->devices);
+
+	dev->isolation_group = group;
+
+	if (group->iommu_domain) {
+		ret = group->iommu_ops->attach_dev(group->iommu_domain, dev);
+		if (ret) {
+			/* XXX fail device? */
+		}
+	}
+
+	/* XXX signal manager? */
+out:
+	mutex_unlock(&isolation_lock);
+
+	return 0;
+}
+
+/*
+ * Remove a device from an isolation group, likely because the device
+ * went away.
+ */
+int isolation_group_del_dev(struct device *dev)
+{
+	struct isolation_group *group = dev->isolation_group;
+	struct isolation_device *device;
+
+	if (!group)
+		return -ENODEV;
+
+	mutex_lock(&isolation_lock);
+
+	list_for_each_entry(device, &group->devices, list) {
+		if (device->dev == dev) {
+			/* XXX signal manager? */
+
+			if (group->iommu_domain)
+				group->iommu_ops->detach_dev(
+					group->iommu_domain, dev);
+			list_del(&device->list);
+			kfree(device);
+			dev->isolation_group = NULL;
+			sysfs_remove_link(group->devices_kobj,
+					  kobject_name(&dev->kobj));
+			sysfs_remove_link(&dev->kobj, "isolation_group");
+			break;
+		}
+	}
+			
+	mutex_unlock(&isolation_lock);
+
+	return 0;
+}
+
+/*
+ * Filter and call out to our own notifier chains
+ */
+static int isolation_bus_notify(struct notifier_block *nb,
+				unsigned long action, void *data)
+{
+	struct isolation_notifier *notifier =
+		container_of(nb, struct isolation_notifier, nb);
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		if (!dev->isolation_group)
+			blocking_notifier_call_chain(&notifier->notifier,
+					ISOLATION_NOTIFY_ADD_DEVICE, dev);
+		break;
+	case BUS_NOTIFY_DEL_DEVICE:
+		if (dev->isolation_group)
+			blocking_notifier_call_chain(&notifier->notifier,
+					ISOLATION_NOTIFY_DEL_DEVICE, dev);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * Wrapper for manual playback of BUS_NOTIFY_ADD_DEVICE when we add
+ * a new notifier.
+ */
+static int isolation_do_add_notify(struct device *dev, void *data)
+{
+	struct notifier_block *nb = data;
+
+	if (!dev->isolation_group)
+		nb->notifier_call(nb, ISOLATION_NOTIFY_ADD_DEVICE, dev);
+
+	return 0;
+}
+
+/*
+ * Register a notifier.  This is for isolation group providers.  It's
+ * possible we could have multiple isolation group providers for the same
+ * bus type, so we create a struct isolation_notifier per bus_type, each
+ * with a notifier block.  Providers are therefore welcome to register
+ * as many buses as they can handle.
+ */
+int isolation_register_notifier(struct bus_type *bus, struct notifier_block *nb)
+{
+	struct isolation_notifier *notifier;
+	bool found = false;
+	int ret;
+
+	mutex_lock(&isolation_lock);
+	list_for_each_entry(notifier, &isolation_notifiers, list) {
+		if (notifier->bus == bus) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
+		if (!notifier) {
+			mutex_unlock(&isolation_lock);
+			return -ENOMEM;	
+		}
+		notifier->bus = bus;
+		notifier->nb.notifier_call = isolation_bus_notify;
+		BLOCKING_INIT_NOTIFIER_HEAD(&notifier->notifier);
+		bus_register_notifier(bus, &notifier->nb);
+		list_add(&notifier->list, &isolation_notifiers);
+	}
+
+	ret = blocking_notifier_chain_register(&notifier->notifier, nb);
+	if (ret) {
+		if (notifier->refcnt == 0) {
+			bus_unregister_notifier(bus, &notifier->nb);
+			list_del(&notifier->list);
+			kfree(notifier);
+		}
+		mutex_unlock(&isolation_lock);
+		return ret;
+	}
+	notifier->refcnt++;
+
+	mutex_unlock(&isolation_lock);
+
+	bus_for_each_dev(bus, NULL, nb, isolation_do_add_notify);
+
+	return 0;
+}
+
+/*
+ * Unregister...
+ */
+int isolation_unregister_notifier(struct bus_type *bus,
+				  struct notifier_block *nb)
+{
+	struct isolation_notifier *notifier;
+	bool found = false;
+	int ret;
+
+	mutex_lock(&isolation_lock);
+	list_for_each_entry(notifier, &isolation_notifiers, list) {
+		if (notifier->bus == bus) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		mutex_unlock(&isolation_lock);
+		return -ENODEV;
+	}
+
+	ret = blocking_notifier_chain_unregister(&notifier->notifier, nb);
+	if (ret) {
+		mutex_unlock(&isolation_lock);
+		return ret;
+	}
+
+	/* Free at nobody left in the notifier block */
+	if (--notifier->refcnt == 0) {
+		bus_unregister_notifier(bus, &notifier->nb);
+		list_del(&notifier->list);
+		kfree(notifier);
+	}
+
+	mutex_unlock(&isolation_lock);
+
+	return 0;
+}
+
+/*
+ * Set iommu_ops on an isolation group.  Hopefully all DMA will eventually
+ * be done through isolation groups, for now, we just provide another
+ * interface to the iommu api.
+ */
+int isolation_group_set_iommu_ops(struct isolation_group *group,
+				  struct iommu_ops *iommu_ops)
+{
+	mutex_lock(&isolation_lock);
+
+	if (group->iommu_ops) {
+		mutex_unlock(&isolation_lock);
+		return -EBUSY;
+	}
+
+	group->iommu_ops = iommu_ops;
+
+	mutex_unlock(&isolation_lock);
+
+	return 0;
+}
+
+/*
+ * Attach all the devices for a group to the specified iommu domain.
+ */
+static int __isolation_group_domain_attach_devs(struct iommu_domain *domain,
+					        struct list_head *devices)
+{
+	struct isolation_device *device;
+	struct device *dev;
+	int ret = 0;
+
+	list_for_each_entry(device, devices, list) {
+		dev = device->dev;
+
+		ret = domain->ops->attach_dev(domain, dev);
+		if (ret) {
+			list_for_each_entry_continue_reverse(device,
+							     devices, list) {
+				dev = device->dev;
+				domain->ops->detach_dev(domain, dev);
+			}
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * And detach...
+ */
+static void __isolation_group_domain_detach_devs(struct iommu_domain *domain,
+						 struct list_head *devices)
+{
+	struct isolation_device *device;
+	struct device *dev;
+
+	list_for_each_entry(device, devices, list) {
+		dev = device->dev;
+
+		domain->ops->detach_dev(domain, dev);
+	}
+}
+
+/*
+ * Initialize the iommu domain for an isolation group.  This is to ease the
+ * transition to using isolation groups and expected to be used by current
+ * users of the iommu api for now.
+ */
+int isolation_group_init_iommu_domain(struct isolation_group *group)
+{
+	int ret;
+
+	mutex_lock(&isolation_lock);
+
+	if (!group->iommu_ops || list_empty(&group->devices)) {
+		mutex_unlock(&isolation_lock);
+		return -EINVAL;
+	}
+
+	if (group->iommu_domain) {
+		mutex_unlock(&isolation_lock);
+		return -EBUSY;
+	}
+
+	group->iommu_domain = kzalloc(sizeof(struct iommu_domain), GFP_KERNEL);
+	if (!group->iommu_domain) {
+		mutex_unlock(&isolation_lock);
+		return -ENOMEM;
+	}
+
+	group->iommu_domain->ops = group->iommu_ops;
+
+	ret = group->iommu_ops->domain_init(group->iommu_domain);
+	if (ret) {
+		kfree(group->iommu_domain);
+		group->iommu_domain = NULL;
+		mutex_unlock(&isolation_lock);
+		return ret;
+	}
+
+	/* Automatically attach all the devices in the group. */
+	ret = __isolation_group_domain_attach_devs(group->iommu_domain,
+						   &group->devices);
+	if (ret) {
+		group->iommu_ops->domain_destroy(group->iommu_domain);
+		kfree(group->iommu_domain);
+		group->iommu_domain = NULL;
+		mutex_unlock(&isolation_lock);
+		return ret;
+	}
+		
+	mutex_unlock(&isolation_lock);
+	return 0;
+}
+
+/*
+ * And free...
+ * Note just below, we add the ability to add another group to an iommu
+ * domain, so we don't always destroy and free the domain here.
+ */
+void isolation_group_free_iommu_domain(struct isolation_group *group)
+{
+	struct isolation_group *tmp;
+	bool inuse = false;
+
+	if (!group->iommu_domain || !group->iommu_ops)
+		return;
+
+	mutex_lock(&isolation_lock);
+
+	__isolation_group_domain_detach_devs(group->iommu_domain,
+					     &group->devices);
+
+	list_for_each_entry(tmp, &isolation_groups, list) {
+		if (tmp == group)
+			continue;
+		if (tmp->iommu_domain == group->iommu_domain) {
+			inuse = true;
+			break;
+		}
+	}
+
+	if (!inuse) {
+		group->iommu_ops->domain_destroy(group->iommu_domain);
+		kfree(group->iommu_domain);
+	}
+
+	group->iommu_domain = NULL;
+
+	mutex_unlock(&isolation_lock);
+}
+
+/*
+ * This handles the VFIO "merge" interface, allowing us to manage multiple
+ * isolation groups with a single domain.  We just rely on attach_dev to
+ * tell us whether this is ok.
+ */
+int isolation_group_iommu_domain_add_group(struct isolation_group *groupa,
+					   struct isolation_group *groupb)
+{
+	int ret;
+
+	if (!groupa->iommu_domain ||
+	    groupb->iommu_domain || list_empty(&groupb->devices))
+		return -EINVAL;
+
+	if (groupa->iommu_ops != groupb->iommu_ops)
+		return -EIO;
+
+	mutex_lock(&isolation_lock);
+
+	ret = __isolation_group_domain_attach_devs(groupa->iommu_domain,
+						   &groupb->devices);
+	if (ret) {
+		mutex_unlock(&isolation_lock);
+		return ret;
+	}
+
+	groupb->iommu_domain = groupa->iommu_domain;
+
+	mutex_unlock(&isolation_lock);
+
+	return 0;
+}
+
+/*
+ * XXX page mapping/unmapping and more iommu api wrappers
+ */
+
+/*
+ * Do we have a group manager?  Group managers restrict what drivers are
+ * allowed to attach to devices.  A group is "locked" when all of the devices
+ * for a group belong to group manager drivers (or no driver at all).  At
+ * that point, the group manager can initialize the iommu.  vfio is an example
+ * of a group manager and vfio-pci is an exanple of a driver that a group
+ * manager may add as a "managed" driver.  Note that we don't gate iommu
+ * manipulation on being managed because we want to use it for regular DMA
+ * at some point as well.
+ */
+bool isolation_group_managed(struct isolation_group *group)
+{
+	return group->manager != NULL;
+}
+
+/*
+ * Initialize the group manager struct.  At this point the isolation group
+ * becomes "managed".
+ */
+int isolation_group_init_manager(struct isolation_group *group)
+{
+	mutex_lock(&isolation_lock);
+
+	if (group->manager) {
+		mutex_unlock(&isolation_lock);
+		return -EBUSY;
+	}
+
+	group->manager = kzalloc(sizeof(struct isolation_manager), GFP_KERNEL);
+	if (!group->manager) {
+		mutex_unlock(&isolation_lock);
+		return -ENOMEM;
+	}
+
+	mutex_unlock(&isolation_lock);
+
+	return 0;
+}
+
+/*
+ * And free... cleanup registerd managed drivers too.
+ */
+void isolation_group_free_manager(struct isolation_group *group)
+{
+	struct isolation_manager_driver *driver, *tmp;
+
+	if (!group->manager)
+		return;
+
+	mutex_lock(&isolation_lock);
+
+	list_for_each_entry_safe(driver, tmp, &group->manager->drivers, list) {
+		list_del(&driver->list);
+		kfree(driver);
+	}
+		
+	kfree(group->manager);
+	group->manager = NULL;
+	mutex_unlock(&isolation_lock);
+}
+
+/*
+ * Add a managed driver.  Drivers added here are the only ones that will
+ * be allowed to attach to a managed isolation group.
+ */
+int isolation_group_manager_add_driver(struct isolation_group *group,
+				       struct device_driver *drv)
+{
+	struct isolation_manager_driver *driver;
+
+	if (!group->manager)
+		return -EINVAL;
+
+	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
+	if (!driver)
+		return -ENOMEM;
+
+	driver->drv = drv;
+
+	mutex_lock(&isolation_lock);
+
+	list_add(&driver->list, &group->manager->drivers);
+
+	mutex_unlock(&isolation_lock);
+
+	return 0;
+}
+
+/*
+ * And remove a driver.  Don't really expect to need this much.
+ */
+void isolation_group_manager_del_driver(struct isolation_group *group,
+				        struct device_driver *drv)
+{
+	struct isolation_manager_driver *driver;
+
+	if (!group->manager)
+		return;
+
+	mutex_lock(&isolation_lock);
+
+	list_for_each_entry(driver, &group->manager->drivers, list) {
+		if (driver->drv == drv) {
+			list_del(&driver->list);
+			kfree(driver);
+			break;
+		}
+	}
+
+	mutex_unlock(&isolation_lock);
+}
+
+/*
+ * Test whether a driver is a "managed" driver for the group.  This allows
+ * is to preempt normal driver matching and only let our drivers in.
+ */
+bool isolation_group_manager_driver(struct isolation_group *group,
+				    struct device_driver *drv)
+{
+	struct isolation_manager_driver *driver;
+	bool found = false;
+
+	if (!group->manager)
+		return found;
+
+	mutex_lock(&isolation_lock);
+
+	list_for_each_entry(driver, &group->manager->drivers, list) {
+		if (driver->drv == drv) {
+			found = true;
+			break;
+		}
+	}
+
+	mutex_unlock(&isolation_lock);
+
+	return found;
+}
+
+/*
+ * Does the group manager have control of all of the devices in the group?
+ * We consider driver-less devices to be ok here as they don't do DMA and
+ * don't present any interfaces to subvert the rest of the group.
+ */
+bool isolation_group_manager_locked(struct isolation_group *group)
+{
+	struct isolation_device *device;
+	struct isolation_manager_driver *driver;
+	bool found, locked = true;
+
+	if (!group->manager)
+		return false;
+
+	mutex_lock(&isolation_lock);
+
+	list_for_each_entry(device, &group->devices, list) {
+		found = false;
+
+		if (!device->dev->driver)
+			continue;
+
+		list_for_each_entry(driver, &group->manager->drivers, list) {
+			if (device->dev->driver == driver->drv) {
+				found = true;
+				break;
+			}
+		}
+
+		if (!found) {
+			locked = false;
+			break;
+		}
+	}
+
+	mutex_unlock(&isolation_lock);
+
+	return locked;
+}
+
+static int __init isolation_init(void)
+{
+	isolation_kset = kset_create_and_add("isolation", NULL, NULL);
+	
+	WARN(!isolation_kset, "Failed to initialize isolation group kset\n");
+
+	return isolation_kset ? 0 : -1;
+}
+subsys_initcall(isolation_init);
diff --git a/include/linux/device.h b/include/linux/device.h
index b63fb39..5805c56 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -663,6 +663,10 @@  struct device {
 
 	struct device_dma_parameters *dma_parms;
 
+#ifdef CONFIG_ISOLATION_GROUPS
+	struct isolation_group	*isolation_group;
+#endif
+
 	struct list_head	dma_pools;	/* dma pools (if dma'ble) */
 
 	struct dma_coherent_mem	*dma_mem; /* internal for coherent mem
diff --git a/include/linux/isolation.h b/include/linux/isolation.h
new file mode 100644
index 0000000..1d87caf
--- /dev/null
+++ b/include/linux/isolation.h
@@ -0,0 +1,138 @@ 
+/*
+ * Isolation group interface
+ *
+ * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
+ * Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#ifndef _LINUX_ISOLATION_H
+#define _LINUX_ISOLATION_H
+
+#define ISOLATION_NOTIFY_ADD_DEVICE	1
+#define ISOLATION_NOTIFY_DEL_DEVICE	2
+
+#ifdef CONFIG_ISOLATION_GROUPS
+
+extern struct isolation_group *isolation_create_group(void);
+extern int isolation_free_group(struct isolation_group *group);
+extern int isolation_group_add_dev(struct isolation_group *group,
+				   struct device *dev);
+extern int isolation_group_del_dev(struct device *dev);
+extern int isolation_register_notifier(struct bus_type *bus,
+				       struct notifier_block *nb);
+extern int isolation_unregister_notifier(struct bus_type *bus,
+					 struct notifier_block *nb);
+extern int isolation_group_set_iommu_ops(struct isolation_group *group,
+					 struct iommu_ops *iommu_ops);
+extern int isolation_group_init_iommu_domain(struct isolation_group *group);
+extern void isolation_group_free_iommu_domain(struct isolation_group *group);
+extern int isolation_group_iommu_domain_add_group(
+	struct isolation_group *groupa, struct isolation_group *groupb);
+extern bool isolation_group_managed(struct isolation_group *group);
+extern int isolation_group_init_manager(struct isolation_group *group);
+extern void isolation_group_free_manager(struct isolation_group *group);
+extern int isolation_group_manager_add_driver(struct isolation_group *group,
+					      struct device_driver *drv);
+extern void isolation_group_manager_del_driver(struct isolation_group *group,
+					       struct device_driver *drv);
+extern bool isolation_group_manager_driver(struct isolation_group *group,
+					   struct device_driver *drv);
+extern bool isolation_group_manager_locked(struct isolation_group *group);
+
+#define to_isolation_group(dev)	((dev)->isolation_group)
+
+#else /* CONFIG_ISOLATION_GROUPS */
+
+static inline struct isolation_group *isolation_create_group(void)
+{
+	return NULL;
+}
+
+static inline int isolation_free_group(struct isolation_group *group)
+{
+	return 0;
+}
+
+static inline int isolation_group_add_dev(struct isolation_group *group,
+					  struct device *dev)
+{
+	return 0;
+}
+
+static inline int isolation_group_del_dev(struct device *dev)
+{
+	return 0;
+}
+
+static int isolation_register_notifier(struct bus_type *bus,
+				       struct notifier_block *nb)
+{
+	return 0;
+}
+
+static int isolation_unregister_notifier(struct bus_type *bus,
+					 struct notifier_block *nb)
+{
+	return 0;
+}
+
+static int isolation_group_set_iommu_ops(struct isolation_group *group,
+					 struct iommu_ops *iommu_ops)
+{
+	return -ENOSYS;
+}
+
+static int isolation_group_init_iommu_domain(struct isolation_group *group)
+{
+	return -ENOSYS;
+}
+
+static void isolation_group_free_iommu_domain(struct isolation_group *group)
+{
+}
+
+static int isolation_group_iommu_domain_add_group(
+	struct isolation_group *groupa, struct isolation_group *groupb)
+{
+	return -ENOSYS;
+}
+
+static int isolation_group_init_manager(struct isolation_group *group)
+{
+	return -ENOSYS;
+}
+
+static void isolation_group_free_manager(struct isolation_group *group)
+{
+}
+
+static int isolation_group_manager_add_driver(struct isolation_group *group,
+					      struct device_driver *drv)
+{
+	return -ENOSYS;
+}
+
+static void isolation_group_manager_del_driver(struct isolation_group *group,
+					       struct device_driver *drv)
+{
+}
+
+static bool isolation_group_manager_locked(struct isolation_group *group)
+{
+	return false;
+}
+
+#define to_isolation_group(dev)	(NULL)
+
+#define isolation_group_managed(group) (false)
+
+#define isolation_group_manager_driver(group, drv) (false)
+
+#endif /* CONFIG_ISOLATION_GROUPS */
+
+#endif /* _LINUX_ISOLATION_H */