
[kernel,v9,31/32] vfio: powerpc/spapr: Support multiple groups in one container if possible

Message ID 1429964096-11524-32-git-send-email-aik@ozlabs.ru
State Superseded
Delegated to: Benjamin Herrenschmidt

Commit Message

Alexey Kardashevskiy April 25, 2015, 12:14 p.m. UTC
At the moment only one group per container is supported.
POWER8 CPUs have a more flexible design that allows having 2 TCE tables per
IOMMU group, so we can relax this limitation and support multiple groups
per container.

This adds TCE table descriptors to a container and uses iommu_table_group_ops
to create/set DMA windows on IOMMU groups so the same TCE tables will be
shared between several IOMMU groups.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
---
Changes:
v7:
* updated doc
---
 Documentation/vfio.txt              |   8 +-
 drivers/vfio/vfio_iommu_spapr_tce.c | 268 ++++++++++++++++++++++++++----------
 2 files changed, 199 insertions(+), 77 deletions(-)
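
In short: the change makes the container, rather than a single group, own the
TCE table descriptors, and keeps attached groups in a list. A condensed view
of the two structures, taken from the diff below:

struct tce_iommu_group {
	struct list_head next;		/* link in tce_container::group_list */
	struct iommu_group *grp;	/* an attached IOMMU group */
};

struct tce_container {
	struct mutex lock;
	bool enabled;
	unsigned long locked_pages;
	bool v2;
	/* TCE tables now live in the container, not in a single group */
	struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	/* all groups sharing the container's tables */
	struct list_head group_list;
};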

Comments

David Gibson April 30, 2015, 7:22 a.m. UTC | #1
On Sat, Apr 25, 2015 at 10:14:55PM +1000, Alexey Kardashevskiy wrote:
> At the moment only one group per container is supported.
> POWER8 CPUs have a more flexible design that allows having 2 TCE tables per
> IOMMU group, so we can relax this limitation and support multiple groups
> per container.

It's not obvious why allowing multiple TCE tables per PE has any
bearing on allowing multiple groups per container.

> This adds TCE table descriptors to a container and uses iommu_table_group_ops
> to create/set DMA windows on IOMMU groups so the same TCE tables will be
> shared between several IOMMU groups.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> [aw: for the vfio related changes]
> Acked-by: Alex Williamson <alex.williamson@redhat.com>
> ---
> Changes:
> v7:
> * updated doc
> ---
>  Documentation/vfio.txt              |   8 +-
>  drivers/vfio/vfio_iommu_spapr_tce.c | 268 ++++++++++++++++++++++++++----------
>  2 files changed, 199 insertions(+), 77 deletions(-)
> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index 94328c8..7dcf2b5 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
>  
>  This implementation has some specifics:
>  
> -1) Only one IOMMU group per container is supported as an IOMMU group
> -represents the minimal entity which isolation can be guaranteed for and
> -groups are allocated statically, one per a Partitionable Endpoint (PE)
> +1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
> +container is supported as an IOMMU table is allocated at the boot time,
> +one table per a IOMMU group which is a Partitionable Endpoint (PE)
>  (PE is often a PCI domain but not always).

I thought the more fundamental problem was that different PEs tended
to use disjoint bus address ranges, so even by duplicating put_tce
across PEs you couldn't have a common address space.

> +Newer systems (POWER8 with IODA2) have improved hardware design which allows
> +to remove this limitation and have multiple IOMMU groups per a VFIO container.
>  
>  2) The hardware supports so called DMA windows - the PCI address range
>  within which DMA transfer is allowed, any attempt to access address space
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> index a7d6729..970e3a2 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -82,6 +82,11 @@ static void decrement_locked_vm(long npages)
>   * into DMA'ble space using the IOMMU
>   */
>  
> +struct tce_iommu_group {
> +	struct list_head next;
> +	struct iommu_group *grp;
> +};
> +
>  /*
>   * The container descriptor supports only a single group per container.
>   * Required by the API as the container is not supplied with the IOMMU group
> @@ -89,10 +94,11 @@ static void decrement_locked_vm(long npages)
>   */
>  struct tce_container {
>  	struct mutex lock;
> -	struct iommu_group *grp;
>  	bool enabled;
>  	unsigned long locked_pages;
>  	bool v2;
> +	struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];

Hrm,  so here we have more copies of the full iommu_table structures,
which again muddies the lifetime.  The table_group pointer is
presumably meaningless in these copies, which seems dangerously
confusing.

> +	struct list_head group_list;
>  };
>  
>  static long tce_unregister_pages(struct tce_container *container,
> @@ -154,20 +160,20 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
>  	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
>  }
>  
> +static inline bool tce_groups_attached(struct tce_container *container)
> +{
> +	return !list_empty(&container->group_list);
> +}
> +
>  static struct iommu_table *spapr_tce_find_table(
>  		struct tce_container *container,
>  		phys_addr_t ioba)
>  {
>  	long i;
>  	struct iommu_table *ret = NULL;
> -	struct iommu_table_group *table_group;
> -
> -	table_group = iommu_group_get_iommudata(container->grp);
> -	if (!table_group)
> -		return NULL;
>  
>  	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> -		struct iommu_table *tbl = &table_group->tables[i];
> +		struct iommu_table *tbl = &container->tables[i];
>  		unsigned long entry = ioba >> tbl->it_page_shift;
>  		unsigned long start = tbl->it_offset;
>  		unsigned long end = start + tbl->it_size;
> @@ -186,9 +192,7 @@ static int tce_iommu_enable(struct tce_container *container)
>  	int ret = 0;
>  	unsigned long locked;
>  	struct iommu_table_group *table_group;
> -
> -	if (!container->grp)
> -		return -ENXIO;
> +	struct tce_iommu_group *tcegrp;
>  
>  	if (!current->mm)
>  		return -ESRCH; /* process exited */
> @@ -225,7 +229,12 @@ static int tce_iommu_enable(struct tce_container *container)
>  	 * as there is no way to know how much we should increment
>  	 * the locked_vm counter.
>  	 */
> -	table_group = iommu_group_get_iommudata(container->grp);
> +	if (!tce_groups_attached(container))
> +		return -ENODEV;
> +
> +	tcegrp = list_first_entry(&container->group_list,
> +			struct tce_iommu_group, next);
> +	table_group = iommu_group_get_iommudata(tcegrp->grp);
>  	if (!table_group)
>  		return -ENODEV;
>  
> @@ -257,6 +266,48 @@ static void tce_iommu_disable(struct tce_container *container)
>  	decrement_locked_vm(container->locked_pages);
>  }
>  
> +static long tce_iommu_create_table(struct iommu_table_group *table_group,
> +			int num,
> +			__u32 page_shift,
> +			__u64 window_size,
> +			__u32 levels,
> +			struct iommu_table *tbl)

With multiple groups (and therefore PEs) per container, this seems
wrong.  There's only one table_group per PE, so what's special about
the PE whose table group is passed in here?

> +{
> +	long ret, table_size;
> +
> +	table_size = table_group->ops->get_table_size(page_shift, window_size,
> +			levels);
> +	if (!table_size)
> +		return -EINVAL;
> +
> +	ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
> +	if (ret)
> +		return ret;
> +
> +	ret = table_group->ops->create_table(table_group, num,
> +			page_shift, window_size, levels, tbl);
> +
> +	WARN_ON(!ret && !tbl->it_ops->free);
> +	WARN_ON(!ret && (tbl->it_allocated_size != table_size));
> +
> +	if (ret)
> +		decrement_locked_vm(table_size >> PAGE_SHIFT);
> +
> +	return ret;
> +}
> +
> +static void tce_iommu_free_table(struct iommu_table *tbl)
> +{
> +	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
> +
> +	if (!tbl->it_size)
> +		return;
> +
> +	tbl->it_ops->free(tbl);

So, this is exactly the case where the lifetimes are badly confusing.
How can you be confident here that another copy of the iommu_table
struct isn't referencing the same TCE tables?

> +	decrement_locked_vm(pages);
> +	memset(tbl, 0, sizeof(*tbl));
> +}
> +
>  static void *tce_iommu_open(unsigned long arg)
>  {
>  	struct tce_container *container;
> @@ -271,19 +322,41 @@ static void *tce_iommu_open(unsigned long arg)
>  		return ERR_PTR(-ENOMEM);
>  
>  	mutex_init(&container->lock);
> +	INIT_LIST_HEAD_RCU(&container->group_list);

I see no other mentions of rcu related to this list, which doesn't
seem right.

>  	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
>  
>  	return container;
>  }
>  
> +static int tce_iommu_clear(struct tce_container *container,
> +		struct iommu_table *tbl,
> +		unsigned long entry, unsigned long pages);
> +
>  static void tce_iommu_release(void *iommu_data)
>  {
>  	struct tce_container *container = iommu_data;
> +	struct iommu_table_group *table_group;
> +	struct tce_iommu_group *tcegrp;
> +	long i;
>  
> -	WARN_ON(container->grp);
> +	while (tce_groups_attached(container)) {
> +		tcegrp = list_first_entry(&container->group_list,
> +				struct tce_iommu_group, next);
> +		table_group = iommu_group_get_iommudata(tcegrp->grp);
> +		tce_iommu_detach_group(iommu_data, tcegrp->grp);
> +	}
>  
> -	if (container->grp)
> -		tce_iommu_detach_group(iommu_data, container->grp);
> +	/*
> +	 * If VFIO created a table, it was not disposed
> +	 * by tce_iommu_detach_group() so do it now.
> +	 */
> +	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> +		struct iommu_table *tbl = &container->tables[i];
> +
> +		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
> +		tce_iommu_free_table(tbl);
> +	}
>  
>  	tce_iommu_disable(container);
>  	mutex_destroy(&container->lock);
> @@ -509,12 +582,15 @@ static long tce_iommu_ioctl(void *iommu_data,
>  
>  	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>  		struct vfio_iommu_spapr_tce_info info;
> +		struct tce_iommu_group *tcegrp;
>  		struct iommu_table_group *table_group;
>  
> -		if (WARN_ON(!container->grp))
> +		if (!tce_groups_attached(container))
>  			return -ENXIO;
>  
> -		table_group = iommu_group_get_iommudata(container->grp);
> +		tcegrp = list_first_entry(&container->group_list,
> +				struct tce_iommu_group, next);
> +		table_group = iommu_group_get_iommudata(tcegrp->grp);
>  
>  		if (!table_group)
>  			return -ENXIO;
> @@ -707,12 +783,20 @@ static long tce_iommu_ioctl(void *iommu_data,
>  		tce_iommu_disable(container);
>  		mutex_unlock(&container->lock);
>  		return 0;
> -	case VFIO_EEH_PE_OP:
> -		if (!container->grp)
> -			return -ENODEV;
>  
> -		return vfio_spapr_iommu_eeh_ioctl(container->grp,
> -						  cmd, arg);
> +	case VFIO_EEH_PE_OP: {
> +		struct tce_iommu_group *tcegrp;
> +
> +		ret = 0;
> +		list_for_each_entry(tcegrp, &container->group_list, next) {
> +			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
> +					cmd, arg);
> +			if (ret)
> +				return ret;

Hrm.  It occurs to me that EEH may need a way of referencing
individual groups.  Even if multiple PEs are referencing the same TCE
tables, presumably EEH will isolate them individually.

> +		}
> +		return ret;
> +	}
> +
>  	}
>  
>  	return -ENOTTY;
> @@ -724,11 +808,14 @@ static void tce_iommu_release_ownership(struct tce_container *container,
>  	int i;
>  
>  	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> -		struct iommu_table *tbl = &table_group->tables[i];
> +		struct iommu_table *tbl = &container->tables[i];
>  
>  		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
>  		if (tbl->it_map)
>  			iommu_release_ownership(tbl);
> +
> +		/* Reset the container's copy of the table descriptor */
> +		memset(tbl, 0, sizeof(*tbl));
>  	}
>  }
>  
> @@ -758,38 +845,56 @@ static int tce_iommu_take_ownership(struct iommu_table_group *table_group)
>  static int tce_iommu_attach_group(void *iommu_data,
>  		struct iommu_group *iommu_group)
>  {
> -	int ret;
> +	int ret, i;
>  	struct tce_container *container = iommu_data;
>  	struct iommu_table_group *table_group;
> +	struct tce_iommu_group *tcegrp = NULL;
> +	bool first_group = !tce_groups_attached(container);
>  
>  	mutex_lock(&container->lock);
>  
>  	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
>  			iommu_group_id(iommu_group), iommu_group); */
> -	if (container->grp) {
> -		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
> -				iommu_group_id(container->grp),
> -				iommu_group_id(iommu_group));
> -		ret = -EBUSY;
> -		goto unlock_exit;
> -	}
> -
> -	if (container->enabled) {
> -		pr_err("tce_vfio: attaching group #%u to enabled container\n",
> -				iommu_group_id(iommu_group));
> -		ret = -EBUSY;
> -		goto unlock_exit;
> -	}
> -
>  	table_group = iommu_group_get_iommudata(iommu_group);
> -	if (!table_group) {
> -		ret = -ENXIO;
> +
> +	if (!first_group && (!table_group->ops ||
> +			!table_group->ops->take_ownership ||
> +			!table_group->ops->release_ownership)) {
> +		ret = -EBUSY;
> +		goto unlock_exit;
> +	}
> +
> +	/* Check if new group has the same iommu_ops (i.e. compatible) */
> +	list_for_each_entry(tcegrp, &container->group_list, next) {
> +		struct iommu_table_group *table_group_tmp;
> +
> +		if (tcegrp->grp == iommu_group) {
> +			pr_warn("tce_vfio: Group %d is already attached\n",
> +					iommu_group_id(iommu_group));
> +			ret = -EBUSY;
> +			goto unlock_exit;
> +		}
> +		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
> +		if (table_group_tmp->ops != table_group->ops) {
> +			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
> +					iommu_group_id(iommu_group),
> +					iommu_group_id(tcegrp->grp));
> +			ret = -EPERM;
> +			goto unlock_exit;
> +		}
> +	}
> +
> +	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
> +	if (!tcegrp) {
> +		ret = -ENOMEM;
>  		goto unlock_exit;
>  	}
>  
>  	if (!table_group->ops || !table_group->ops->take_ownership ||
>  			!table_group->ops->release_ownership) {
>  		ret = tce_iommu_take_ownership(table_group);
> +		if (!ret)
> +			container->tables[0] = table_group->tables[0];
>  	} else if (!table_group->ops->create_table ||
>  			!table_group->ops->set_window) {
>  		WARN_ON_ONCE(1);
> @@ -801,23 +906,46 @@ static int tce_iommu_attach_group(void *iommu_data,
>  		 * the pages that has been explicitly mapped into the iommu
>  		 */
>  		table_group->ops->take_ownership(table_group);
> -		ret = table_group->ops->create_table(table_group,
> -				0, /* window number */
> -				IOMMU_PAGE_SHIFT_4K,
> -				table_group->tce32_size,
> -				1, /* default levels */
> -				&table_group->tables[0]);
> -		if (!ret)
> -			ret = table_group->ops->set_window(table_group, 0,
> -					&table_group->tables[0]);
> +		/*
> +		 * If it the first group attached, check if there is
> +		 * a default DMA window and create one if none as
> +		 * the userspace expects it to exist.
> +		 */
> +		if (first_group && !container->tables[0].it_size) {
> +			ret = tce_iommu_create_table(table_group,
> +					0, /* window number */
> +					IOMMU_PAGE_SHIFT_4K,
> +					table_group->tce32_size,
> +					1, /* default levels */
> +					&container->tables[0]);
> +			if (ret)
> +				goto unlock_exit;
> +		}
> +
> +		/* Set all windows to the new group */
> +		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> +			struct iommu_table *tbl = &container->tables[i];
> +
> +			if (!tbl->it_size)
> +				continue;
> +
> +			/* Set the default window to a new group */
> +			ret = table_group->ops->set_window(table_group, i, tbl);
> +			if (ret)
> +				break;
> +		}
>  	}
>  
>  	if (ret)
>  		goto unlock_exit;
>  
> -	container->grp = iommu_group;
> +	tcegrp->grp = iommu_group;
> +	list_add(&tcegrp->next, &container->group_list);
>  
>  unlock_exit:
> +	if (ret && tcegrp)
> +		kfree(tcegrp);
> +
>  	mutex_unlock(&container->lock);
>  
>  	return ret;
> @@ -828,25 +956,27 @@ static void tce_iommu_detach_group(void *iommu_data,
>  {
>  	struct tce_container *container = iommu_data;
>  	struct iommu_table_group *table_group;
> +	struct tce_iommu_group *tcegrp;
>  	long i;
> +	bool found = false;
>  
>  	mutex_lock(&container->lock);
> -	if (iommu_group != container->grp) {
> -		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
> -				iommu_group_id(iommu_group),
> -				iommu_group_id(container->grp));
> +
> +	list_for_each_entry(tcegrp, &container->group_list, next) {
> +		if (tcegrp->grp == iommu_group) {
> +			found = true;
> +			break;
> +		}
> +	}
> +
> +	if (!found) {
> +		pr_warn("tce_vfio: detaching unattached group #%u\n",
> +				iommu_group_id(iommu_group));
>  		goto unlock_exit;
>  	}
>  
> -	if (container->enabled) {
> -		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
> -				iommu_group_id(container->grp));
> -		tce_iommu_disable(container);
> -	}
> -
> -	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
> -	   iommu_group_id(iommu_group), iommu_group); */
> -	container->grp = NULL;
> +	list_del(&tcegrp->next);
> +	kfree(tcegrp);
>  
>  	table_group = iommu_group_get_iommudata(iommu_group);
>  	BUG_ON(!table_group);
> @@ -857,18 +987,8 @@ static void tce_iommu_detach_group(void *iommu_data,
>  	else if (!table_group->ops->unset_window)
>  		WARN_ON_ONCE(1);
>  	else {
> -		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> -			struct iommu_table tbl = table_group->tables[i];
> -
> -			if (!tbl.it_size)
> -				continue;
> -
> +		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
>  			table_group->ops->unset_window(table_group, i);
> -			tce_iommu_clear(container, &tbl,
> -					tbl.it_offset, tbl.it_size);
> -			if (tbl.it_ops->free)
> -				tbl.it_ops->free(&tbl);
> -		}
>  
>  		table_group->ops->release_ownership(table_group);
>  	}
Alexey Kardashevskiy April 30, 2015, 9:33 a.m. UTC | #2
On 04/30/2015 05:22 PM, David Gibson wrote:
> On Sat, Apr 25, 2015 at 10:14:55PM +1000, Alexey Kardashevskiy wrote:
>> At the moment only one group per container is supported.
>> POWER8 CPUs have a more flexible design that allows having 2 TCE tables per
>> IOMMU group, so we can relax this limitation and support multiple groups
>> per container.
>
> It's not obvious why allowing multiple TCE tables per PE has any
> bearing on allowing multiple groups per container.


This patchset is a global TCE tables rework (patches 1..30, roughly) with 2 
outcomes:
1. reusing the same IOMMU table for multiple groups - patch 31;
2. allowing dynamic create/remove of IOMMU tables - patch 32.

I can remove this one from the patchset and post it separately later, but 
since 1..30 aim to support both 1) and 2), I think I'd better keep them all 
together (it might explain some of the changes I do in 1..30).



>> This adds TCE table descriptors to a container and uses iommu_table_group_ops
>> to create/set DMA windows on IOMMU groups so the same TCE tables will be
>> shared between several IOMMU groups.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> [aw: for the vfio related changes]
>> Acked-by: Alex Williamson <alex.williamson@redhat.com>
>> ---
>> Changes:
>> v7:
>> * updated doc
>> ---
>>   Documentation/vfio.txt              |   8 +-
>>   drivers/vfio/vfio_iommu_spapr_tce.c | 268 ++++++++++++++++++++++++++----------
>>   2 files changed, 199 insertions(+), 77 deletions(-)
>>
>> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
>> index 94328c8..7dcf2b5 100644
>> --- a/Documentation/vfio.txt
>> +++ b/Documentation/vfio.txt
>> @@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
>>
>>   This implementation has some specifics:
>>
>> -1) Only one IOMMU group per container is supported as an IOMMU group
>> -represents the minimal entity which isolation can be guaranteed for and
>> -groups are allocated statically, one per a Partitionable Endpoint (PE)
>> +1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
>> +container is supported as an IOMMU table is allocated at the boot time,
>> +one table per a IOMMU group which is a Partitionable Endpoint (PE)
>>   (PE is often a PCI domain but not always).
>
> I thought the more fundamental problem was that different PEs tended
> to use disjoint bus address ranges, so even by duplicating put_tce
> across PEs you couldn't have a common address space.


Sorry, I am not following you here.

By duplicating put_tce, I can have multiple IOMMU groups on the same 
virtual PHB in QEMU, "[PATCH qemu v7 04/14] spapr_pci_vfio: Enable multiple 
groups per container" does this, and the address ranges will be the same.

What I cannot do on p5ioc2 is programming the same table to multiple 
physical PHBs (or I could but it is very different than IODA2 and pretty 
ugly and might not always be possible because I would have to allocate 
these pages from some common pool and face problems like fragmentation).



>> +Newer systems (POWER8 with IODA2) have improved hardware design which allows
>> +to remove this limitation and have multiple IOMMU groups per a VFIO container.
>>
>>   2) The hardware supports so called DMA windows - the PCI address range
>>   within which DMA transfer is allowed, any attempt to access address space
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
>> index a7d6729..970e3a2 100644
>> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -82,6 +82,11 @@ static void decrement_locked_vm(long npages)
>>    * into DMA'ble space using the IOMMU
>>    */
>>
>> +struct tce_iommu_group {
>> +	struct list_head next;
>> +	struct iommu_group *grp;
>> +};
>> +
>>   /*
>>    * The container descriptor supports only a single group per container.
>>    * Required by the API as the container is not supplied with the IOMMU group
>> @@ -89,10 +94,11 @@ static void decrement_locked_vm(long npages)
>>    */
>>   struct tce_container {
>>   	struct mutex lock;
>> -	struct iommu_group *grp;
>>   	bool enabled;
>>   	unsigned long locked_pages;
>>   	bool v2;
>> +	struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
>
> Hrm,  so here we have more copies of the full iommu_table structures,
> which again muddies the lifetime.  The table_group pointer is
> presumably meaningless in these copies, which seems dangerously
> confusing.


Ouch. This is bad. No, table_group is not pointless here as it is used to 
get to the PE number to invalidate the TCE cache. I just realized that 
although I need to update just a single table, I still have to invalidate 
the TCE cache for every attached group/PE, so I need a list of 
iommu_table_group's here, not a single pointer...
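
A minimal sketch of that fix (hypothetical, not part of this series: the
per-PE "invalidate" hook is assumed; the point is only the iteration over
all attached groups):

/* After updating one shared table, flush the TCE cache of every
 * attached group/PE, not just the group the table was created for. */
static void tce_iommu_invalidate_all(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long npages)
{
	struct tce_iommu_group *tcegrp;

	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group =
				iommu_group_get_iommudata(tcegrp->grp);

		/* ->invalidate stands for whatever per-PE TCE cache
		 * invalidation hook the platform would provide */
		if (table_group && table_group->ops->invalidate)
			table_group->ops->invalidate(table_group, tbl,
					entry, npages);
	}
}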



>> +	struct list_head group_list;
>>   };
>>
>>   static long tce_unregister_pages(struct tce_container *container,
>> @@ -154,20 +160,20 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
>>   	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
>>   }
>>
>> +static inline bool tce_groups_attached(struct tce_container *container)
>> +{
>> +	return !list_empty(&container->group_list);
>> +}
>> +
>>   static struct iommu_table *spapr_tce_find_table(
>>   		struct tce_container *container,
>>   		phys_addr_t ioba)
>>   {
>>   	long i;
>>   	struct iommu_table *ret = NULL;
>> -	struct iommu_table_group *table_group;
>> -
>> -	table_group = iommu_group_get_iommudata(container->grp);
>> -	if (!table_group)
>> -		return NULL;
>>
>>   	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
>> -		struct iommu_table *tbl = &table_group->tables[i];
>> +		struct iommu_table *tbl = &container->tables[i];
>>   		unsigned long entry = ioba >> tbl->it_page_shift;
>>   		unsigned long start = tbl->it_offset;
>>   		unsigned long end = start + tbl->it_size;
>> @@ -186,9 +192,7 @@ static int tce_iommu_enable(struct tce_container *container)
>>   	int ret = 0;
>>   	unsigned long locked;
>>   	struct iommu_table_group *table_group;
>> -
>> -	if (!container->grp)
>> -		return -ENXIO;
>> +	struct tce_iommu_group *tcegrp;
>>
>>   	if (!current->mm)
>>   		return -ESRCH; /* process exited */
>> @@ -225,7 +229,12 @@ static int tce_iommu_enable(struct tce_container *container)
>>   	 * as there is no way to know how much we should increment
>>   	 * the locked_vm counter.
>>   	 */
>> -	table_group = iommu_group_get_iommudata(container->grp);
>> +	if (!tce_groups_attached(container))
>> +		return -ENODEV;
>> +
>> +	tcegrp = list_first_entry(&container->group_list,
>> +			struct tce_iommu_group, next);
>> +	table_group = iommu_group_get_iommudata(tcegrp->grp);
>>   	if (!table_group)
>>   		return -ENODEV;
>>
>> @@ -257,6 +266,48 @@ static void tce_iommu_disable(struct tce_container *container)
>>   	decrement_locked_vm(container->locked_pages);
>>   }
>>
>> +static long tce_iommu_create_table(struct iommu_table_group *table_group,
>> +			int num,
>> +			__u32 page_shift,
>> +			__u64 window_size,
>> +			__u32 levels,
>> +			struct iommu_table *tbl)
>
> With multiple groups (and therefore PEs) per container, this seems
> wrong.  There's only one table_group per PE, so what's special about
> the PE whose table group is passed in here?


The created table is allocated at the same NUMA node as table_group 
(pe->phb->hose->node). This does not make much sense if we put multiple 
groups into the same container, but we will recommend that people avoid 
putting groups from different NUMA nodes into the same container.

Also, the allocated table gets its bus offset initialized in create_table() 
(which is IODA2-specific knowledge). It is there to emphasize the fact that 
we do not get to choose where to map the window on the bus: it is hardcoded, 
and it is easier to deal with tables which have the offset set once. I could 
add a bus_offset parameter to set_window() but it would just be converted 
back to the window number.
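
To illustrate the "hardcoded" part (an IODA2 assumption for illustration, not
code from this patch: on IODA2 the second window conventionally starts at
1ULL << 59):

/* Sketch: the bus offset follows from the window number alone, so
 * set_window() does not need an explicit bus_offset argument. */
static __u64 window_num_to_bus_offset(int num)
{
	return num ? (1ULL << 59) : 0;	/* window 0 at 0, window 1 fixed */
}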



>> +{
>> +	long ret, table_size;
>> +
>> +	table_size = table_group->ops->get_table_size(page_shift, window_size,
>> +			levels);
>> +	if (!table_size)
>> +		return -EINVAL;
>> +
>> +	ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
>> +	if (ret)
>> +		return ret;
>> +
>> +	ret = table_group->ops->create_table(table_group, num,
>> +			page_shift, window_size, levels, tbl);
>> +
>> +	WARN_ON(!ret && !tbl->it_ops->free);
>> +	WARN_ON(!ret && (tbl->it_allocated_size != table_size));
>> +
>> +	if (ret)
>> +		decrement_locked_vm(table_size >> PAGE_SHIFT);
>> +
>> +	return ret;
>> +}
>> +
>> +static void tce_iommu_free_table(struct iommu_table *tbl)
>> +{
>> +	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
>> +
>> +	if (!tbl->it_size)
>> +		return;
>> +
>> +	tbl->it_ops->free(tbl);
>
> So, this is exactly the case where the lifetimes are badly confusing.
> How can you be confident here that another copy of the iommu_table
> struct isn't referencing the same TCE tables?


Create/remove window is handled by a single file driver. It is not like 
there are many of these tables. But yes, valid point.



>> +	decrement_locked_vm(pages);
>> +	memset(tbl, 0, sizeof(*tbl));
>> +}
>> +
>>   static void *tce_iommu_open(unsigned long arg)
>>   {
>>   	struct tce_container *container;
>> @@ -271,19 +322,41 @@ static void *tce_iommu_open(unsigned long arg)
>>   		return ERR_PTR(-ENOMEM);
>>
>>   	mutex_init(&container->lock);
>> +	INIT_LIST_HEAD_RCU(&container->group_list);
>
> I see no other mentions of rcu related to this list, which doesn't
> seem right.
>
>>   	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
>>
>>   	return container;
>>   }
>>
>> +static int tce_iommu_clear(struct tce_container *container,
>> +		struct iommu_table *tbl,
>> +		unsigned long entry, unsigned long pages);
>> +
>>   static void tce_iommu_release(void *iommu_data)
>>   {
>>   	struct tce_container *container = iommu_data;
>> +	struct iommu_table_group *table_group;
>> +	struct tce_iommu_group *tcegrp;
>> +	long i;
>>
>> -	WARN_ON(container->grp);
>> +	while (tce_groups_attached(container)) {
>> +		tcegrp = list_first_entry(&container->group_list,
>> +				struct tce_iommu_group, next);
>> +		table_group = iommu_group_get_iommudata(tcegrp->grp);
>> +		tce_iommu_detach_group(iommu_data, tcegrp->grp);
>> +	}
>>
>> -	if (container->grp)
>> -		tce_iommu_detach_group(iommu_data, container->grp);
>> +	/*
>> +	 * If VFIO created a table, it was not disposed
>> +	 * by tce_iommu_detach_group() so do it now.
>> +	 */
>> +	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
>> +		struct iommu_table *tbl = &container->tables[i];
>> +
>> +		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
>> +		tce_iommu_free_table(tbl);
>> +	}
>>
>>   	tce_iommu_disable(container);
>>   	mutex_destroy(&container->lock);
>> @@ -509,12 +582,15 @@ static long tce_iommu_ioctl(void *iommu_data,
>>
>>   	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>>   		struct vfio_iommu_spapr_tce_info info;
>> +		struct tce_iommu_group *tcegrp;
>>   		struct iommu_table_group *table_group;
>>
>> -		if (WARN_ON(!container->grp))
>> +		if (!tce_groups_attached(container))
>>   			return -ENXIO;
>>
>> -		table_group = iommu_group_get_iommudata(container->grp);
>> +		tcegrp = list_first_entry(&container->group_list,
>> +				struct tce_iommu_group, next);
>> +		table_group = iommu_group_get_iommudata(tcegrp->grp);
>>
>>   		if (!table_group)
>>   			return -ENXIO;
>> @@ -707,12 +783,20 @@ static long tce_iommu_ioctl(void *iommu_data,
>>   		tce_iommu_disable(container);
>>   		mutex_unlock(&container->lock);
>>   		return 0;
>> -	case VFIO_EEH_PE_OP:
>> -		if (!container->grp)
>> -			return -ENODEV;
>>
>> -		return vfio_spapr_iommu_eeh_ioctl(container->grp,
>> -						  cmd, arg);
>> +	case VFIO_EEH_PE_OP: {
>> +		struct tce_iommu_group *tcegrp;
>> +
>> +		ret = 0;
>> +		list_for_each_entry(tcegrp, &container->group_list, next) {
>> +			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
>> +					cmd, arg);
>> +			if (ret)
>> +				return ret;
>
> Hrm.  It occurs to me that EEH may need a way of referencing
> individual groups.  Even if multiple PEs are referencing the same TCE
> tables, presumably EEH will isolate them individually.


Well. I asked our EEH guy Gavin; he did not object to this change but I'll 
double check :)
Benjamin Herrenschmidt May 1, 2015, 12:46 a.m. UTC | #3
On Thu, 2015-04-30 at 19:33 +1000, Alexey Kardashevskiy wrote:
> On 04/30/2015 05:22 PM, David Gibson wrote:
> > On Sat, Apr 25, 2015 at 10:14:55PM +1000, Alexey Kardashevskiy wrote:
> >> At the moment only one group per container is supported.
> >> POWER8 CPUs have a more flexible design that allows having 2 TCE tables per
> >> IOMMU group, so we can relax this limitation and support multiple groups
> >> per container.
> >
> > It's not obvious why allowing multiple TCE tables per PE has any
> > bearing on allowing multiple groups per container.
> 
> 
> This patchset is a global TCE tables rework (patches 1..30, roughly) with 2 
> outcomes:
> 1. reusing the same IOMMU table for multiple groups - patch 31;
> 2. allowing dynamic create/remove of IOMMU tables - patch 32.
> 
> I can remove this one from the patchset and post it separately later, but 
> since 1..30 aim to support both 1) and 2), I think I'd better keep them all 
> together (it might explain some of the changes I do in 1..30).

I think you are talking past each other :-)

But yes, having 2 tables per group is orthogonal to the ability to
have multiple groups per container.

The latter is made possible on P8 in large part because each PE has its
own DMA address space (unlike P5IOC2 or P7IOC where a single address
space is segmented).

Also, on P8 you can actually make the TVT entries point to the same
table in memory, thus removing the need to duplicate the actual
tables (though you still have to duplicate the invalidations). I would
however recommend only sharing the table that way within a chip/node.
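
That sharing is what the attach loop in the patch implements: each of the
container's existing windows is programmed into the newly attached group,
condensed from the diff quoted earlier:

/* From tce_iommu_attach_group(): make the new group's windows
 * reference the container's existing tables. */
for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
	struct iommu_table *tbl = &container->tables[i];

	if (!tbl->it_size)
		continue;

	ret = table_group->ops->set_window(table_group, i, tbl);
	if (ret)
		break;
}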

 .../..

> >>
> >> -1) Only one IOMMU group per container is supported as an IOMMU group
> >> -represents the minimal entity which isolation can be guaranteed for and
> >> -groups are allocated statically, one per a Partitionable Endpoint (PE)
> >> +1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
> >> +container is supported as an IOMMU table is allocated at the boot time,
> >> +one table per a IOMMU group which is a Partitionable Endpoint (PE)
> >>   (PE is often a PCI domain but not always).

> > I thought the more fundamental problem was that different PEs tended
> > to use disjoint bus address ranges, so even by duplicating put_tce
> > across PEs you couldn't have a common address space.

Yes. This is the problem with P7IOC and earlier. It *could* be doable on
P7IOC by making them the same PE but let's not go there.

> Sorry, I am not following you here.
> 
> By duplicating put_tce, I can have multiple IOMMU groups on the same 
> virtual PHB in QEMU, "[PATCH qemu v7 04/14] spapr_pci_vfio: Enable multiple 
> groups per container" does this, and the address ranges will be the same.

But that is only possible on P8 because only there do we have separate
address spaces between PEs.

> What I cannot do on p5ioc2 is programming the same table to multiple 
> physical PHBs (or I could but it is very different than IODA2 and pretty 
> ugly and might not always be possible because I would have to allocate 
> these pages from some common pool and face problems like fragmentation).

And P7IOC has a similar issue. The DMA address top bits index the
window on P7IOC within a shared address space. It's possible to
configure a TVT to cover multiple devices but with very serious
limitations.

> >> +Newer systems (POWER8 with IODA2) have improved hardware design which allows
> >> +to remove this limitation and have multiple IOMMU groups per a VFIO container.
> >>
> >>   2) The hardware supports so called DMA windows - the PCI address range
> >>   within which DMA transfer is allowed, any attempt to access address space
> >> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> >> index a7d6729..970e3a2 100644
> >> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> >> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >> @@ -82,6 +82,11 @@ static void decrement_locked_vm(long npages)
> >>    * into DMA'ble space using the IOMMU
> >>    */
> >>
> >> +struct tce_iommu_group {
> >> +	struct list_head next;
> >> +	struct iommu_group *grp;
> >> +};
> >> +
> >>   /*
> >>    * The container descriptor supports only a single group per container.
> >>    * Required by the API as the container is not supplied with the IOMMU group
> >> @@ -89,10 +94,11 @@ static void decrement_locked_vm(long npages)
> >>    */
> >>   struct tce_container {
> >>   	struct mutex lock;
> >> -	struct iommu_group *grp;
> >>   	bool enabled;
> >>   	unsigned long locked_pages;
> >>   	bool v2;
> >> +	struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
> >
> > Hrm,  so here we have more copies of the full iommu_table structures,
> > which again muddies the lifetime.  The table_group pointer is
> > presumably meaningless in these copies, which seems dangerously
> > confusing.
> 
> 
> Ouch. This is bad. No, table_group is not pointless here as it is used to 
> get to the PE number to invalidate the TCE cache. I just realized that 
> although I need to update just a single table, I still have to invalidate 
> the TCE cache for every attached group/PE, so I need a list of 
> iommu_table_group's here, not a single pointer...
> 
> 
> 
> >> +	struct list_head group_list;
> >>   };
> >>
> >>   static long tce_unregister_pages(struct tce_container *container,
> >> @@ -154,20 +160,20 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
> >>   	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
> >>   }
> >>
> >> +static inline bool tce_groups_attached(struct tce_container *container)
> >> +{
> >> +	return !list_empty(&container->group_list);
> >> +}
> >> +
> >>   static struct iommu_table *spapr_tce_find_table(
> >>   		struct tce_container *container,
> >>   		phys_addr_t ioba)
> >>   {
> >>   	long i;
> >>   	struct iommu_table *ret = NULL;
> >> -	struct iommu_table_group *table_group;
> >> -
> >> -	table_group = iommu_group_get_iommudata(container->grp);
> >> -	if (!table_group)
> >> -		return NULL;
> >>
> >>   	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> >> -		struct iommu_table *tbl = &table_group->tables[i];
> >> +		struct iommu_table *tbl = &container->tables[i];
> >>   		unsigned long entry = ioba >> tbl->it_page_shift;
> >>   		unsigned long start = tbl->it_offset;
> >>   		unsigned long end = start + tbl->it_size;
> >> @@ -186,9 +192,7 @@ static int tce_iommu_enable(struct tce_container *container)
> >>   	int ret = 0;
> >>   	unsigned long locked;
> >>   	struct iommu_table_group *table_group;
> >> -
> >> -	if (!container->grp)
> >> -		return -ENXIO;
> >> +	struct tce_iommu_group *tcegrp;
> >>
> >>   	if (!current->mm)
> >>   		return -ESRCH; /* process exited */
> >> @@ -225,7 +229,12 @@ static int tce_iommu_enable(struct tce_container *container)
> >>   	 * as there is no way to know how much we should increment
> >>   	 * the locked_vm counter.
> >>   	 */
> >> -	table_group = iommu_group_get_iommudata(container->grp);
> >> +	if (!tce_groups_attached(container))
> >> +		return -ENODEV;
> >> +
> >> +	tcegrp = list_first_entry(&container->group_list,
> >> +			struct tce_iommu_group, next);
> >> +	table_group = iommu_group_get_iommudata(tcegrp->grp);
> >>   	if (!table_group)
> >>   		return -ENODEV;
> >>
> >> @@ -257,6 +266,48 @@ static void tce_iommu_disable(struct tce_container *container)
> >>   	decrement_locked_vm(container->locked_pages);
> >>   }
> >>
> >> +static long tce_iommu_create_table(struct iommu_table_group *table_group,
> >> +			int num,
> >> +			__u32 page_shift,
> >> +			__u64 window_size,
> >> +			__u32 levels,
> >> +			struct iommu_table *tbl)
> >
> > With multiple groups (and therefore PEs) per container, this seems
> > wrong.  There's only one table_group per PE, so what's special about
> > the PE whose table group is passed in here?
> 
> 
> The created table is allocated at the same NUMA node as table_group 
> (pe->phb->hose->node). This does not make much sense if we put multiple 
> groups into the same container, but we will recommend that people avoid 
> putting groups from different NUMA nodes into the same container.
> 
> Also, the allocated table gets its bus offset initialized in create_table() 
> (which is IODA2-specific knowledge). It is there to emphasize the fact that 
> we do not get to choose where to map the window on the bus: it is hardcoded, 
> and it is easier to deal with tables which have the offset set once. I could 
> add a bus_offset parameter to set_window() but it would just be converted 
> back to the window number.
> 
> 
> 
> >> +{
> >> +	long ret, table_size;
> >> +
> >> +	table_size = table_group->ops->get_table_size(page_shift, window_size,
> >> +			levels);
> >> +	if (!table_size)
> >> +		return -EINVAL;
> >> +
> >> +	ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	ret = table_group->ops->create_table(table_group, num,
> >> +			page_shift, window_size, levels, tbl);
> >> +
> >> +	WARN_ON(!ret && !tbl->it_ops->free);
> >> +	WARN_ON(!ret && (tbl->it_allocated_size != table_size));
> >> +
> >> +	if (ret)
> >> +		decrement_locked_vm(table_size >> PAGE_SHIFT);
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static void tce_iommu_free_table(struct iommu_table *tbl)
> >> +{
> >> +	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
> >> +
> >> +	if (!tbl->it_size)
> >> +		return;
> >> +
> >> +	tbl->it_ops->free(tbl);
> >
> > So, this is exactly the case where the lifetimes are badly confusing.
> > How can you be confident here that another copy of the iommu_table
> > struct isn't referencing the same TCE tables?
> 
> 
> Create/remove window is handled by a single file driver. It is not like 
> there are many of these tables. But yes, valid point.
> 
> 
> 
> >> +	decrement_locked_vm(pages);
> >> +	memset(tbl, 0, sizeof(*tbl));
> >> +}
> >> +
> >>   static void *tce_iommu_open(unsigned long arg)
> >>   {
> >>   	struct tce_container *container;
> >> @@ -271,19 +322,41 @@ static void *tce_iommu_open(unsigned long arg)
> >>   		return ERR_PTR(-ENOMEM);
> >>
> >>   	mutex_init(&container->lock);
> >> +	INIT_LIST_HEAD_RCU(&container->group_list);
> >
> > I see no other mentions of rcu related to this list, which doesn't
> > seem right.
> >
> >>   	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
> >>
> >>   	return container;
> >>   }
> >>
> >> +static int tce_iommu_clear(struct tce_container *container,
> >> +		struct iommu_table *tbl,
> >> +		unsigned long entry, unsigned long pages);
> >> +
> >>   static void tce_iommu_release(void *iommu_data)
> >>   {
> >>   	struct tce_container *container = iommu_data;
> >> +	struct iommu_table_group *table_group;
> >> +	struct tce_iommu_group *tcegrp;
> >> +	long i;
> >>
> >> -	WARN_ON(container->grp);
> >> +	while (tce_groups_attached(container)) {
> >> +		tcegrp = list_first_entry(&container->group_list,
> >> +				struct tce_iommu_group, next);
> >> +		table_group = iommu_group_get_iommudata(tcegrp->grp);
> >> +		tce_iommu_detach_group(iommu_data, tcegrp->grp);
> >> +	}
> >>
> >> -	if (container->grp)
> >> -		tce_iommu_detach_group(iommu_data, container->grp);
> >> +	/*
> >> +	 * If VFIO created a table, it was not disposed
> >> +	 * by tce_iommu_detach_group() so do it now.
> >> +	 */
> >> +	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> >> +		struct iommu_table *tbl = &container->tables[i];
> >> +
> >> +		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
> >> +		tce_iommu_free_table(tbl);
> >> +	}
> >>
> >>   	tce_iommu_disable(container);
> >>   	mutex_destroy(&container->lock);
> >> @@ -509,12 +582,15 @@ static long tce_iommu_ioctl(void *iommu_data,
> >>
> >>   	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> >>   		struct vfio_iommu_spapr_tce_info info;
> >> +		struct tce_iommu_group *tcegrp;
> >>   		struct iommu_table_group *table_group;
> >>
> >> -		if (WARN_ON(!container->grp))
> >> +		if (!tce_groups_attached(container))
> >>   			return -ENXIO;
> >>
> >> -		table_group = iommu_group_get_iommudata(container->grp);
> >> +		tcegrp = list_first_entry(&container->group_list,
> >> +				struct tce_iommu_group, next);
> >> +		table_group = iommu_group_get_iommudata(tcegrp->grp);
> >>
> >>   		if (!table_group)
> >>   			return -ENXIO;
> >> @@ -707,12 +783,20 @@ static long tce_iommu_ioctl(void *iommu_data,
> >>   		tce_iommu_disable(container);
> >>   		mutex_unlock(&container->lock);
> >>   		return 0;
> >> -	case VFIO_EEH_PE_OP:
> >> -		if (!container->grp)
> >> -			return -ENODEV;
> >>
> >> -		return vfio_spapr_iommu_eeh_ioctl(container->grp,
> >> -						  cmd, arg);
> >> +	case VFIO_EEH_PE_OP: {
> >> +		struct tce_iommu_group *tcegrp;
> >> +
> >> +		ret = 0;
> >> +		list_for_each_entry(tcegrp, &container->group_list, next) {
> >> +			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
> >> +					cmd, arg);
> >> +			if (ret)
> >> +				return ret;
> >
> > Hrm.  It occurs to me that EEH may need a way of referencing
> > individual groups.  Even if multiple PEs are referencing the same TCE
> > tables, presumably EEH will isolate them individually.
> 
> 
> Well. I asked our EEH guy Gavin; he did not object to this change but I'll 
> double check :)
> 
> 
>
David Gibson May 1, 2015, 4:33 a.m. UTC | #4
On Thu, Apr 30, 2015 at 07:33:09PM +1000, Alexey Kardashevskiy wrote:
> On 04/30/2015 05:22 PM, David Gibson wrote:
> >On Sat, Apr 25, 2015 at 10:14:55PM +1000, Alexey Kardashevskiy wrote:
> >>At the moment only one group per container is supported.
> >>POWER8 CPUs have a more flexible design that allows having 2 TCE tables per
> >>IOMMU group, so we can relax this limitation and support multiple groups
> >>per container.
> >
> >It's not obvious why allowing multiple TCE tables per PE has any
> >bearing on allowing multiple groups per container.
> 
> 
> This patchset is a global TCE tables rework (patches 1..30, roughly) with 2
> outcomes:
> 1. reusing the same IOMMU table for multiple groups - patch 31;
> 2. allowing dynamic create/remove of IOMMU tables - patch 32.
> 
> I can remove this one from the patchset and post it separately later, but
> since 1..30 aim to support both 1) and 2), I think I'd better keep them all
> together (it might explain some of the changes I do in 1..30).

The combined patchset is fine.  My comment is because your commit
message says that multiple groups are possible *because* 2 TCE tables
per group are allowed, and it's not at all clear why one follows from
the other.

> >>This adds TCE table descriptors to a container and uses iommu_table_group_ops
> >>to create/set DMA windows on IOMMU groups so the same TCE tables will be
> >>shared between several IOMMU groups.
> >>
> >>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>[aw: for the vfio related changes]
> >>Acked-by: Alex Williamson <alex.williamson@redhat.com>
> >>---
> >>Changes:
> >>v7:
> >>* updated doc
> >>---
> >>  Documentation/vfio.txt              |   8 +-
> >>  drivers/vfio/vfio_iommu_spapr_tce.c | 268 ++++++++++++++++++++++++++----------
> >>  2 files changed, 199 insertions(+), 77 deletions(-)
> >>
> >>diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> >>index 94328c8..7dcf2b5 100644
> >>--- a/Documentation/vfio.txt
> >>+++ b/Documentation/vfio.txt
> >>@@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
> >>
> >>  This implementation has some specifics:
> >>
> >>-1) Only one IOMMU group per container is supported as an IOMMU group
> >>-represents the minimal entity which isolation can be guaranteed for and
> >>-groups are allocated statically, one per a Partitionable Endpoint (PE)
> >>+1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
> >>+container is supported as an IOMMU table is allocated at the boot time,
> >>+one table per a IOMMU group which is a Partitionable Endpoint (PE)
> >>  (PE is often a PCI domain but not always).
> >
> >I thought the more fundamental problem was that different PEs tended
> >to use disjoint bus address ranges, so even by duplicating put_tce
> >across PEs you couldn't have a common address space.
> 
> 
> Sorry, I am not following you here.
> 
> By duplicating put_tce, I can have multiple IOMMU groups on the same virtual
> PHB in QEMU, "[PATCH qemu v7 04/14] spapr_pci_vfio: Enable multiple groups
> per container" does this, and the address ranges will be the same.

Oh, ok.  For some reason I thought that (at least on the older
machines) the different PEs used different and not easily changeable
DMA windows in bus address space.

> What I cannot do on p5ioc2 is programming the same table to multiple
> physical PHBs (or I could but it is very different than IODA2 and pretty
> ugly and might not always be possible because I would have to allocate these
> pages from some common pool and face problems like fragmentation).

So allowing multiple groups per container should be possible (at the
kernel rather than qemu level) by writing the same value to multiple
TCE tables.  I guess it's not worth doing for just the almost-obsolete
IOMMUs though.
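
A rough sketch of that duplication idea (hypothetical, never implemented
here: tce_build_one() is a made-up stand-in for the platform's per-table
TCE write):

/* Write the same TCE into each attached group's private table. */
static int tce_put_duplicated(struct tce_container *container,
		unsigned long entry, unsigned long tce)
{
	struct tce_iommu_group *tcegrp;
	int ret = 0;

	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group =
				iommu_group_get_iommudata(tcegrp->grp);

		ret = tce_build_one(&table_group->tables[0], entry, tce);
		if (ret)
			break;
	}
	return ret;
}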

> 
> 
> 
> >>+Newer systems (POWER8 with IODA2) have improved hardware design which allows
> >>+to remove this limitation and have multiple IOMMU groups per a VFIO container.
> >>
> >>  2) The hardware supports so called DMA windows - the PCI address range
> >>  within which DMA transfer is allowed, any attempt to access address space
> >>diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>index a7d6729..970e3a2 100644
> >>--- a/drivers/vfio/vfio_iommu_spapr_tce.c
> >>+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> >>@@ -82,6 +82,11 @@ static void decrement_locked_vm(long npages)
> >>   * into DMA'ble space using the IOMMU
> >>   */
> >>
> >>+struct tce_iommu_group {
> >>+	struct list_head next;
> >>+	struct iommu_group *grp;
> >>+};
> >>+
> >>  /*
> >>   * The container descriptor supports only a single group per container.
> >>   * Required by the API as the container is not supplied with the IOMMU group
> >>@@ -89,10 +94,11 @@ static void decrement_locked_vm(long npages)
> >>   */
> >>  struct tce_container {
> >>  	struct mutex lock;
> >>-	struct iommu_group *grp;
> >>  	bool enabled;
> >>  	unsigned long locked_pages;
> >>  	bool v2;
> >>+	struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
> >
> >Hrm,  so here we have more copies of the full iommu_table structures,
> >which again muddies the lifetime.  The table_group pointer is
> >presumably meaningless in these copies, which seems dangerously
> >confusing.
> 
> 
> Ouch. This is bad. No, table_group is not pointless here as it is used to
> get to the PE number to invalidate the TCE cache. I just realized that
> although I need to update just a single table, I still have to invalidate
> the TCE cache for every attached group/PE, so I need a list of
> iommu_table_group's here, not a single pointer...

Right.

> >>+	struct list_head group_list;
> >>  };
> >>
> >>  static long tce_unregister_pages(struct tce_container *container,
> >>@@ -154,20 +160,20 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
> >>  	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
> >>  }
> >>
> >>+static inline bool tce_groups_attached(struct tce_container *container)
> >>+{
> >>+	return !list_empty(&container->group_list);
> >>+}
> >>+
> >>  static struct iommu_table *spapr_tce_find_table(
> >>  		struct tce_container *container,
> >>  		phys_addr_t ioba)
> >>  {
> >>  	long i;
> >>  	struct iommu_table *ret = NULL;
> >>-	struct iommu_table_group *table_group;
> >>-
> >>-	table_group = iommu_group_get_iommudata(container->grp);
> >>-	if (!table_group)
> >>-		return NULL;
> >>
> >>  	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> >>-		struct iommu_table *tbl = &table_group->tables[i];
> >>+		struct iommu_table *tbl = &container->tables[i];
> >>  		unsigned long entry = ioba >> tbl->it_page_shift;
> >>  		unsigned long start = tbl->it_offset;
> >>  		unsigned long end = start + tbl->it_size;
> >>@@ -186,9 +192,7 @@ static int tce_iommu_enable(struct tce_container *container)
> >>  	int ret = 0;
> >>  	unsigned long locked;
> >>  	struct iommu_table_group *table_group;
> >>-
> >>-	if (!container->grp)
> >>-		return -ENXIO;
> >>+	struct tce_iommu_group *tcegrp;
> >>
> >>  	if (!current->mm)
> >>  		return -ESRCH; /* process exited */
> >>@@ -225,7 +229,12 @@ static int tce_iommu_enable(struct tce_container *container)
> >>  	 * as there is no way to know how much we should increment
> >>  	 * the locked_vm counter.
> >>  	 */
> >>-	table_group = iommu_group_get_iommudata(container->grp);
> >>+	if (!tce_groups_attached(container))
> >>+		return -ENODEV;
> >>+
> >>+	tcegrp = list_first_entry(&container->group_list,
> >>+			struct tce_iommu_group, next);
> >>+	table_group = iommu_group_get_iommudata(tcegrp->grp);
> >>  	if (!table_group)
> >>  		return -ENODEV;
> >>
> >>@@ -257,6 +266,48 @@ static void tce_iommu_disable(struct tce_container *container)
> >>  	decrement_locked_vm(container->locked_pages);
> >>  }
> >>
> >>+static long tce_iommu_create_table(struct iommu_table_group *table_group,
> >>+			int num,
> >>+			__u32 page_shift,
> >>+			__u64 window_size,
> >>+			__u32 levels,
> >>+			struct iommu_table *tbl)
> >
> >With multiple groups (and therefore PEs) per container, this seems
>wrong.  There's only one table_group per PE, so what's special about
>the PE whose table group is passed in here?
> 
> 
> The created table is allocated at the same NUMA node as table_group
> (pe->phb->hose->node). This does not make much sense if we put multiple
> groups into the same container, but we will recommend that people avoid
> putting groups from different NUMA nodes into the same container.

Ok.  I guess the point is that the first PE attached has a special
bearing on how the table is configured, and that's what the
table_group here represents.
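
That is also why the attach path refuses to mix backends: a new group must
carry the same iommu_table_group_ops as the groups already attached,
condensed from the diff quoted earlier:

/* From tce_iommu_attach_group(): only groups with identical ops
 * (i.e. the same kind of IOMMU backend) may share a container. */
list_for_each_entry(tcegrp, &container->group_list, next) {
	struct iommu_table_group *table_group_tmp =
			iommu_group_get_iommudata(tcegrp->grp);

	if (table_group_tmp->ops != table_group->ops)
		return -EPERM;	/* incompatible groups */
}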

> Also, the allocated table gets its bus offset initialized in create_table()
> (which is IODA2-specific knowledge). It is there to emphasize the fact that
> we do not get to choose where to map the window on the bus: it is hardcoded,
> and it is easier to deal with tables which have the offset set once. I could
> add a bus_offset parameter to set_window() but it would just be converted
> back to the window number.
> 
> 
> 
> >>+{
> >>+	long ret, table_size;
> >>+
> >>+	table_size = table_group->ops->get_table_size(page_shift, window_size,
> >>+			levels);
> >>+	if (!table_size)
> >>+		return -EINVAL;
> >>+
> >>+	ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
> >>+	if (ret)
> >>+		return ret;
> >>+
> >>+	ret = table_group->ops->create_table(table_group, num,
> >>+			page_shift, window_size, levels, tbl);
> >>+
> >>+	WARN_ON(!ret && !tbl->it_ops->free);
> >>+	WARN_ON(!ret && (tbl->it_allocated_size != table_size));
> >>+
> >>+	if (ret)
> >>+		decrement_locked_vm(table_size >> PAGE_SHIFT);
> >>+
> >>+	return ret;
> >>+}
> >>+
> >>+static void tce_iommu_free_table(struct iommu_table *tbl)
> >>+{
> >>+	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
> >>+
> >>+	if (!tbl->it_size)
> >>+		return;
> >>+
> >>+	tbl->it_ops->free(tbl);
> >
> >So, this is exactly the case where the lifetimes are badly confusing.
> >How can you be confident here that another copy of the iommu_table
> >struct isn't referencing the same TCE tables?
> 
> 
> Create/remove window is handled by a single file driver. It is not like
> there are many of these tables. But yes, valid point.
> 
> 
> 
> >>+	decrement_locked_vm(pages);
> >>+	memset(tbl, 0, sizeof(*tbl));
> >>+}
> >>+
> >>  static void *tce_iommu_open(unsigned long arg)
> >>  {
> >>  	struct tce_container *container;
> >>@@ -271,19 +322,41 @@ static void *tce_iommu_open(unsigned long arg)
> >>  		return ERR_PTR(-ENOMEM);
> >>
> >>  	mutex_init(&container->lock);
> >>+	INIT_LIST_HEAD_RCU(&container->group_list);
> >
> >I see no other mentions of rcu related to this list, which doesn't
> >seem right.
> >
> >>  	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
> >>
> >>  	return container;
> >>  }
> >>
> >>+static int tce_iommu_clear(struct tce_container *container,
> >>+		struct iommu_table *tbl,
> >>+		unsigned long entry, unsigned long pages);
> >>+
> >>  static void tce_iommu_release(void *iommu_data)
> >>  {
> >>  	struct tce_container *container = iommu_data;
> >>+	struct iommu_table_group *table_group;
> >>+	struct tce_iommu_group *tcegrp;
> >>+	long i;
> >>
> >>-	WARN_ON(container->grp);
> >>+	while (tce_groups_attached(container)) {
> >>+		tcegrp = list_first_entry(&container->group_list,
> >>+				struct tce_iommu_group, next);
> >>+		table_group = iommu_group_get_iommudata(tcegrp->grp);
> >>+		tce_iommu_detach_group(iommu_data, tcegrp->grp);
> >>+	}
> >>
> >>-	if (container->grp)
> >>-		tce_iommu_detach_group(iommu_data, container->grp);
> >>+	/*
> >>+	 * If VFIO created a table, it was not disposed
> >>+	 * by tce_iommu_detach_group() so do it now.
> >>+	 */
> >>+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
> >>+		struct iommu_table *tbl = &container->tables[i];
> >>+
> >>+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
> >>+		tce_iommu_free_table(tbl);
> >>+	}
> >>
> >>  	tce_iommu_disable(container);
> >>  	mutex_destroy(&container->lock);
> >>@@ -509,12 +582,15 @@ static long tce_iommu_ioctl(void *iommu_data,
> >>
> >>  	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
> >>  		struct vfio_iommu_spapr_tce_info info;
> >>+		struct tce_iommu_group *tcegrp;
> >>  		struct iommu_table_group *table_group;
> >>
> >>-		if (WARN_ON(!container->grp))
> >>+		if (!tce_groups_attached(container))
> >>  			return -ENXIO;
> >>
> >>-		table_group = iommu_group_get_iommudata(container->grp);
> >>+		tcegrp = list_first_entry(&container->group_list,
> >>+				struct tce_iommu_group, next);
> >>+		table_group = iommu_group_get_iommudata(tcegrp->grp);
> >>
> >>  		if (!table_group)
> >>  			return -ENXIO;
> >>@@ -707,12 +783,20 @@ static long tce_iommu_ioctl(void *iommu_data,
> >>  		tce_iommu_disable(container);
> >>  		mutex_unlock(&container->lock);
> >>  		return 0;
> >>-	case VFIO_EEH_PE_OP:
> >>-		if (!container->grp)
> >>-			return -ENODEV;
> >>
> >>-		return vfio_spapr_iommu_eeh_ioctl(container->grp,
> >>-						  cmd, arg);
> >>+	case VFIO_EEH_PE_OP: {
> >>+		struct tce_iommu_group *tcegrp;
> >>+
> >>+		ret = 0;
> >>+		list_for_each_entry(tcegrp, &container->group_list, next) {
> >>+			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
> >>+					cmd, arg);
> >>+			if (ret)
> >>+				return ret;
> >
> >Hrm.  It occurs to me that EEH may need a way of referencing
> >individual groups.  Even if multiple PEs are referencing the same TCE
> >tables, presumably EEH will isolate them individually.
> 
> 
> Well. I asked our EEH guy Gavin; he did not object to this change but I'll
> double check :)
> 
> 
>
David Gibson May 1, 2015, 4:44 a.m. UTC | #5
On Fri, May 01, 2015 at 10:46:08AM +1000, Benjamin Herrenschmidt wrote:
> On Thu, 2015-04-30 at 19:33 +1000, Alexey Kardashevskiy wrote:
> > On 04/30/2015 05:22 PM, David Gibson wrote:
> > > On Sat, Apr 25, 2015 at 10:14:55PM +1000, Alexey Kardashevskiy wrote:
> > >> At the moment only one group per container is supported.
> > >> POWER8 CPUs have a more flexible design that allows having 2 TCE tables per
> > >> IOMMU group, so we can relax this limitation and support multiple groups
> > >> per container.
> > >
> > > It's not obvious why allowing multiple TCE tables per PE has any
> > > bearing on allowing multiple groups per container.
> > 
> > 
> > This patchset is a global TCE tables rework (patches 1..30, roughly) with 2 
> > outcomes:
> > 1. reusing the same IOMMU table for multiple groups - patch 31;
> > 2. allowing dynamic create/remove of IOMMU tables - patch 32.
> > 
> > I can remove this one from the patchset and post it separately later but 
> > since 1..30 aim to support both 1) and 2), I think I'd better keep them all
> > together (might explain some of changes I do in 1..30).
> 
> I think you are talking past each other :-)
> 
> But yes, having 2 tables per group is orthogonal to the ability of
> having multiple groups per container.
> 
> The latter is made possible on P8 in large part because each PE has its
> own DMA address space (unlike P5IOC2 or P7IOC where a single address
> space is segmented).
> 
> Also, on P8 you can actually make the TVT entries point to the same
> table in memory, thus removing the need to duplicate the actual
> tables (though you still have to duplicate the invalidations). I would
> however recommend only sharing the table that way within a chip/node.
> 
>  .../..
> 
> > >>
> > >> -1) Only one IOMMU group per container is supported as an IOMMU group
> > >> -represents the minimal entity which isolation can be guaranteed for and
> > >> -groups are allocated statically, one per a Partitionable Endpoint (PE)
> > >> +1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
> > >> +container is supported as an IOMMU table is allocated at the boot time,
> > >> +one table per a IOMMU group which is a Partitionable Endpoint (PE)
> > >>   (PE is often a PCI domain but not always).
> 
> > > I thought the more fundamental problem was that different PEs tended
> > > to use disjoint bus address ranges, so even by duplicating put_tce
> > > across PEs you couldn't have a common address space.
> 
> Yes. This is the problem with P7IOC and earlier. It *could* be doable on
> P7IOC by making them the same PE but let's not go there.
> 
> > Sorry, I am not following you here.
> > 
> > By duplicating put_tce, I can have multiple IOMMU groups on the same 
> > virtual PHB in QEMU, "[PATCH qemu v7 04/14] spapr_pci_vfio: Enable multiple 
> > groups per container" does this; the address ranges will be the same.
> 
> But that is only possible on P8 because only there do we have separate
> address spaces between PEs.
> 
> > What I cannot do on p5ioc2 is programming the same table to multiple 
> > physical PHBs (or I could but it is very different than IODA2 and pretty 
> > ugly and might not always be possible because I would have to allocate 
> > these pages from some common pool and face problems like fragmentation).
> 
> And P7IOC has a similar issue. The DMA address top bits index the
> window on P7IOC within a shared address space. It's possible to
> configure a TVT to cover multiple devices but with very serious
> limitations.

Ok.  To check my understanding, does this sound reasonable:

  * The table_group more-or-less represents a PE, but in a way you can
    reference without first knowing the specific IOMMU hardware type.

  * When attaching multiple groups to the same container, the first PE
    (i.e. table_group) attached is used as a representative so that
    subsequent groups can be checked for compatibility with the first
    PE and therefore all PEs currently included in the container

     - This is why the table_group appears in some places where it
       doesn't seem sensible from a pure object ownership point of
       view
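To put that check in code: a condensed sketch built from the structures
this patch adds (not a separate implementation) — any already-attached
group can stand in as the representative, since all attached groups were
validated against each other the same way:

static bool tce_group_is_compatible(struct tce_container *container,
		struct iommu_table_group *new_table_group)
{
	struct tce_iommu_group *tcegrp;

	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *tg =
				iommu_group_get_iommudata(tcegrp->grp);

		/* Same ops implies the same IOMMU flavour (e.g. IODA2) */
		if (tg->ops != new_table_group->ops)
			return false;
	}

	return true;
}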
Alexey Kardashevskiy May 1, 2015, 6:05 a.m. UTC | #6
On 05/01/2015 02:33 PM, David Gibson wrote:
> On Thu, Apr 30, 2015 at 07:33:09PM +1000, Alexey Kardashevskiy wrote:
>> On 04/30/2015 05:22 PM, David Gibson wrote:
>>> On Sat, Apr 25, 2015 at 10:14:55PM +1000, Alexey Kardashevskiy wrote:
>>>> At the moment only one group per container is supported.
>>>> POWER8 CPUs have a more flexible design and allow having 2 TCE tables per
>>>> IOMMU group so we can relax this limitation and support multiple groups
>>>> per container.
>>>
>>> It's not obvious why allowing multiple TCE tables per PE has any
>>> bearing on allowing multiple groups per container.
>>
>>
>> This patchset is a global TCE tables rework (patches 1..30, roughly) with 2
>> outcomes:
>> 1. reusing the same IOMMU table for multiple groups - patch 31;
>> 2. allowing dynamic create/remove of IOMMU tables - patch 32.
>>
>> I can remove this one from the patchset and post it separately later but
>> since 1..30 aim to support both 1) and 2), I think I'd better keep them all
>> together (might explain some of changes I do in 1..30).
>
> The combined patchset is fine.  My comment is because your commit
> message says that multiple groups are possible *because* 2 TCE tables
> per group are allowed, and it's not at all clear why one follows from
> the other.


Ah. That's wrong indeed, I'll fix it.


>>>> This adds TCE table descriptors to a container and uses iommu_table_group_ops
>>>> to create/set DMA windows on IOMMU groups so the same TCE tables will be
>>>> shared between several IOMMU groups.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> [aw: for the vfio related changes]
>>>> Acked-by: Alex Williamson <alex.williamson@redhat.com>
>>>> ---
>>>> Changes:
>>>> v7:
>>>> * updated doc
>>>> ---
>>>>   Documentation/vfio.txt              |   8 +-
>>>>   drivers/vfio/vfio_iommu_spapr_tce.c | 268 ++++++++++++++++++++++++++----------
>>>>   2 files changed, 199 insertions(+), 77 deletions(-)
>>>>
>>>> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
>>>> index 94328c8..7dcf2b5 100644
>>>> --- a/Documentation/vfio.txt
>>>> +++ b/Documentation/vfio.txt
>>>> @@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
>>>>
>>>>   This implementation has some specifics:
>>>>
>>>> -1) Only one IOMMU group per container is supported as an IOMMU group
>>>> -represents the minimal entity which isolation can be guaranteed for and
>>>> -groups are allocated statically, one per a Partitionable Endpoint (PE)
>>>> +1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
>>>> +container is supported as an IOMMU table is allocated at the boot time,
>>>> +one table per a IOMMU group which is a Partitionable Endpoint (PE)
>>>>   (PE is often a PCI domain but not always).
>>>
>>> I thought the more fundamental problem was that different PEs tended
>>> to use disjoint bus address ranges, so even by duplicating put_tce
>>> across PEs you couldn't have a common address space.
>>
>>
>> Sorry, I am not following you here.
>>
>> By duplicating put_tce, I can have multiple IOMMU groups on the same virtual
>> PHB in QEMU, "[PATCH qemu v7 04/14] spapr_pci_vfio: Enable multiple groups
>> per container" does this; the address ranges will be the same.
>
> Oh, ok.  For some reason I thought that (at least on the older
> machines) the different PEs used different and not easily changeable
> DMA windows in bus address space.


They do use different tables (which VFIO does not get to remove/create and 
uses these old helpers - iommu_take/release_ownership), correct. But all 
these windows are mapped at zero on a PE's PCI bus and nothing prevents me 
from updating all these tables with the same TCE values when handling 
H_PUT_TCE. Yes, it is slow, but it works (a bit more detail below).
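Roughly, that replay would look like the sketch below; tce_set_one() is a
hypothetical stand-in for the platform's per-table update primitive, not a
real kernel API:

/* Every PE's window starts at bus address 0, so the same ioba
 * indexes the same entry in each per-PE table. */
static long replay_put_tce(struct tce_container *container,
		unsigned long ioba, unsigned long tce)
{
	struct tce_iommu_group *tcegrp;
	long ret = 0;

	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *tg =
				iommu_group_get_iommudata(tcegrp->grp);
		struct iommu_table *tbl = &tg->tables[0];
		unsigned long entry = ioba >> tbl->it_page_shift;

		ret = tce_set_one(tbl, entry, tce);	/* hypothetical */
		if (ret)
			break;
	}

	return ret;
}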



>> What I cannot do on p5ioc2 is programming the same table to multiple
>> physical PHBs (or I could but it is very different than IODA2 and pretty
>> ugly and might not always be possible because I would have to allocate these
>> pages from some common pool and face problems like fragmentation).
>
> So allowing multiple groups per container should be possible (at the
> kernel rather than qemu level) by writing the same value to multiple
> TCE tables.  I guess it's not worth doing for just the almost-obsolete
> IOMMUs though.


It is done at QEMU level though. As it works now, QEMU opens a group, walks 
through all existing containers and tries attaching the new group there. If
that succeeds (always on x86; on POWER8 after this patch), a TCE table is shared.
If it fails, QEMU creates another container, attaches it to the same
VFIO/PHB address space and attaches a group there.

Then the only thing left is repeating ioctl() in vfio_container_ioctl() for 
every container in the VFIO address space; this is what that QEMU patch 
does (the first version of that patch called ioctl() only for the first 
container in the address space).

From the kernel perspective there are 2 isolated containers; I'd like to
keep it this way.
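
In userspace terms the walk boils down to something like this sketch
against the raw VFIO ioctls (QEMU's real code goes through its container
wrappers, so the helper below is illustrative only):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Try to share an existing container with a new group; fall back to
 * creating a fresh one if the kernel refuses the attach. */
static int attach_group_fd(int group_fd, int *container_fds, int n)
{
	int i, fd;

	for (i = 0; i < n; i++)
		if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER,
				&container_fds[i]) == 0)
			return container_fds[i];	/* TCE table shared */

	/* No compatible container: create and initialize a new one */
	fd = open("/dev/vfio/vfio", O_RDWR);
	if (fd < 0)
		return -1;

	if (ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &fd) != 0 ||
			ioctl(fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU) != 0) {
		close(fd);
		return -1;
	}

	return fd;
}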

btw thanks for the detailed review :)
David Gibson May 5, 2015, 11:50 a.m. UTC | #7
On Fri, May 01, 2015 at 04:05:24PM +1000, Alexey Kardashevskiy wrote:
> On 05/01/2015 02:33 PM, David Gibson wrote:
> >On Thu, Apr 30, 2015 at 07:33:09PM +1000, Alexey Kardashevskiy wrote:
> >>On 04/30/2015 05:22 PM, David Gibson wrote:
> >>>On Sat, Apr 25, 2015 at 10:14:55PM +1000, Alexey Kardashevskiy wrote:
> >>>>At the moment only one group per container is supported.
> >>>>POWER8 CPUs have a more flexible design and allow having 2 TCE tables per
> >>>>IOMMU group so we can relax this limitation and support multiple groups
> >>>>per container.
> >>>
> >>>It's not obvious why allowing multiple TCE tables per PE has any
> >>>bearing on allowing multiple groups per container.
> >>
> >>
> >>This patchset is a global TCE tables rework (patches 1..30, roughly) with 2
> >>outcomes:
> >>1. reusing the same IOMMU table for multiple groups - patch 31;
> >>2. allowing dynamic create/remove of IOMMU tables - patch 32.
> >>
> >>I can remove this one from the patchset and post it separately later but
> >>since 1..30 aim to support both 1) and 2), I think I'd better keep them all
> >>together (might explain some of changes I do in 1..30).
> >
> >The combined patchset is fine.  My comment is because your commit
> >message says that multiple groups are possible *because* 2 TCE tables
> >per group are allowed, and it's not at all clear why one follows from
> >the other.
> 
> 
> Ah. That's wrong indeed, I'll fix it.
> 
> 
> >>>>This adds TCE table descriptors to a container and uses iommu_table_group_ops
> >>>>to create/set DMA windows on IOMMU groups so the same TCE tables will be
> >>>>shared between several IOMMU groups.
> >>>>
> >>>>Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>>[aw: for the vfio related changes]
> >>>>Acked-by: Alex Williamson <alex.williamson@redhat.com>
> >>>>---
> >>>>Changes:
> >>>>v7:
> >>>>* updated doc
> >>>>---
> >>>>  Documentation/vfio.txt              |   8 +-
> >>>>  drivers/vfio/vfio_iommu_spapr_tce.c | 268 ++++++++++++++++++++++++++----------
> >>>>  2 files changed, 199 insertions(+), 77 deletions(-)
> >>>>
> >>>>diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> >>>>index 94328c8..7dcf2b5 100644
> >>>>--- a/Documentation/vfio.txt
> >>>>+++ b/Documentation/vfio.txt
> >>>>@@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
> >>>>
> >>>>  This implementation has some specifics:
> >>>>
> >>>>-1) Only one IOMMU group per container is supported as an IOMMU group
> >>>>-represents the minimal entity which isolation can be guaranteed for and
> >>>>-groups are allocated statically, one per a Partitionable Endpoint (PE)
> >>>>+1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
> >>>>+container is supported as an IOMMU table is allocated at the boot time,
> >>>>+one table per a IOMMU group which is a Partitionable Endpoint (PE)
> >>>>  (PE is often a PCI domain but not always).
> >>>
> >>>I thought the more fundamental problem was that different PEs tended
> >>>to use disjoint bus address ranges, so even by duplicating put_tce
> >>>across PEs you couldn't have a common address space.
> >>
> >>
> >>Sorry, I am not following you here.
> >>
> >>By duplicating put_tce, I can have multiple IOMMU groups on the same virtual
> >>PHB in QEMU, "[PATCH qemu v7 04/14] spapr_pci_vfio: Enable multiple groups
> >>per container" does this; the address ranges will be the same.
> >
> >Oh, ok.  For some reason I thought that (at least on the older
> >machines) the different PEs used different and not easily changeable
> >DMA windows in bus address space.
> 
> 
> They do use different tables (which VFIO does not get to remove/create and
> uses these old helpers - iommu_take/release_ownership), correct. But all
> these windows are mapped at zero on a PE's PCI bus and nothing prevents me
> from updating all these tables with the same TCE values when handling
> H_PUT_TCE. Yes, it is slow, but it works (a bit more detail below).

Um.. I'm pretty sure that contradicts what Ben was saying on the
thread.

> >>What I cannot do on p5ioc2 is programming the same table to multiple
> >>physical PHBs (or I could but it is very different than IODA2 and pretty
> >>ugly and might not always be possible because I would have to allocate these
> >>pages from some common pool and face problems like fragmentation).
> >
> >So allowing multiple groups per container should be possible (at the
> >kernel rather than qemu level) by writing the same value to multiple
> >TCE tables.  I guess it's not worth doing for just the almost-obsolete
> >IOMMUs though.
> 
> 
> It is done at QEMU level though. As it works now, QEMU opens a group, walks
> through all existing containers and tries attaching the new group there. If
> that succeeds (always on x86; on POWER8 after this patch), a TCE table is
> shared. If it fails, QEMU creates another container, attaches it to the same VFIO/PHB
> address space and attaches a group there.
> 
> Then the only thing left is repeating ioctl() in vfio_container_ioctl() for
> every container in the VFIO address space; this is what that QEMU patch does
> (the first version of that patch called ioctl() only for the first container
> in the address space).
> 
> From the kernel perspective there are 2 isolated containers; I'd like to
> keep it this way.
> 
> btw thanks for the detailed review :)
>
Alexey Kardashevskiy May 11, 2015, 2:26 a.m. UTC | #8
On 05/05/2015 09:50 PM, David Gibson wrote:
> On Fri, May 01, 2015 at 04:05:24PM +1000, Alexey Kardashevskiy wrote:
>> On 05/01/2015 02:33 PM, David Gibson wrote:
>>> On Thu, Apr 30, 2015 at 07:33:09PM +1000, Alexey Kardashevskiy wrote:
>>>> On 04/30/2015 05:22 PM, David Gibson wrote:
>>>>> On Sat, Apr 25, 2015 at 10:14:55PM +1000, Alexey Kardashevskiy wrote:
>>>>>> At the moment only one group per container is supported.
>>>>>> POWER8 CPUs have a more flexible design and allow having 2 TCE tables per
>>>>>> IOMMU group so we can relax this limitation and support multiple groups
>>>>>> per container.
>>>>>
>>>>> It's not obvious why allowing multiple TCE tables per PE has any
>>>>> bearing on allowing multiple groups per container.
>>>>
>>>>
>>>> This patchset is a global TCE tables rework (patches 1..30, roughly) with 2
>>>> outcomes:
>>>> 1. reusing the same IOMMU table for multiple groups - patch 31;
>>>> 2. allowing dynamic create/remove of IOMMU tables - patch 32.
>>>>
>>>> I can remove this one from the patchset and post it separately later but
>>>> since 1..30 aim to support both 1) and 2), I think I'd better keep them all
>>>> together (might explain some of changes I do in 1..30).
>>>
>>> The combined patchset is fine.  My comment is because your commit
>>> message says that multiple groups are possible *because* 2 TCE tables
>>> per group are allowed, and it's not at all clear why one follows from
>>> the other.
>>
>>
>> Ah. That's wrong indeed, I'll fix it.
>>
>>
>>>>>> This adds TCE table descriptors to a container and uses iommu_table_group_ops
>>>>>> to create/set DMA windows on IOMMU groups so the same TCE tables will be
>>>>>> shared between several IOMMU groups.
>>>>>>
>>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>>> [aw: for the vfio related changes]
>>>>>> Acked-by: Alex Williamson <alex.williamson@redhat.com>
>>>>>> ---
>>>>>> Changes:
>>>>>> v7:
>>>>>> * updated doc
>>>>>> ---
>>>>>>   Documentation/vfio.txt              |   8 +-
>>>>>>   drivers/vfio/vfio_iommu_spapr_tce.c | 268 ++++++++++++++++++++++++++----------
>>>>>>   2 files changed, 199 insertions(+), 77 deletions(-)
>>>>>>
>>>>>> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
>>>>>> index 94328c8..7dcf2b5 100644
>>>>>> --- a/Documentation/vfio.txt
>>>>>> +++ b/Documentation/vfio.txt
>>>>>> @@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
>>>>>>
>>>>>>   This implementation has some specifics:
>>>>>>
>>>>>> -1) Only one IOMMU group per container is supported as an IOMMU group
>>>>>> -represents the minimal entity which isolation can be guaranteed for and
>>>>>> -groups are allocated statically, one per a Partitionable Endpoint (PE)
>>>>>> +1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
>>>>>> +container is supported as an IOMMU table is allocated at the boot time,
>>>>>> +one table per a IOMMU group which is a Partitionable Endpoint (PE)
>>>>>>   (PE is often a PCI domain but not always).
>>>>>
>>>>> I thought the more fundamental problem was that different PEs tended
>>>>> to use disjoint bus address ranges, so even by duplicating put_tce
>>>>> across PEs you couldn't have a common address space.
>>>>
>>>>
>>>> Sorry, I am not following you here.
>>>>
>>>> By duplicating put_tce, I can have multiple IOMMU groups on the same virtual
>>>> PHB in QEMU, "[PATCH qemu v7 04/14] spapr_pci_vfio: Enable multiple groups
>>>> per container" does this; the address ranges will be the same.
>>>
>>> Oh, ok.  For some reason I thought that (at least on the older
>>> machines) the different PEs used different and not easily changeable
>>> DMA windows in bus address space.
>>
>>
>> They do use different tables (which VFIO does not get to remove/create and
>> uses these old helpers - iommu_take/release_ownership), correct. But all
>> these windows are mapped at zero on a PE's PCI bus and nothing prevents me
>> from updating all these tables with the same TCE values when handling
>> H_PUT_TCE. Yes, it is slow, but it works (a bit more detail below).
>
> Um.. I'm pretty sure that contradicts what Ben was saying on the
> thread.


True, it does contradict; I do not know why he said what he said :)
diff mbox

Patch

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 94328c8..7dcf2b5 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -289,10 +289,12 @@  PPC64 sPAPR implementation note
 
 This implementation has some specifics:
 
-1) Only one IOMMU group per container is supported as an IOMMU group
-represents the minimal entity which isolation can be guaranteed for and
-groups are allocated statically, one per a Partitionable Endpoint (PE)
+1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
+container is supported as an IOMMU table is allocated at the boot time,
+one table per a IOMMU group which is a Partitionable Endpoint (PE)
 (PE is often a PCI domain but not always).
+Newer systems (POWER8 with IODA2) have improved hardware design which allows
+to remove this limitation and have multiple IOMMU groups per a VFIO container.
 
 2) The hardware supports so called DMA windows - the PCI address range
 within which DMA transfer is allowed, any attempt to access address space
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index a7d6729..970e3a2 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -82,6 +82,11 @@  static void decrement_locked_vm(long npages)
  * into DMA'ble space using the IOMMU
  */
 
+struct tce_iommu_group {
+	struct list_head next;
+	struct iommu_group *grp;
+};
+
 /*
  * The container descriptor supports only a single group per container.
  * Required by the API as the container is not supplied with the IOMMU group
@@ -89,10 +94,11 @@  static void decrement_locked_vm(long npages)
  */
 struct tce_container {
 	struct mutex lock;
-	struct iommu_group *grp;
 	bool enabled;
 	unsigned long locked_pages;
 	bool v2;
+	struct iommu_table tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+	struct list_head group_list;
 };
 
 static long tce_unregister_pages(struct tce_container *container,
@@ -154,20 +160,20 @@  static bool tce_page_is_contained(struct page *page, unsigned page_shift)
 	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
 }
 
+static inline bool tce_groups_attached(struct tce_container *container)
+{
+	return !list_empty(&container->group_list);
+}
+
 static struct iommu_table *spapr_tce_find_table(
 		struct tce_container *container,
 		phys_addr_t ioba)
 {
 	long i;
 	struct iommu_table *ret = NULL;
-	struct iommu_table_group *table_group;
-
-	table_group = iommu_group_get_iommudata(container->grp);
-	if (!table_group)
-		return NULL;
 
 	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-		struct iommu_table *tbl = &table_group->tables[i];
+		struct iommu_table *tbl = &container->tables[i];
 		unsigned long entry = ioba >> tbl->it_page_shift;
 		unsigned long start = tbl->it_offset;
 		unsigned long end = start + tbl->it_size;
@@ -186,9 +192,7 @@  static int tce_iommu_enable(struct tce_container *container)
 	int ret = 0;
 	unsigned long locked;
 	struct iommu_table_group *table_group;
-
-	if (!container->grp)
-		return -ENXIO;
+	struct tce_iommu_group *tcegrp;
 
 	if (!current->mm)
 		return -ESRCH; /* process exited */
@@ -225,7 +229,12 @@  static int tce_iommu_enable(struct tce_container *container)
 	 * as there is no way to know how much we should increment
 	 * the locked_vm counter.
 	 */
-	table_group = iommu_group_get_iommudata(container->grp);
+	if (!tce_groups_attached(container))
+		return -ENODEV;
+
+	tcegrp = list_first_entry(&container->group_list,
+			struct tce_iommu_group, next);
+	table_group = iommu_group_get_iommudata(tcegrp->grp);
 	if (!table_group)
 		return -ENODEV;
 
@@ -257,6 +266,48 @@  static void tce_iommu_disable(struct tce_container *container)
 	decrement_locked_vm(container->locked_pages);
 }
 
+static long tce_iommu_create_table(struct iommu_table_group *table_group,
+			int num,
+			__u32 page_shift,
+			__u64 window_size,
+			__u32 levels,
+			struct iommu_table *tbl)
+{
+	long ret, table_size;
+
+	table_size = table_group->ops->get_table_size(page_shift, window_size,
+			levels);
+	if (!table_size)
+		return -EINVAL;
+
+	ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
+	if (ret)
+		return ret;
+
+	ret = table_group->ops->create_table(table_group, num,
+			page_shift, window_size, levels, tbl);
+
+	WARN_ON(!ret && !tbl->it_ops->free);
+	WARN_ON(!ret && (tbl->it_allocated_size != table_size));
+
+	if (ret)
+		decrement_locked_vm(table_size >> PAGE_SHIFT);
+
+	return ret;
+}
+
+static void tce_iommu_free_table(struct iommu_table *tbl)
+{
+	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
+
+	if (!tbl->it_size)
+		return;
+
+	tbl->it_ops->free(tbl);
+	decrement_locked_vm(pages);
+	memset(tbl, 0, sizeof(*tbl));
+}
+
 static void *tce_iommu_open(unsigned long arg)
 {
 	struct tce_container *container;
@@ -271,19 +322,41 @@  static void *tce_iommu_open(unsigned long arg)
 		return ERR_PTR(-ENOMEM);
 
 	mutex_init(&container->lock);
+	INIT_LIST_HEAD_RCU(&container->group_list);
+
 	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 
 	return container;
 }
 
+static int tce_iommu_clear(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages);
+
 static void tce_iommu_release(void *iommu_data)
 {
 	struct tce_container *container = iommu_data;
+	struct iommu_table_group *table_group;
+	struct tce_iommu_group *tcegrp;
+	long i;
 
-	WARN_ON(container->grp);
+	while (tce_groups_attached(container)) {
+		tcegrp = list_first_entry(&container->group_list,
+				struct tce_iommu_group, next);
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+		tce_iommu_detach_group(iommu_data, tcegrp->grp);
+	}
 
-	if (container->grp)
-		tce_iommu_detach_group(iommu_data, container->grp);
+	/*
+	 * If VFIO created a table, it was not disposed
+	 * by tce_iommu_detach_group() so do it now.
+	 */
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = &container->tables[i];
+
+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+		tce_iommu_free_table(tbl);
+	}
 
 	tce_iommu_disable(container);
 	mutex_destroy(&container->lock);
@@ -509,12 +582,15 @@  static long tce_iommu_ioctl(void *iommu_data,
 
 	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 		struct vfio_iommu_spapr_tce_info info;
+		struct tce_iommu_group *tcegrp;
 		struct iommu_table_group *table_group;
 
-		if (WARN_ON(!container->grp))
+		if (!tce_groups_attached(container))
 			return -ENXIO;
 
-		table_group = iommu_group_get_iommudata(container->grp);
+		tcegrp = list_first_entry(&container->group_list,
+				struct tce_iommu_group, next);
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
 
 		if (!table_group)
 			return -ENXIO;
@@ -707,12 +783,20 @@  static long tce_iommu_ioctl(void *iommu_data,
 		tce_iommu_disable(container);
 		mutex_unlock(&container->lock);
 		return 0;
-	case VFIO_EEH_PE_OP:
-		if (!container->grp)
-			return -ENODEV;
 
-		return vfio_spapr_iommu_eeh_ioctl(container->grp,
-						  cmd, arg);
+	case VFIO_EEH_PE_OP: {
+		struct tce_iommu_group *tcegrp;
+
+		ret = 0;
+		list_for_each_entry(tcegrp, &container->group_list, next) {
+			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
+					cmd, arg);
+			if (ret)
+				return ret;
+		}
+		return ret;
+	}
+
 	}
 
 	return -ENOTTY;
@@ -724,11 +808,14 @@  static void tce_iommu_release_ownership(struct tce_container *container,
 	int i;
 
 	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-		struct iommu_table *tbl = &table_group->tables[i];
+		struct iommu_table *tbl = &container->tables[i];
 
 		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 		if (tbl->it_map)
 			iommu_release_ownership(tbl);
+
+		/* Reset the container's copy of the table descriptor */
+		memset(tbl, 0, sizeof(*tbl));
 	}
 }
 
@@ -758,38 +845,56 @@  static int tce_iommu_take_ownership(struct iommu_table_group *table_group)
 static int tce_iommu_attach_group(void *iommu_data,
 		struct iommu_group *iommu_group)
 {
-	int ret;
+	int ret, i;
 	struct tce_container *container = iommu_data;
 	struct iommu_table_group *table_group;
+	struct tce_iommu_group *tcegrp = NULL;
+	bool first_group = !tce_groups_attached(container);
 
 	mutex_lock(&container->lock);
 
 	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
 			iommu_group_id(iommu_group), iommu_group); */
-	if (container->grp) {
-		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
-				iommu_group_id(container->grp),
-				iommu_group_id(iommu_group));
-		ret = -EBUSY;
-		goto unlock_exit;
-	}
-
-	if (container->enabled) {
-		pr_err("tce_vfio: attaching group #%u to enabled container\n",
-				iommu_group_id(iommu_group));
-		ret = -EBUSY;
-		goto unlock_exit;
-	}
-
 	table_group = iommu_group_get_iommudata(iommu_group);
-	if (!table_group) {
-		ret = -ENXIO;
+
+	if (!first_group && (!table_group->ops ||
+			!table_group->ops->take_ownership ||
+			!table_group->ops->release_ownership)) {
+		ret = -EBUSY;
+		goto unlock_exit;
+	}
+
+	/* Check if new group has the same iommu_ops (i.e. compatible) */
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		struct iommu_table_group *table_group_tmp;
+
+		if (tcegrp->grp == iommu_group) {
+			pr_warn("tce_vfio: Group %d is already attached\n",
+					iommu_group_id(iommu_group));
+			ret = -EBUSY;
+			goto unlock_exit;
+		}
+		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
+		if (table_group_tmp->ops != table_group->ops) {
+			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
+					iommu_group_id(iommu_group),
+					iommu_group_id(tcegrp->grp));
+			ret = -EPERM;
+			goto unlock_exit;
+		}
+	}
+
+	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
+	if (!tcegrp) {
+		ret = -ENOMEM;
 		goto unlock_exit;
 	}
 
 	if (!table_group->ops || !table_group->ops->take_ownership ||
 			!table_group->ops->release_ownership) {
 		ret = tce_iommu_take_ownership(table_group);
+		if (!ret)
+			container->tables[0] = table_group->tables[0];
 	} else if (!table_group->ops->create_table ||
 			!table_group->ops->set_window) {
 		WARN_ON_ONCE(1);
@@ -801,23 +906,46 @@  static int tce_iommu_attach_group(void *iommu_data,
 		 * the pages that has been explicitly mapped into the iommu
 		 */
 		table_group->ops->take_ownership(table_group);
-		ret = table_group->ops->create_table(table_group,
-				0, /* window number */
-				IOMMU_PAGE_SHIFT_4K,
-				table_group->tce32_size,
-				1, /* default levels */
-				&table_group->tables[0]);
-		if (!ret)
-			ret = table_group->ops->set_window(table_group, 0,
-					&table_group->tables[0]);
+		/*
+		 * If it the first group attached, check if there is
+		 * a default DMA window and create one if none as
+		 * the userspace expects it to exist.
+		 */
+		if (first_group && !container->tables[0].it_size) {
+			ret = tce_iommu_create_table(table_group,
+					0, /* window number */
+					IOMMU_PAGE_SHIFT_4K,
+					table_group->tce32_size,
+					1, /* default levels */
+					&container->tables[0]);
+			if (ret)
+				goto unlock_exit;
+		}
+
+		/* Set all windows to the new group */
+		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+			struct iommu_table *tbl = &container->tables[i];
+
+			if (!tbl->it_size)
+				continue;
+
+			/* Set the default window to a new group */
+			ret = table_group->ops->set_window(table_group, i, tbl);
+			if (ret)
+				break;
+		}
 	}
 
 	if (ret)
 		goto unlock_exit;
 
-	container->grp = iommu_group;
+	tcegrp->grp = iommu_group;
+	list_add(&tcegrp->next, &container->group_list);
 
 unlock_exit:
+	if (ret && tcegrp)
+		kfree(tcegrp);
+
 	mutex_unlock(&container->lock);
 
 	return ret;
@@ -828,25 +956,27 @@  static void tce_iommu_detach_group(void *iommu_data,
 {
 	struct tce_container *container = iommu_data;
 	struct iommu_table_group *table_group;
+	struct tce_iommu_group *tcegrp;
 	long i;
+	bool found = false;
 
 	mutex_lock(&container->lock);
-	if (iommu_group != container->grp) {
-		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
-				iommu_group_id(iommu_group),
-				iommu_group_id(container->grp));
+
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		if (tcegrp->grp == iommu_group) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		pr_warn("tce_vfio: detaching unattached group #%u\n",
+				iommu_group_id(iommu_group));
 		goto unlock_exit;
 	}
 
-	if (container->enabled) {
-		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-				iommu_group_id(container->grp));
-		tce_iommu_disable(container);
-	}
-
-	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
-	   iommu_group_id(iommu_group), iommu_group); */
-	container->grp = NULL;
+	list_del(&tcegrp->next);
+	kfree(tcegrp);
 
 	table_group = iommu_group_get_iommudata(iommu_group);
 	BUG_ON(!table_group);
@@ -857,18 +987,8 @@  static void tce_iommu_detach_group(void *iommu_data,
 	else if (!table_group->ops->unset_window)
 		WARN_ON_ONCE(1);
 	else {
-		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-			struct iommu_table tbl = table_group->tables[i];
-
-			if (!tbl.it_size)
-				continue;
-
+		for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
 			table_group->ops->unset_window(table_group, i);
-			tce_iommu_clear(container, &tbl,
-					tbl.it_offset, tbl.it_size);
-			if (tbl.it_ops->free)
-				tbl.it_ops->free(&tbl);
-		}
 
 		table_group->ops->release_ownership(table_group);
 	}