Patchwork [1/2] vfio powerpc: enabled on powernv platform

login
register
mail settings
Submitter Alexey Kardashevskiy
Date Dec. 3, 2012, 2:52 a.m.
Message ID <1354503178-2602-2-git-send-email-aik@ozlabs.ru>
Download mbox | patch
Permalink /patch/203262/
State Superseded
Headers show

Comments

Alexey Kardashevskiy - Dec. 3, 2012, 2:52 a.m.
This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h     |    9 ++
 arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
 drivers/iommu/Kconfig                |    8 ++
 4 files changed, 338 insertions(+)
Alex Williamson - Dec. 3, 2012, 5:35 p.m.
On Mon, 2012-12-03 at 13:52 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h     |    9 ++
>  arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
>  drivers/iommu/Kconfig                |    8 ++
>  4 files changed, 338 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..5c7087a 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>  	struct iommu_pool large_pool;
>  	struct iommu_pool pools[IOMMU_NR_POOLS];
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +	struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..2738aa4 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
>  #include <asm/kdump.h>
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
> +#include <asm/tce.h>
>  
>  #define DBG(...)
>  
> @@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>  		free_pages((unsigned long)vaddr, get_order(size));
>  	}
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +
> +/*
> + * Returns the number of used IOMMU pages (4K) within
> + * the same system page (4K or 64K).
> + * bitmap_weight is not used as it does not support bigendian maps.
> + */
> +static int syspage_weight(unsigned long *map, unsigned long entry)
> +{
> +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
> +
> +	/* Aligns TCE entry number to system page boundary */
> +	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> +
> +	/* Count used 4K pages */
> +	while (nbits--)
> +		ret += (test_bit(entry++, map) == 0) ? 0 : 1;
> +
> +	return ret;
> +}
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> +	/* Flush/invalidate TLB caches if necessary */
> +	if (ppc_md.tce_flush)
> +		ppc_md.tce_flush(tbl);
> +
> +	/* Make sure updates are seen by hardware */
> +	mb();
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number of system pages
> + * which it called put_page() on
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int i, retpages = 0;
> +	unsigned long oldtce, oldweight;
> +	struct page *page;
> +
> +	for (i = 0; i < pages; ++i) {
> +		oldtce = ppc_md.tce_get(tbl, entry + i);
> +		ppc_md.tce_free(tbl, entry + i, 1);
> +
> +		oldweight = syspage_weight(tbl->it_map, entry);
> +		__clear_bit(entry - tbl->it_offset, tbl->it_map);
> +
> +		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> +			continue;

Could this happen earlier, above syspage_weight() and __clear_bit()?

> +
> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> +		WARN_ON(!page);
> +		if (!page)
> +			continue;
> +
> +		if (oldtce & TCE_PCI_WRITE)
> +			SetPageDirty(page);
> +
> +		put_page(page);
> +
> +		/* That was the last IOMMU page within the system page */
> +		if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
> +			++retpages;

If you used __test_and_clear_bit() above I think you could avoid this
2nd call to syspage_weight.  A minor optimization though.

> +	}
> +
> +	return retpages;
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number
> + / of released system pages
> + */

Something bad happened to your comments here.

> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +		unsigned long pages)
> +{
> +	int ret;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	spin_lock(&(pool->lock));
> +	ret = clear_tces_nolock(tbl, entry, pages);
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction)
> +{
> +	int ret;
> +	struct page *page = NULL;
> +	unsigned long kva, offset, oldweight;
> +
> +	/* Map new TCE */
> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);

Maybe the compiler will figure this out, but isn't this the same as tce
& (IOMMU_PAGE_MASK & PAGE_MASK)?

> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +			direction != DMA_TO_DEVICE, &page);
> +	if (ret < 1) {

Probably (ret != 1) here or else we never get to your >1 case below.

> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, ret);

Use pr_err

> +		if (!ret || (ret > 1))

Then (ret >= 0) here.  Or return (ret >= 0) ? -EFAULT : ret

> +			ret = -EFAULT;
> +		return ret;
> +	}

You're missing the code from x86 that handles mapping mmap'd ranges.
This is intended to allow peer-to-peer DMA between devices.  Is that
intentional?

> +
> +	kva = (unsigned long) page_address(page);
> +	kva += offset;
> +
> +	/* tce_build receives a virtual address */
> +	entry += tbl->it_offset; /* Offset into real TCE table */

Here's what makes me call the entry "relative" rather than zero-based.
The iova is relative to the start of dma32_window_start, ie. if the
window starts at bus address 512MB and I want to create a translation at
bus address 512MB, I pass in an iova of 0, right?  The above adds the
window offset.  So you've removed dma64 window, but we really need to
define iova better.

> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +	/* tce_build() only returns non-zero for transient errors */
> +	if (unlikely(ret)) {
> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);

Use pr_err

> +		put_page(page);
> +		return -EIO;
> +	}
> +
> +	/* Calculate if new system page has been locked */
> +	oldweight = syspage_weight(tbl->it_map, entry);
> +	__set_bit(entry - tbl->it_offset, tbl->it_map);
> +
> +	return (oldweight == 0) ? 1 : 0;
> +}
> +
> +/*
> + * iommu_put_tces builds tces and returned the number of actually
> + * locked system pages
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +		uint64_t tce, enum dma_data_direction direction,
> +		unsigned long pages)
> +{
> +	int i, ret = 0, retpages = 0;
> +	struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +	BUG_ON(direction == DMA_NONE);

This doesn't seem BUG worthy, -EINVAL?  We can't assume tce_iommu_ioctl
will always be the only caller of this function.

> +
> +	spin_lock(&(pool->lock));
> +
> +	/* Check if any is in use */
> +	for (i = 0; i < pages; ++i) {
> +		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
> +		if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
> +				test_bit(entry + i, tbl->it_map)) {
> +			WARN_ON(test_bit(entry + i, tbl->it_map));

The WARN_ON seems to confirm that these are redundant tests, does that
imply we don't trust it_map?  It would be a lot faster if we could rely
on it_map exclusively here.

> +			spin_unlock(&(pool->lock));
> +			return -EBUSY;
> +		}
> +	}
> +
> +	/* Put tces to the table */
> +	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
> +		ret = put_tce(tbl, entry + i, tce, direction);
> +		if (ret == 1)
> +			++retpages;
> +	}
> +
> +	/*
> +	 * If failed, release locked pages, otherwise return the number
> +	 * of locked system pages
> +	 */
> +	if (ret < 0)
> +		clear_tces_nolock(tbl, entry, i);
> +	else
> +		ret = retpages;
> +
> +	tce_flush(tbl);
> +	spin_unlock(&(pool->lock));
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..21250ef 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
>  #include <linux/irq.h>
>  #include <linux/io.h>
>  #include <linux/msi.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
>  	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>  #endif
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> +	struct iommu_table *tbl;
> +	int ret = 0;
> +
> +	if (WARN_ON(dev->iommu_group)) {
> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> +				dev_name(dev),
> +				iommu_group_id(dev->iommu_group));

Use pr_warn

> +		return -EBUSY;
> +	}
> +
> +	tbl = get_iommu_table_base(dev);
> +	if (!tbl) {
> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> +				dev_name(dev));
> +		return 0;
> +	}
> +
> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> +			dev_name(dev), iommu_group_id(tbl->it_group));
> +
> +	ret = iommu_group_add_device(tbl->it_group, dev);
> +	if (ret < 0)
> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> +				dev_name(dev), ret);

Use pr_err

> +
> +	return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +	iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +			      unsigned long action, void *data)
> +{
> +	struct device *dev = data;
> +
> +	switch (action) {
> +	case BUS_NOTIFY_ADD_DEVICE:
> +		return add_device(dev);
> +	case BUS_NOTIFY_DEL_DEVICE:
> +		del_device(dev);
> +		return 0;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +	.notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> +	struct iommu_table *tbl = iommu_data;
> +	tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp;
> +
> +	/* Allocate and initialize IOMMU groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +
> +		/* Skip already initialized */
> +		if (tbl->it_group)
> +			continue;
> +
> +		grp = iommu_group_alloc();
> +		if (IS_ERR(grp)) {
> +			printk(KERN_INFO "tce_vfio: cannot create "
> +					"new IOMMU group, ret=%ld\n",
> +					PTR_ERR(grp));

Use pr_info

> +			return PTR_ERR(grp);
> +		}
> +		tbl->it_group = grp;
> +		iommu_group_set_iommudata(grp, tbl, group_release);
> +	}
> +
> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Add PCI devices to VFIO groups */
> +	for_each_pci_dev(pdev)
> +		add_device(&pdev->dev);
> +
> +	return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +	struct pci_dev *pdev = NULL;
> +	struct iommu_table *tbl;
> +	struct iommu_group *grp = NULL;
> +
> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +	/* Delete PCI devices from VFIO groups */
> +	for_each_pci_dev(pdev)
> +		del_device(&pdev->dev);
> +
> +	/* Release VFIO groups */
> +	for_each_pci_dev(pdev) {
> +		tbl = get_iommu_table_base(&pdev->dev);
> +		if (!tbl)
> +			continue;
> +		grp = tbl->it_group;
> +
> +		/* Skip (already) uninitialized */
> +		if (!grp)
> +			continue;
> +
> +		/* Do actual release, group_release() is expected to work */
> +		iommu_group_put(grp);
> +		BUG_ON(tbl->it_group);
> +	}
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>  
>  	  Say N unless you need kernel log message for IOMMU debugging
>  
> +config SPAPR_TCE_IOMMU
> +	bool "sPAPR TCE IOMMU Support"
> +	depends on PPC_POWERNV
> +	select IOMMU_API
> +	help
> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
> +	  still not implemented.
> +
>  endif # IOMMU_SUPPORT

Thanks,

Alex
Alexey Kardashevskiy - Dec. 4, 2012, 8:12 a.m.
On 04/12/12 04:35, Alex Williamson wrote:
> On Mon, 2012-12-03 at 13:52 +1100, Alexey Kardashevskiy wrote:
>> This patch initializes IOMMU groups based on the IOMMU
>> configuration discovered during the PCI scan on POWERNV
>> (POWER non virtualized) platform. The IOMMU groups are
>> to be used later by VFIO driver (PCI pass through).
>>
>> It also implements an API for mapping/unmapping pages for
>> guest PCI drivers and providing DMA window properties.
>> This API is going to be used later by QEMU-VFIO to handle
>> h_put_tce hypercalls from the KVM guest.
>>
>> Although this driver has been tested only on the POWERNV
>> platform, it should work on any platform which supports
>> TCE tables.
>>
>> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
>> option and configure VFIO as required.
>>
>> Cc: David Gibson <david@gibson.dropbear.id.au>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   arch/powerpc/include/asm/iommu.h     |    9 ++
>>   arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
>>   arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
>>   drivers/iommu/Kconfig                |    8 ++
>>   4 files changed, 338 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index cbfe678..5c7087a 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -76,6 +76,9 @@ struct iommu_table {
>>   	struct iommu_pool large_pool;
>>   	struct iommu_pool pools[IOMMU_NR_POOLS];
>>   	unsigned long *it_map;       /* A simple allocation bitmap for now */
>> +#ifdef CONFIG_IOMMU_API
>> +	struct iommu_group *it_group;
>> +#endif
>>   };
>>
>>   struct scatterlist;
>> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
>>   }
>>   #endif
>>
>> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
>> +		unsigned long pages);
>> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
>> +		uint64_t tce, enum dma_data_direction direction,
>> +		unsigned long pages);
>> +
>>   #endif /* __KERNEL__ */
>>   #endif /* _ASM_IOMMU_H */
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index ff5a6ce..2738aa4 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -44,6 +44,7 @@
>>   #include <asm/kdump.h>
>>   #include <asm/fadump.h>
>>   #include <asm/vio.h>
>> +#include <asm/tce.h>
>>
>>   #define DBG(...)
>>
>> @@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>>   		free_pages((unsigned long)vaddr, get_order(size));
>>   	}
>>   }
>> +
>> +#ifdef CONFIG_IOMMU_API
>> +/*
>> + * SPAPR TCE API
>> + */
>> +
>> +/*
>> + * Returns the number of used IOMMU pages (4K) within
>> + * the same system page (4K or 64K).
>> + * bitmap_weight is not used as it does not support bigendian maps.
>> + */
>> +static int syspage_weight(unsigned long *map, unsigned long entry)
>> +{
>> +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
>> +
>> +	/* Aligns TCE entry number to system page boundary */
>> +	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
>> +
>> +	/* Count used 4K pages */
>> +	while (nbits--)
>> +		ret += (test_bit(entry++, map) == 0) ? 0 : 1;
>> +
>> +	return ret;
>> +}
>> +
>> +static void tce_flush(struct iommu_table *tbl)
>> +{
>> +	/* Flush/invalidate TLB caches if necessary */
>> +	if (ppc_md.tce_flush)
>> +		ppc_md.tce_flush(tbl);
>> +
>> +	/* Make sure updates are seen by hardware */
>> +	mb();
>> +}
>> +
>> +/*
>> + * iommu_clear_tces clears tces and returned the number of system pages
>> + * which it called put_page() on
>> + */
>> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
>> +		unsigned long pages)
>> +{
>> +	int i, retpages = 0;
>> +	unsigned long oldtce, oldweight;
>> +	struct page *page;
>> +
>> +	for (i = 0; i < pages; ++i) {
>> +		oldtce = ppc_md.tce_get(tbl, entry + i);
>> +		ppc_md.tce_free(tbl, entry + i, 1);
>> +
>> +		oldweight = syspage_weight(tbl->it_map, entry);
>> +		__clear_bit(entry - tbl->it_offset, tbl->it_map);
>> +
>> +		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
>> +			continue;
>
> Could this happen earlier, above syspage_weight() and __clear_bit()?


Want to clear it anyway if it is not cleared by some reason. Added WARN_ON.


>> +
>> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
>> +
>> +		WARN_ON(!page);
>> +		if (!page)
>> +			continue;
>> +
>> +		if (oldtce & TCE_PCI_WRITE)
>> +			SetPageDirty(page);
>> +
>> +		put_page(page);
>> +
>> +		/* That was the last IOMMU page within the system page */
>> +		if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
>> +			++retpages;
>
> If you used __test_and_clear_bit() above I think you could avoid this
> 2nd call to syspage_weight.  A minor optimization though.
>
>> +	}
>> +
>> +	return retpages;
>> +}
>> +
>> +/*
>> + * iommu_clear_tces clears tces and returned the number
>> + / of released system pages
>> + */
>
> Something bad happened to your comments here.
>
>> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
>> +		unsigned long pages)
>> +{
>> +	int ret;
>> +	struct iommu_pool *pool = get_pool(tbl, entry);
>> +
>> +	spin_lock(&(pool->lock));
>> +	ret = clear_tces_nolock(tbl, entry, pages);
>> +	tce_flush(tbl);
>> +	spin_unlock(&(pool->lock));
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
>> +
>> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
>> +		uint64_t tce, enum dma_data_direction direction)
>> +{
>> +	int ret;
>> +	struct page *page = NULL;
>> +	unsigned long kva, offset, oldweight;
>> +
>> +	/* Map new TCE */
>> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
>
> Maybe the compiler will figure this out, but isn't this the same as tce
> & (IOMMU_PAGE_MASK & PAGE_MASK)?


it is rather (tce & (IOMMU_PAGE_MASK & ~PAGE_MASK)) but I cannot see how it 
is simpler and I doubt that it is faster enough to notice it anyhow :)


>> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
>> +			direction != DMA_TO_DEVICE, &page);
>> +	if (ret < 1) {
>
> Probably (ret != 1) here or else we never get to your >1 case below.
>
>> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
>> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
>
> Use pr_err
 >
>> +		if (!ret || (ret > 1))
>
> Then (ret >= 0) here.  Or return (ret >= 0) ? -EFAULT : ret
>
>> +			ret = -EFAULT;
>> +		return ret;
>> +	}
>
> You're missing the code from x86 that handles mapping mmap'd ranges.
> This is intended to allow peer-to-peer DMA between devices.  Is that
> intentional?

I am not following you here. What code exactly are talking about? We do not 
track ranges at all and I do not see how it helps with p2p dma.


>> +
>> +	kva = (unsigned long) page_address(page);
>> +	kva += offset;
>> +
>> +	/* tce_build receives a virtual address */
>> +	entry += tbl->it_offset; /* Offset into real TCE table */
>
> Here's what makes me call the entry "relative" rather than zero-based.

This is the bug actually, I overlooked it and I removed it now. Thanks for 
being so picky :)


> The iova is relative to the start of dma32_window_start, ie. if the
> window starts at bus address 512MB and I want to create a translation at
> bus address 512MB, I pass in an iova of 0, right?  The above adds the
> window offset.  So you've removed dma64 window, but we really need to
> define iova better.




>> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
>> +
>> +	/* tce_build() only returns non-zero for transient errors */
>> +	if (unlikely(ret)) {
>> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
>> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
>
> Use pr_err
>
>> +		put_page(page);
>> +		return -EIO;
>> +	}
>> +
>> +	/* Calculate if new system page has been locked */
>> +	oldweight = syspage_weight(tbl->it_map, entry);
>> +	__set_bit(entry - tbl->it_offset, tbl->it_map);
>> +
>> +	return (oldweight == 0) ? 1 : 0;
>> +}
>> +
>> +/*
>> + * iommu_put_tces builds tces and returned the number of actually
>> + * locked system pages
>> + */
>> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
>> +		uint64_t tce, enum dma_data_direction direction,
>> +		unsigned long pages)
>> +{
>> +	int i, ret = 0, retpages = 0;
>> +	struct iommu_pool *pool = get_pool(tbl, entry);
>> +
>> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
>> +	BUG_ON(direction == DMA_NONE);
>
> This doesn't seem BUG worthy, -EINVAL?  We can't assume tce_iommu_ioctl
> will always be the only caller of this function.


This is what other function does in this file.


>> +
>> +	spin_lock(&(pool->lock));
>> +
>> +	/* Check if any is in use */
>> +	for (i = 0; i < pages; ++i) {
>> +		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
>> +		if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
>> +				test_bit(entry + i, tbl->it_map)) {
>> +			WARN_ON(test_bit(entry + i, tbl->it_map));
>
> The WARN_ON seems to confirm that these are redundant tests, does that
> imply we don't trust it_map?  It would be a lot faster if we could rely
> on it_map exclusively here.


As for me, pretty minor optimization. I'm testing it now to see if I do not 
miss bits.



>> +			spin_unlock(&(pool->lock));
>> +			return -EBUSY;
>> +		}
>> +	}
>> +
>> +	/* Put tces to the table */
>> +	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
>> +		ret = put_tce(tbl, entry + i, tce, direction);
>> +		if (ret == 1)
>> +			++retpages;
>> +	}
>> +
>> +	/*
>> +	 * If failed, release locked pages, otherwise return the number
>> +	 * of locked system pages
>> +	 */
>> +	if (ret < 0)
>> +		clear_tces_nolock(tbl, entry, i);
>> +	else
>> +		ret = retpages;
>> +
>> +	tce_flush(tbl);
>> +	spin_unlock(&(pool->lock));
>> +
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(iommu_put_tces);
>> +#endif /* CONFIG_IOMMU_API */
>> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
>> index 05205cf..21250ef 100644
>> --- a/arch/powerpc/platforms/powernv/pci.c
>> +++ b/arch/powerpc/platforms/powernv/pci.c
>> @@ -20,6 +20,7 @@
>>   #include <linux/irq.h>
>>   #include <linux/io.h>
>>   #include <linux/msi.h>
>> +#include <linux/iommu.h>
>>
>>   #include <asm/sections.h>
>>   #include <asm/io.h>
>> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
>>   	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>>   #endif
>>   }
>> +
>> +#ifdef CONFIG_IOMMU_API
>> +/*
>> + * IOMMU groups support required by VFIO
>> + */
>> +static int add_device(struct device *dev)
>> +{
>> +	struct iommu_table *tbl;
>> +	int ret = 0;
>> +
>> +	if (WARN_ON(dev->iommu_group)) {
>> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
>> +				dev_name(dev),
>> +				iommu_group_id(dev->iommu_group));
>
> Use pr_warn
>
>> +		return -EBUSY;
>> +	}
>> +
>> +	tbl = get_iommu_table_base(dev);
>> +	if (!tbl) {
>> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
>> +				dev_name(dev));
>> +		return 0;
>> +	}
>> +
>> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
>> +			dev_name(dev), iommu_group_id(tbl->it_group));
>> +
>> +	ret = iommu_group_add_device(tbl->it_group, dev);
>> +	if (ret < 0)
>> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
>> +				dev_name(dev), ret);
>
> Use pr_err
>
>> +
>> +	return ret;
>> +}
>> +
>> +static void del_device(struct device *dev)
>> +{
>> +	iommu_group_remove_device(dev);
>> +}
>> +
>> +static int iommu_bus_notifier(struct notifier_block *nb,
>> +			      unsigned long action, void *data)
>> +{
>> +	struct device *dev = data;
>> +
>> +	switch (action) {
>> +	case BUS_NOTIFY_ADD_DEVICE:
>> +		return add_device(dev);
>> +	case BUS_NOTIFY_DEL_DEVICE:
>> +		del_device(dev);
>> +		return 0;
>> +	default:
>> +		return 0;
>> +	}
>> +}
>> +
>> +static struct notifier_block tce_iommu_bus_nb = {
>> +	.notifier_call = iommu_bus_notifier,
>> +};
>> +
>> +static void group_release(void *iommu_data)
>> +{
>> +	struct iommu_table *tbl = iommu_data;
>> +	tbl->it_group = NULL;
>> +}
>> +
>> +static int __init tce_iommu_init(void)
>> +{
>> +	struct pci_dev *pdev = NULL;
>> +	struct iommu_table *tbl;
>> +	struct iommu_group *grp;
>> +
>> +	/* Allocate and initialize IOMMU groups */
>> +	for_each_pci_dev(pdev) {
>> +		tbl = get_iommu_table_base(&pdev->dev);
>> +		if (!tbl)
>> +			continue;
>> +
>> +		/* Skip already initialized */
>> +		if (tbl->it_group)
>> +			continue;
>> +
>> +		grp = iommu_group_alloc();
>> +		if (IS_ERR(grp)) {
>> +			printk(KERN_INFO "tce_vfio: cannot create "
>> +					"new IOMMU group, ret=%ld\n",
>> +					PTR_ERR(grp));
>
> Use pr_info
>
>> +			return PTR_ERR(grp);
>> +		}
>> +		tbl->it_group = grp;
>> +		iommu_group_set_iommudata(grp, tbl, group_release);
>> +	}
>> +
>> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>> +
>> +	/* Add PCI devices to VFIO groups */
>> +	for_each_pci_dev(pdev)
>> +		add_device(&pdev->dev);
>> +
>> +	return 0;
>> +}
>> +
>> +static void __exit tce_iommu_cleanup(void)
>> +{
>> +	struct pci_dev *pdev = NULL;
>> +	struct iommu_table *tbl;
>> +	struct iommu_group *grp = NULL;
>> +
>> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
>> +
>> +	/* Delete PCI devices from VFIO groups */
>> +	for_each_pci_dev(pdev)
>> +		del_device(&pdev->dev);
>> +
>> +	/* Release VFIO groups */
>> +	for_each_pci_dev(pdev) {
>> +		tbl = get_iommu_table_base(&pdev->dev);
>> +		if (!tbl)
>> +			continue;
>> +		grp = tbl->it_group;
>> +
>> +		/* Skip (already) uninitialized */
>> +		if (!grp)
>> +			continue;
>> +
>> +		/* Do actual release, group_release() is expected to work */
>> +		iommu_group_put(grp);
>> +		BUG_ON(tbl->it_group);
>> +	}
>> +}
>> +
>> +module_init(tce_iommu_init);
>> +module_exit(tce_iommu_cleanup);
>> +#endif /* CONFIG_IOMMU_API */
>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>> index 9f69b56..29d11dc 100644
>> --- a/drivers/iommu/Kconfig
>> +++ b/drivers/iommu/Kconfig
>> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>>
>>   	  Say N unless you need kernel log message for IOMMU debugging
>>
>> +config SPAPR_TCE_IOMMU
>> +	bool "sPAPR TCE IOMMU Support"
>> +	depends on PPC_POWERNV
>> +	select IOMMU_API
>> +	help
>> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
>> +	  still not implemented.
>> +
>>   endif # IOMMU_SUPPORT
>
> Thanks,
>
> Alex
>
Alex Williamson - Dec. 4, 2012, 3:51 p.m.
On Tue, 2012-12-04 at 19:12 +1100, Alexey Kardashevskiy wrote:
> On 04/12/12 04:35, Alex Williamson wrote:
> > On Mon, 2012-12-03 at 13:52 +1100, Alexey Kardashevskiy wrote:
> >> This patch initializes IOMMU groups based on the IOMMU
> >> configuration discovered during the PCI scan on POWERNV
> >> (POWER non virtualized) platform. The IOMMU groups are
> >> to be used later by VFIO driver (PCI pass through).
> >>
> >> It also implements an API for mapping/unmapping pages for
> >> guest PCI drivers and providing DMA window properties.
> >> This API is going to be used later by QEMU-VFIO to handle
> >> h_put_tce hypercalls from the KVM guest.
> >>
> >> Although this driver has been tested only on the POWERNV
> >> platform, it should work on any platform which supports
> >> TCE tables.
> >>
> >> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> >> option and configure VFIO as required.
> >>
> >> Cc: David Gibson <david@gibson.dropbear.id.au>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>   arch/powerpc/include/asm/iommu.h     |    9 ++
> >>   arch/powerpc/kernel/iommu.c          |  186 ++++++++++++++++++++++++++++++++++
> >>   arch/powerpc/platforms/powernv/pci.c |  135 ++++++++++++++++++++++++
> >>   drivers/iommu/Kconfig                |    8 ++
> >>   4 files changed, 338 insertions(+)
> >>
> >> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> >> index cbfe678..5c7087a 100644
> >> --- a/arch/powerpc/include/asm/iommu.h
> >> +++ b/arch/powerpc/include/asm/iommu.h
> >> @@ -76,6 +76,9 @@ struct iommu_table {
> >>   	struct iommu_pool large_pool;
> >>   	struct iommu_pool pools[IOMMU_NR_POOLS];
> >>   	unsigned long *it_map;       /* A simple allocation bitmap for now */
> >> +#ifdef CONFIG_IOMMU_API
> >> +	struct iommu_group *it_group;
> >> +#endif
> >>   };
> >>
> >>   struct scatterlist;
> >> @@ -147,5 +150,11 @@ static inline void iommu_restore(void)
> >>   }
> >>   #endif
> >>
> >> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> >> +		unsigned long pages);
> >> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> >> +		uint64_t tce, enum dma_data_direction direction,
> >> +		unsigned long pages);
> >> +
> >>   #endif /* __KERNEL__ */
> >>   #endif /* _ASM_IOMMU_H */
> >> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >> index ff5a6ce..2738aa4 100644
> >> --- a/arch/powerpc/kernel/iommu.c
> >> +++ b/arch/powerpc/kernel/iommu.c
> >> @@ -44,6 +44,7 @@
> >>   #include <asm/kdump.h>
> >>   #include <asm/fadump.h>
> >>   #include <asm/vio.h>
> >> +#include <asm/tce.h>
> >>
> >>   #define DBG(...)
> >>
> >> @@ -856,3 +857,188 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
> >>   		free_pages((unsigned long)vaddr, get_order(size));
> >>   	}
> >>   }
> >> +
> >> +#ifdef CONFIG_IOMMU_API
> >> +/*
> >> + * SPAPR TCE API
> >> + */
> >> +
> >> +/*
> >> + * Returns the number of used IOMMU pages (4K) within
> >> + * the same system page (4K or 64K).
> >> + * bitmap_weight is not used as it does not support bigendian maps.
> >> + */
> >> +static int syspage_weight(unsigned long *map, unsigned long entry)
> >> +{
> >> +	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
> >> +
> >> +	/* Aligns TCE entry number to system page boundary */
> >> +	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> >> +
> >> +	/* Count used 4K pages */
> >> +	while (nbits--)
> >> +		ret += (test_bit(entry++, map) == 0) ? 0 : 1;
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static void tce_flush(struct iommu_table *tbl)
> >> +{
> >> +	/* Flush/invalidate TLB caches if necessary */
> >> +	if (ppc_md.tce_flush)
> >> +		ppc_md.tce_flush(tbl);
> >> +
> >> +	/* Make sure updates are seen by hardware */
> >> +	mb();
> >> +}
> >> +
> >> +/*
> >> + * iommu_clear_tces clears tces and returned the number of system pages
> >> + * which it called put_page() on
> >> + */
> >> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> >> +		unsigned long pages)
> >> +{
> >> +	int i, retpages = 0;
> >> +	unsigned long oldtce, oldweight;
> >> +	struct page *page;
> >> +
> >> +	for (i = 0; i < pages; ++i) {
> >> +		oldtce = ppc_md.tce_get(tbl, entry + i);
> >> +		ppc_md.tce_free(tbl, entry + i, 1);
> >> +
> >> +		oldweight = syspage_weight(tbl->it_map, entry);
> >> +		__clear_bit(entry - tbl->it_offset, tbl->it_map);
> >> +
> >> +		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
> >> +			continue;
> >
> > Could this happen earlier, above syspage_weight() and __clear_bit()?
> 
> 
> Want to clear it anyway if it is not cleared by some reason. Added WARN_ON.

The map shouldn't be set unless read/write is enabled, right?  It seems
like we don't have a lot of trust in this bitmap.

> >> +
> >> +		page = pfn_to_page(oldtce >> PAGE_SHIFT);
> >> +
> >> +		WARN_ON(!page);
> >> +		if (!page)
> >> +			continue;
> >> +
> >> +		if (oldtce & TCE_PCI_WRITE)
> >> +			SetPageDirty(page);
> >> +
> >> +		put_page(page);
> >> +
> >> +		/* That was the last IOMMU page within the system page */
> >> +		if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
> >> +			++retpages;
> >
> > If you used __test_and_clear_bit() above I think you could avoid this
> > 2nd call to syspage_weight.  A minor optimization though.
> >
> >> +	}
> >> +
> >> +	return retpages;
> >> +}
> >> +
> >> +/*
> >> + * iommu_clear_tces clears tces and returned the number
> >> + / of released system pages
> >> + */
> >
> > Something bad happened to your comments here.
> >
> >> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> >> +		unsigned long pages)
> >> +{
> >> +	int ret;
> >> +	struct iommu_pool *pool = get_pool(tbl, entry);
> >> +
> >> +	spin_lock(&(pool->lock));
> >> +	ret = clear_tces_nolock(tbl, entry, pages);
> >> +	tce_flush(tbl);
> >> +	spin_unlock(&(pool->lock));
> >> +
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> >> +
> >> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> >> +		uint64_t tce, enum dma_data_direction direction)
> >> +{
> >> +	int ret;
> >> +	struct page *page = NULL;
> >> +	unsigned long kva, offset, oldweight;
> >> +
> >> +	/* Map new TCE */
> >> +	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> >
> > Maybe the compiler will figure this out, but isn't this the same as tce
> > & (IOMMU_PAGE_MASK & PAGE_MASK)?
> 
> 
> it is rather (tce & (IOMMU_PAGE_MASK & ~PAGE_MASK)) but I cannot see how it 
> is simpler and I doubt that it is faster enough to notice it anyhow :)

Yes, ~PAGE_MASK.  IMHO, it's more intuitive.

> >> +	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> >> +			direction != DMA_TO_DEVICE, &page);
> >> +	if (ret < 1) {
> >
> > Probably (ret != 1) here or else we never get to your >1 case below.
> >
> >> +		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> >> +				tce, entry << IOMMU_PAGE_SHIFT, ret);
> >
> > Use pr_err
>  >
> >> +		if (!ret || (ret > 1))
> >
> > Then (ret >= 0) here.  Or return (ret >= 0) ? -EFAULT : ret
> >
> >> +			ret = -EFAULT;
> >> +		return ret;
> >> +	}
> >
> > You're missing the code from x86 that handles mapping mmap'd ranges.
> > This is intended to allow peer-to-peer DMA between devices.  Is that
> > intentional?
> 
> I am not following you here. What code exactly are talking about? We do not 
> track ranges at all and I do not see how it helps with p2p dma.

The code in type1 that checks PFNMAP and reserved pages that I only
sometimes remember how it works ;)  The idea there is to allow p2p dma
by inserting iommu translations for non-page backed memory, ie. the
mmap'd BARs of other devices.  It may be that the POWER topology is not
amenable to this since you have a whole PCI bus in your group and
intra-group p2p isn't iommu translated.  I'm not sure how useful it is
even on x86, but KVM device assignment does it, so I added it to type1.

> >> +
> >> +	kva = (unsigned long) page_address(page);
> >> +	kva += offset;
> >> +
> >> +	/* tce_build receives a virtual address */
> >> +	entry += tbl->it_offset; /* Offset into real TCE table */
> >
> > Here's what makes me call the entry "relative" rather than zero-based.
> 
> This is the bug actually, I overlooked it and I removed it now. Thanks for 
> being so picky :)

Ah, ok.  I'll look for it on the next round and make sure I understand
it.

> > The iova is relative to the start of dma32_window_start, ie. if the
> > window starts at bus address 512MB and I want to create a translation at
> > bus address 512MB, I pass in an iova of 0, right?  The above adds the
> > window offset.  So you've removed dma64 window, but we really need to
> > define iova better.
> 
> 
> 
> 
> >> +	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> >> +
> >> +	/* tce_build() only returns non-zero for transient errors */
> >> +	if (unlikely(ret)) {
> >> +		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> >> +				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> >
> > Use pr_err
> >
> >> +		put_page(page);
> >> +		return -EIO;
> >> +	}
> >> +
> >> +	/* Calculate if new system page has been locked */
> >> +	oldweight = syspage_weight(tbl->it_map, entry);
> >> +	__set_bit(entry - tbl->it_offset, tbl->it_map);
> >> +
> >> +	return (oldweight == 0) ? 1 : 0;
> >> +}
> >> +
> >> +/*
> >> + * iommu_put_tces builds tces and returned the number of actually
> >> + * locked system pages
> >> + */
> >> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> >> +		uint64_t tce, enum dma_data_direction direction,
> >> +		unsigned long pages)
> >> +{
> >> +	int i, ret = 0, retpages = 0;
> >> +	struct iommu_pool *pool = get_pool(tbl, entry);
> >> +
> >> +	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> >> +	BUG_ON(direction == DMA_NONE);
> >
> > This doesn't seem BUG worthy, -EINVAL?  We can't assume tce_iommu_ioctl
> > will always be the only caller of this function.
> 
> 
> This is what other function does in this file.

Blech, ok.

> >> +
> >> +	spin_lock(&(pool->lock));
> >> +
> >> +	/* Check if any is in use */
> >> +	for (i = 0; i < pages; ++i) {
> >> +		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
> >> +		if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
> >> +				test_bit(entry + i, tbl->it_map)) {
> >> +			WARN_ON(test_bit(entry + i, tbl->it_map));
> >
> > The WARN_ON seems to confirm that these are redundant tests, does that
> > imply we don't trust it_map?  It would be a lot faster if we could rely
> > on it_map exclusively here.
> 
> 
> As for me, pretty minor optimization. I'm testing it now to see if I do not 
> miss bits.

It would be a lot more re-assuring if we didn't need it ;)  Thanks,

Alex

> >> +			spin_unlock(&(pool->lock));
> >> +			return -EBUSY;
> >> +		}
> >> +	}
> >> +
> >> +	/* Put tces to the table */
> >> +	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
> >> +		ret = put_tce(tbl, entry + i, tce, direction);
> >> +		if (ret == 1)
> >> +			++retpages;
> >> +	}
> >> +
> >> +	/*
> >> +	 * If failed, release locked pages, otherwise return the number
> >> +	 * of locked system pages
> >> +	 */
> >> +	if (ret < 0)
> >> +		clear_tces_nolock(tbl, entry, i);
> >> +	else
> >> +		ret = retpages;
> >> +
> >> +	tce_flush(tbl);
> >> +	spin_unlock(&(pool->lock));
> >> +
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> >> +#endif /* CONFIG_IOMMU_API */
> >> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> >> index 05205cf..21250ef 100644
> >> --- a/arch/powerpc/platforms/powernv/pci.c
> >> +++ b/arch/powerpc/platforms/powernv/pci.c
> >> @@ -20,6 +20,7 @@
> >>   #include <linux/irq.h>
> >>   #include <linux/io.h>
> >>   #include <linux/msi.h>
> >> +#include <linux/iommu.h>
> >>
> >>   #include <asm/sections.h>
> >>   #include <asm/io.h>
> >> @@ -613,3 +614,137 @@ void __init pnv_pci_init(void)
> >>   	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
> >>   #endif
> >>   }
> >> +
> >> +#ifdef CONFIG_IOMMU_API
> >> +/*
> >> + * IOMMU groups support required by VFIO
> >> + */
> >> +static int add_device(struct device *dev)
> >> +{
> >> +	struct iommu_table *tbl;
> >> +	int ret = 0;
> >> +
> >> +	if (WARN_ON(dev->iommu_group)) {
> >> +		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
> >> +				dev_name(dev),
> >> +				iommu_group_id(dev->iommu_group));
> >
> > Use pr_warn
> >
> >> +		return -EBUSY;
> >> +	}
> >> +
> >> +	tbl = get_iommu_table_base(dev);
> >> +	if (!tbl) {
> >> +		pr_debug("tce_vfio: skipping device %s with no tbl\n",
> >> +				dev_name(dev));
> >> +		return 0;
> >> +	}
> >> +
> >> +	pr_debug("tce_vfio: adding %s to iommu group %d\n",
> >> +			dev_name(dev), iommu_group_id(tbl->it_group));
> >> +
> >> +	ret = iommu_group_add_device(tbl->it_group, dev);
> >> +	if (ret < 0)
> >> +		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
> >> +				dev_name(dev), ret);
> >
> > Use pr_err
> >
> >> +
> >> +	return ret;
> >> +}
> >> +
> >> +static void del_device(struct device *dev)
> >> +{
> >> +	iommu_group_remove_device(dev);
> >> +}
> >> +
> >> +static int iommu_bus_notifier(struct notifier_block *nb,
> >> +			      unsigned long action, void *data)
> >> +{
> >> +	struct device *dev = data;
> >> +
> >> +	switch (action) {
> >> +	case BUS_NOTIFY_ADD_DEVICE:
> >> +		return add_device(dev);
> >> +	case BUS_NOTIFY_DEL_DEVICE:
> >> +		del_device(dev);
> >> +		return 0;
> >> +	default:
> >> +		return 0;
> >> +	}
> >> +}
> >> +
> >> +static struct notifier_block tce_iommu_bus_nb = {
> >> +	.notifier_call = iommu_bus_notifier,
> >> +};
> >> +
> >> +static void group_release(void *iommu_data)
> >> +{
> >> +	struct iommu_table *tbl = iommu_data;
> >> +	tbl->it_group = NULL;
> >> +}
> >> +
> >> +static int __init tce_iommu_init(void)
> >> +{
> >> +	struct pci_dev *pdev = NULL;
> >> +	struct iommu_table *tbl;
> >> +	struct iommu_group *grp;
> >> +
> >> +	/* Allocate and initialize IOMMU groups */
> >> +	for_each_pci_dev(pdev) {
> >> +		tbl = get_iommu_table_base(&pdev->dev);
> >> +		if (!tbl)
> >> +			continue;
> >> +
> >> +		/* Skip already initialized */
> >> +		if (tbl->it_group)
> >> +			continue;
> >> +
> >> +		grp = iommu_group_alloc();
> >> +		if (IS_ERR(grp)) {
> >> +			printk(KERN_INFO "tce_vfio: cannot create "
> >> +					"new IOMMU group, ret=%ld\n",
> >> +					PTR_ERR(grp));
> >
> > Use pr_info
> >
> >> +			return PTR_ERR(grp);
> >> +		}
> >> +		tbl->it_group = grp;
> >> +		iommu_group_set_iommudata(grp, tbl, group_release);
> >> +	}
> >> +
> >> +	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >> +
> >> +	/* Add PCI devices to VFIO groups */
> >> +	for_each_pci_dev(pdev)
> >> +		add_device(&pdev->dev);
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static void __exit tce_iommu_cleanup(void)
> >> +{
> >> +	struct pci_dev *pdev = NULL;
> >> +	struct iommu_table *tbl;
> >> +	struct iommu_group *grp = NULL;
> >> +
> >> +	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> >> +
> >> +	/* Delete PCI devices from VFIO groups */
> >> +	for_each_pci_dev(pdev)
> >> +		del_device(&pdev->dev);
> >> +
> >> +	/* Release VFIO groups */
> >> +	for_each_pci_dev(pdev) {
> >> +		tbl = get_iommu_table_base(&pdev->dev);
> >> +		if (!tbl)
> >> +			continue;
> >> +		grp = tbl->it_group;
> >> +
> >> +		/* Skip (already) uninitialized */
> >> +		if (!grp)
> >> +			continue;
> >> +
> >> +		/* Do actual release, group_release() is expected to work */
> >> +		iommu_group_put(grp);
> >> +		BUG_ON(tbl->it_group);
> >> +	}
> >> +}
> >> +
> >> +module_init(tce_iommu_init);
> >> +module_exit(tce_iommu_cleanup);
> >> +#endif /* CONFIG_IOMMU_API */
> >> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> >> index 9f69b56..29d11dc 100644
> >> --- a/drivers/iommu/Kconfig
> >> +++ b/drivers/iommu/Kconfig
> >> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
> >>
> >>   	  Say N unless you need kernel log message for IOMMU debugging
> >>
> >> +config SPAPR_TCE_IOMMU
> >> +	bool "sPAPR TCE IOMMU Support"
> >> +	depends on PPC_POWERNV
> >> +	select IOMMU_API
> >> +	help
> >> +	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
> >> +	  still not implemented.
> >> +
> >>   endif # IOMMU_SUPPORT
> >
> > Thanks,
> >
> > Alex
> >
> 
>

Patch

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@  struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+	struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,11 @@  static inline void iommu_restore(void)
 }
 #endif
 
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..2738aa4 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@ 
 #include <asm/kdump.h>
 #include <asm/fadump.h>
 #include <asm/vio.h>
+#include <asm/tce.h>
 
 #define DBG(...)
 
@@ -856,3 +857,188 @@  void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		free_pages((unsigned long)vaddr, get_order(size));
 	}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+
+/*
+ * Returns the number of used IOMMU pages (4K) within
+ * the same system page (4K or 64K).
+ * bitmap_weight is not used as it does not support bigendian maps.
+ */
+static int syspage_weight(unsigned long *map, unsigned long entry)
+{
+	int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
+
+	/* Aligns TCE entry number to system page boundary */
+	entry &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
+
+	/* Count used 4K pages */
+	while (nbits--)
+		ret += (test_bit(entry++, map) == 0) ? 0 : 1;
+
+	return ret;
+}
+
+static void tce_flush(struct iommu_table *tbl)
+{
+	/* Flush/invalidate TLB caches if necessary */
+	if (ppc_md.tce_flush)
+		ppc_md.tce_flush(tbl);
+
+	/* Make sure updates are seen by hardware */
+	mb();
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number of system pages
+ * which it called put_page() on
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int i, retpages = 0;
+	unsigned long oldtce, oldweight;
+	struct page *page;
+
+	for (i = 0; i < pages; ++i) {
+		oldtce = ppc_md.tce_get(tbl, entry + i);
+		ppc_md.tce_free(tbl, entry + i, 1);
+
+		oldweight = syspage_weight(tbl->it_map, entry);
+		__clear_bit(entry - tbl->it_offset, tbl->it_map);
+
+		if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+		WARN_ON(!page);
+		if (!page)
+			continue;
+
+		if (oldtce & TCE_PCI_WRITE)
+			SetPageDirty(page);
+
+		put_page(page);
+
+		/* That was the last IOMMU page within the system page */
+		if ((oldweight == 1) && !syspage_weight(tbl->it_map, entry))
+			++retpages;
+	}
+
+	return retpages;
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number
+ / of released system pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+		unsigned long pages)
+{
+	int ret;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	spin_lock(&(pool->lock));
+	ret = clear_tces_nolock(tbl, entry, pages);
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction)
+{
+	int ret;
+	struct page *page = NULL;
+	unsigned long kva, offset, oldweight;
+
+	/* Map new TCE */
+	offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page);
+	if (ret < 1) {
+		printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, ret);
+		if (!ret || (ret > 1))
+			ret = -EFAULT;
+		return ret;
+	}
+
+	kva = (unsigned long) page_address(page);
+	kva += offset;
+
+	/* tce_build receives a virtual address */
+	entry += tbl->it_offset; /* Offset into real TCE table */
+	ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
+
+	/* tce_build() only returns non-zero for transient errors */
+	if (unlikely(ret)) {
+		printk(KERN_ERR "tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
+				tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
+		put_page(page);
+		return -EIO;
+	}
+
+	/* Calculate if new system page has been locked */
+	oldweight = syspage_weight(tbl->it_map, entry);
+	__set_bit(entry - tbl->it_offset, tbl->it_map);
+
+	return (oldweight == 0) ? 1 : 0;
+}
+
+/*
+ * iommu_put_tces builds tces and returned the number of actually
+ * locked system pages
+ */
+long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+		uint64_t tce, enum dma_data_direction direction,
+		unsigned long pages)
+{
+	int i, ret = 0, retpages = 0;
+	struct iommu_pool *pool = get_pool(tbl, entry);
+
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUG_ON(direction == DMA_NONE);
+
+	spin_lock(&(pool->lock));
+
+	/* Check if any is in use */
+	for (i = 0; i < pages; ++i) {
+		unsigned long oldtce = ppc_md.tce_get(tbl, entry + i);
+		if ((oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) ||
+				test_bit(entry + i, tbl->it_map)) {
+			WARN_ON(test_bit(entry + i, tbl->it_map));
+			spin_unlock(&(pool->lock));
+			return -EBUSY;
+		}
+	}
+
+	/* Put tces to the table */
+	for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
+		ret = put_tce(tbl, entry + i, tce, direction);
+		if (ret == 1)
+			++retpages;
+	}
+
+	/*
+	 * If failed, release locked pages, otherwise return the number
+	 * of locked system pages
+	 */
+	if (ret < 0)
+		clear_tces_nolock(tbl, entry, i);
+	else
+		ret = retpages;
+
+	tce_flush(tbl);
+	spin_unlock(&(pool->lock));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tces);
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 05205cf..21250ef 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -20,6 +20,7 @@ 
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -613,3 +614,137 @@  void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * IOMMU groups support required by VFIO
+ */
+static int add_device(struct device *dev)
+{
+	struct iommu_table *tbl;
+	int ret = 0;
+
+	if (WARN_ON(dev->iommu_group)) {
+		printk(KERN_WARNING "tce_vfio: device %s is already in iommu group %d, skipping\n",
+				dev_name(dev),
+				iommu_group_id(dev->iommu_group));
+		return -EBUSY;
+	}
+
+	tbl = get_iommu_table_base(dev);
+	if (!tbl) {
+		pr_debug("tce_vfio: skipping device %s with no tbl\n",
+				dev_name(dev));
+		return 0;
+	}
+
+	pr_debug("tce_vfio: adding %s to iommu group %d\n",
+			dev_name(dev), iommu_group_id(tbl->it_group));
+
+	ret = iommu_group_add_device(tbl->it_group, dev);
+	if (ret < 0)
+		printk(KERN_ERR "tce_vfio: %s has not been added, ret=%d\n",
+				dev_name(dev), ret);
+
+	return ret;
+}
+
+static void del_device(struct device *dev)
+{
+	iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+			      unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = iommu_bus_notifier,
+};
+
+static void group_release(void *iommu_data)
+{
+	struct iommu_table *tbl = iommu_data;
+	tbl->it_group = NULL;
+}
+
+static int __init tce_iommu_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp;
+
+	/* Allocate and initialize IOMMU groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+
+		/* Skip already initialized */
+		if (tbl->it_group)
+			continue;
+
+		grp = iommu_group_alloc();
+		if (IS_ERR(grp)) {
+			printk(KERN_INFO "tce_vfio: cannot create "
+					"new IOMMU group, ret=%ld\n",
+					PTR_ERR(grp));
+			return PTR_ERR(grp);
+		}
+		tbl->it_group = grp;
+		iommu_group_set_iommudata(grp, tbl, group_release);
+	}
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Add PCI devices to VFIO groups */
+	for_each_pci_dev(pdev)
+		add_device(&pdev->dev);
+
+	return 0;
+}
+
+static void __exit tce_iommu_cleanup(void)
+{
+	struct pci_dev *pdev = NULL;
+	struct iommu_table *tbl;
+	struct iommu_group *grp = NULL;
+
+	bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+
+	/* Delete PCI devices from VFIO groups */
+	for_each_pci_dev(pdev)
+		del_device(&pdev->dev);
+
+	/* Release VFIO groups */
+	for_each_pci_dev(pdev) {
+		tbl = get_iommu_table_base(&pdev->dev);
+		if (!tbl)
+			continue;
+		grp = tbl->it_group;
+
+		/* Skip (already) uninitialized */
+		if (!grp)
+			continue;
+
+		/* Do actual release, group_release() is expected to work */
+		iommu_group_put(grp);
+		BUG_ON(tbl->it_group);
+	}
+}
+
+module_init(tce_iommu_init);
+module_exit(tce_iommu_cleanup);
+#endif /* CONFIG_IOMMU_API */
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 9f69b56..29d11dc 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -187,4 +187,12 @@  config EXYNOS_IOMMU_DEBUG
 
 	  Say N unless you need kernel log message for IOMMU debugging
 
+config SPAPR_TCE_IOMMU
+	bool "sPAPR TCE IOMMU Support"
+	depends on PPC_POWERNV
+	select IOMMU_API
+	help
+	  Enables bits of IOMMU API required by VFIO. The iommu_ops is
+	  still not implemented.
+
 endif # IOMMU_SUPPORT