[kernel,v3,04/22] powerpc/vfio/iommu/kvm: Do not pin device memory

Message ID 20181113082823.2440-5-aik@ozlabs.ru
State Superseded
Series powerpc/powernv/npu, vfio: NVIDIA V100 + P9 passthrough

Commit Message

Alexey Kardashevskiy Nov. 13, 2018, 8:28 a.m. UTC
This new memory does not have page structs as it is not plugged into
the host, so gup() will fail on it anyway.

This adds two helpers:
- mm_iommu_newdev() to preregister the "memory device" memory so
the rest of the API can still be used;
- mm_iommu_is_devmem() to know if a physical address is within one of
these new regions, which we must not unpin.

This adds @mm to tce_page_is_contained() and iommu_tce_xchg() so they
can test whether the memory is device memory and avoid pfn_to_page().

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h       |  5 +-
 arch/powerpc/include/asm/mmu_context.h |  5 ++
 arch/powerpc/kernel/iommu.c            |  9 ++-
 arch/powerpc/kvm/book3s_64_vio.c       | 18 +++---
 arch/powerpc/mm/mmu_context_iommu.c    | 83 +++++++++++++++++++++++---
 drivers/vfio/vfio_iommu_spapr_tce.c    | 28 +++++----
 6 files changed, 116 insertions(+), 32 deletions(-)
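
As a usage sketch (illustrative only -- the addresses, size and calling
context below are made up, not taken from this series), a caller that
knows where the device memory sits in the host physical address space
would preregister it once and later test candidate addresses:

	/* Sketch: preregister 2GB of device memory mapped at userspace
	 * address @ua, backed by host physical address @dev_hpa. */
	unsigned long ua = 0x200000000000UL;
	unsigned long dev_hpa = 0x6000000000UL;
	unsigned long entries = (2UL << 30) >> PAGE_SHIFT;
	struct mm_iommu_table_group_mem_t *mem;
	long ret;

	ret = mm_iommu_newdev(current->mm, ua, entries, dev_hpa, &mem);
	if (ret)
		return ret;

	/* Later, before touching a page struct for a host physical
	 * address @hpa coming out of a TCE table @tbl: */
	if (!mm_iommu_is_devmem(current->mm, hpa, tbl->it_page_shift))
		SetPageDirty(pfn_to_page(hpa >> PAGE_SHIFT));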

Comments

David Gibson Nov. 16, 2018, 3:11 a.m. UTC | #1
On Tue, Nov 13, 2018 at 07:28:05PM +1100, Alexey Kardashevskiy wrote:
> This new memory does not have page structs as it is not plugged into
> the host, so gup() will fail on it anyway.
> 
> This adds two helpers:
> - mm_iommu_newdev() to preregister the "memory device" memory so
> the rest of the API can still be used;
> - mm_iommu_is_devmem() to know if a physical address is within one of
> these new regions, which we must not unpin.
> 
> This adds @mm to tce_page_is_contained() and iommu_tce_xchg() so they
> can test whether the memory is device memory and avoid pfn_to_page().
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h       |  5 +-
>  arch/powerpc/include/asm/mmu_context.h |  5 ++
>  arch/powerpc/kernel/iommu.c            |  9 ++-
>  arch/powerpc/kvm/book3s_64_vio.c       | 18 +++---
>  arch/powerpc/mm/mmu_context_iommu.c    | 83 +++++++++++++++++++++++---
>  drivers/vfio/vfio_iommu_spapr_tce.c    | 28 +++++----
>  6 files changed, 116 insertions(+), 32 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index 35db0cb..a8aeac0 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -218,8 +218,9 @@ extern void iommu_register_group(struct iommu_table_group *table_group,
>  extern int iommu_add_device(struct device *dev);
>  extern void iommu_del_device(struct device *dev);
>  extern int __init tce_iommu_bus_notifier_init(void);
> -extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
> -		unsigned long *hpa, enum dma_data_direction *direction);
> +extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
> +		unsigned long entry, unsigned long *hpa,
> +		enum dma_data_direction *direction);
>  #else
>  static inline void iommu_register_group(struct iommu_table_group *table_group,
>  					int pci_domain_number,
> diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> index 2d6b00d..f0f9f3d 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -24,6 +24,9 @@ extern bool mm_iommu_preregistered(struct mm_struct *mm);
>  extern long mm_iommu_new(struct mm_struct *mm,
>  		unsigned long ua, unsigned long entries,
>  		struct mm_iommu_table_group_mem_t **pmem);
> +extern long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
> +		unsigned long entries, unsigned long dev_hpa,
> +		struct mm_iommu_table_group_mem_t **pmem);
>  extern long mm_iommu_put(struct mm_struct *mm,
>  		struct mm_iommu_table_group_mem_t *mem);
>  extern void mm_iommu_init(struct mm_struct *mm);
> @@ -39,6 +42,8 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>  extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
>  		unsigned long ua, unsigned int pageshift, unsigned long *hpa);
>  extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua);
> +extern bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
> +		unsigned int pageshift);
>  extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
>  extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
>  #endif
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index f0dc680..8ccfdd9 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -47,6 +47,7 @@
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
>  #include <asm/tce.h>
> +#include <asm/mmu_context.h>
>  
>  #define DBG(...)
>  
> @@ -993,15 +994,17 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
>  }
>  EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
>  
> -long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
> -		unsigned long *hpa, enum dma_data_direction *direction)
> +long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
> +		unsigned long entry, unsigned long *hpa,
> +		enum dma_data_direction *direction)
>  {
>  	long ret;
>  
>  	ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
>  
>  	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
> -			(*direction == DMA_BIDIRECTIONAL)))
> +			(*direction == DMA_BIDIRECTIONAL)) &&
> +			!mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift))
>  		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));

What about the equivalent real mode paths?  I guess they won't ever be
called for this case, since they're only used on POWER8.  However some
checks or WARN_ON() or something to make that clear would be nice.
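
Something like the following, say (a rough sketch, assuming @mm were
plumbed through to iommu_tce_xchg_rm() the same way as the virtual-mode
path; note a plain WARN_ON_ONCE() is not necessarily safe in real mode,
so a wrapper along the lines of WARN_ON_ONCE_RM() in book3s_64_vio_hv.c
may be needed):

	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
			(*direction == DMA_BIDIRECTIONAL))) {
		/* Device memory has no page structs, and on POWER9 the
		 * real-mode path should never see it, so warn loudly. */
		if (!WARN_ON_ONCE(mm_iommu_is_devmem(mm, *hpa,
				tbl->it_page_shift)))
			SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
	}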

>  	/* if (unlikely(ret))
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index 62a8d03..532ab797 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -397,12 +397,13 @@ static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
>  	return H_SUCCESS;
>  }
>  
> -static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
> +static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl,
> +		unsigned long entry)
>  {
>  	unsigned long hpa = 0;
>  	enum dma_data_direction dir = DMA_NONE;
>  
> -	iommu_tce_xchg(tbl, entry, &hpa, &dir);
> +	iommu_tce_xchg(mm, tbl, entry, &hpa, &dir);
>  }
>  
>  static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
> @@ -433,7 +434,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
>  	unsigned long hpa = 0;
>  	long ret;
>  
> -	if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
> +	if (WARN_ON_ONCE(iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir)))
>  		return H_TOO_HARD;
>  
>  	if (dir == DMA_NONE)
> @@ -441,7 +442,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
>  
>  	ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
>  	if (ret != H_SUCCESS)
> -		iommu_tce_xchg(tbl, entry, &hpa, &dir);
> +		iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir);
>  
>  	return ret;
>  }
> @@ -487,7 +488,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
>  	if (mm_iommu_mapped_inc(mem))
>  		return H_TOO_HARD;
>  
> -	ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
> +	ret = iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir);
>  	if (WARN_ON_ONCE(ret)) {
>  		mm_iommu_mapped_dec(mem);
>  		return H_TOO_HARD;
> @@ -566,7 +567,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  					entry, ua, dir);
>  
>  		if (ret != H_SUCCESS) {
> -			kvmppc_clear_tce(stit->tbl, entry);
> +			kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
>  			goto unlock_exit;
>  		}
>  	}
> @@ -655,7 +656,8 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>  					iommu_tce_direction(tce));
>  
>  			if (ret != H_SUCCESS) {
> -				kvmppc_clear_tce(stit->tbl, entry);
> +				kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl,
> +						entry);
>  				goto unlock_exit;
>  			}
>  		}
> @@ -704,7 +706,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
>  				return ret;
>  
>  			WARN_ON_ONCE(1);
> -			kvmppc_clear_tce(stit->tbl, entry);
> +			kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
>  		}
>  	}
>  
> diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
> index 580d89e..62fe5fe 100644
> --- a/arch/powerpc/mm/mmu_context_iommu.c
> +++ b/arch/powerpc/mm/mmu_context_iommu.c
> @@ -47,6 +47,8 @@ struct mm_iommu_table_group_mem_t {
>  		struct page **hpages;	/* vmalloc'ed */
>  		phys_addr_t *hpas;
>  	};
> +#define MM_IOMMU_TABLE_INVALID_HPA	((uint64_t)-1)
> +	u64 dev_hpa;		/* Device memory base address */
>  };
>  
>  static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
> @@ -89,7 +91,8 @@ bool mm_iommu_preregistered(struct mm_struct *mm)
>  }
>  EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
>  
> -long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> +static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
> +		unsigned long entries, unsigned long dev_hpa,
>  		struct mm_iommu_table_group_mem_t **pmem)
>  {
>  	struct mm_iommu_table_group_mem_t *mem;
> @@ -112,11 +115,13 @@ long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>  
>  	}
>  
> -	ret = mm_iommu_adjust_locked_vm(mm, entries, true);
> -	if (ret)
> -		goto unlock_exit;
> +	if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
> +		ret = mm_iommu_adjust_locked_vm(mm, entries, true);
> +		if (ret)
> +			goto unlock_exit;
>  
> -	locked_entries = entries;
> +		locked_entries = entries;
> +	}
>  
>  	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
>  	if (!mem) {
> @@ -124,6 +129,13 @@ long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>  		goto unlock_exit;
>  	}
>  
> +	if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) {
> +		mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
> +		mem->dev_hpa = dev_hpa;
> +		goto good_exit;
> +	}
> +	mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA;
> +
>  	/*
>  	 * For a starting point for a maximum page size calculation
>  	 * we use @ua and @entries natural alignment to allow IOMMU pages
> @@ -180,6 +192,7 @@ long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>  
>  	}
>  
> +good_exit:
>  	atomic64_set(&mem->mapped, 1);
>  	mem->used = 1;
>  	mem->ua = ua;
> @@ -196,13 +209,31 @@ long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>  
>  	return ret;
>  }
> +
> +long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
> +		struct mm_iommu_table_group_mem_t **pmem)
> +{
> +	return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA,
> +			pmem);
> +}
>  EXPORT_SYMBOL_GPL(mm_iommu_new);
>  
> +long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
> +		unsigned long entries, unsigned long dev_hpa,
> +		struct mm_iommu_table_group_mem_t **pmem)
> +{
> +	return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem);
> +}
> +EXPORT_SYMBOL_GPL(mm_iommu_newdev);
> +
>  static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
>  {
>  	long i;
>  	struct page *page = NULL;
>  
> +	if (!mem->hpas)
> +		return;
> +
>  	for (i = 0; i < mem->entries; ++i) {
>  		if (!mem->hpas[i])
>  			continue;
> @@ -244,6 +275,7 @@ static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
>  long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
>  {
>  	long ret = 0;
> +	unsigned long entries, dev_hpa;
>  
>  	mutex_lock(&mem_list_mutex);
>  
> @@ -265,9 +297,12 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
>  	}
>  
>  	/* @mapped became 0 so now mappings are disabled, release the region */
> +	entries = mem->entries;
> +	dev_hpa = mem->dev_hpa;
>  	mm_iommu_release(mem);
>  
> -	mm_iommu_adjust_locked_vm(mm, mem->entries, false);
> +	if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
> +		mm_iommu_adjust_locked_vm(mm, entries, false);
>  
>  unlock_exit:
>  	mutex_unlock(&mem_list_mutex);
> @@ -337,7 +372,7 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>  		unsigned long ua, unsigned int pageshift, unsigned long *hpa)
>  {
>  	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
> -	u64 *va = &mem->hpas[entry];
> +	u64 *va;
>  
>  	if (entry >= mem->entries)
>  		return -EFAULT;
> @@ -345,6 +380,12 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>  	if (pageshift > mem->pageshift)
>  		return -EFAULT;
>  
> +	if (!mem->hpas) {
> +		*hpa = mem->dev_hpa + (ua - mem->ua);
> +		return 0;
> +	}
> +
> +	va = &mem->hpas[entry];
>  	*hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
>  
>  	return 0;
> @@ -355,7 +396,6 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
>  		unsigned long ua, unsigned int pageshift, unsigned long *hpa)
>  {
>  	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
> -	void *va = &mem->hpas[entry];
>  	unsigned long *pa;
>  
>  	if (entry >= mem->entries)
> @@ -364,7 +404,12 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
>  	if (pageshift > mem->pageshift)
>  		return -EFAULT;
>  
> -	pa = (void *) vmalloc_to_phys(va);
> +	if (!mem->hpas) {
> +		*hpa = mem->dev_hpa + (ua - mem->ua);
> +		return 0;
> +	}
> +
> +	pa = (void *) vmalloc_to_phys(&mem->hpas[entry]);
>  	if (!pa)
>  		return -EFAULT;
>  
> @@ -394,6 +439,26 @@ extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
>  	*pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
>  }
>  
> +extern bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
> +		unsigned int pageshift)
> +{
> +	struct mm_iommu_table_group_mem_t *mem;
> +	const unsigned long pagesize = 1UL << pageshift;
> +
> +	list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
> +		if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
> +			continue;
> +
> +		if ((mem->dev_hpa <= hpa) &&
> +				(hpa + pagesize <= mem->dev_hpa +
> +				 (mem->entries << PAGE_SHIFT)))
> +			return true;
> +	}
> +
> +	return false;
> +}
> +EXPORT_SYMBOL_GPL(mm_iommu_is_devmem);
> +
>  long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
>  {
>  	if (atomic64_inc_not_zero(&mem->mapped))
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 56db071..ed89137 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -222,8 +222,15 @@ static long tce_iommu_register_pages(struct tce_container *container,
>  	return ret;
>  }
>  
> -static bool tce_page_is_contained(struct page *page, unsigned page_shift)
> +static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
> +		unsigned int page_shift)
>  {
> +	struct page *page;
> +
> +	if (mm_iommu_is_devmem(mm, hpa, page_shift))
> +		return true;
> +
> +	page = pfn_to_page(hpa >> PAGE_SHIFT);
>  	/*
>  	 * Check that the TCE table granularity is not bigger than the size of
>  	 * a page we just found. Otherwise the hardware can get access to
> @@ -499,7 +506,8 @@ static int tce_iommu_clear(struct tce_container *container,
>  
>  		direction = DMA_NONE;
>  		oldhpa = 0;
> -		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
> +		ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa,
> +				&direction);
>  		if (ret)
>  			continue;
>  
> @@ -537,7 +545,6 @@ static long tce_iommu_build(struct tce_container *container,
>  		enum dma_data_direction direction)
>  {
>  	long i, ret = 0;
> -	struct page *page;
>  	unsigned long hpa;
>  	enum dma_data_direction dirtmp;
>  
> @@ -548,15 +555,16 @@ static long tce_iommu_build(struct tce_container *container,
>  		if (ret)
>  			break;
>  
> -		page = pfn_to_page(hpa >> PAGE_SHIFT);
> -		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
> +		if (!tce_page_is_contained(container->mm, hpa,
> +				tbl->it_page_shift)) {
>  			ret = -EPERM;
>  			break;
>  		}
>  
>  		hpa |= offset;
>  		dirtmp = direction;
> -		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
> +		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
> +				&dirtmp);
>  		if (ret) {
>  			tce_iommu_unuse_page(container, hpa);
>  			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
> @@ -583,7 +591,6 @@ static long tce_iommu_build_v2(struct tce_container *container,
>  		enum dma_data_direction direction)
>  {
>  	long i, ret = 0;
> -	struct page *page;
>  	unsigned long hpa;
>  	enum dma_data_direction dirtmp;
>  
> @@ -596,8 +603,8 @@ static long tce_iommu_build_v2(struct tce_container *container,
>  		if (ret)
>  			break;
>  
> -		page = pfn_to_page(hpa >> PAGE_SHIFT);
> -		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
> +		if (!tce_page_is_contained(container->mm, hpa,
> +				tbl->it_page_shift)) {
>  			ret = -EPERM;
>  			break;
>  		}
> @@ -610,7 +617,8 @@ static long tce_iommu_build_v2(struct tce_container *container,
>  		if (mm_iommu_mapped_inc(mem))
>  			break;
>  
> -		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
> +		ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa,
> +				&dirtmp);
>  		if (ret) {
>  			/* dirtmp cannot be DMA_NONE here */
>  			tce_iommu_unuse_page_v2(container, tbl, entry + i);