Patchwork [8/8] KVM: PPC: Add hugepage support for IOMMU in-kernel handling

login
register
mail settings
Submitter Alexey Kardashevskiy
Date July 6, 2013, 3:07 p.m.
Message ID <1373123227-22969-9-git-send-email-aik@ozlabs.ru>
Download mbox | patch
Permalink /patch/257278/
State Not Applicable
Headers show

Comments

Alexey Kardashevskiy - July 6, 2013, 3:07 p.m.
This adds special support for huge pages (16MB).  The reference
counting cannot be easily done for such pages in real mode (when
MMU is off) so we added a list of huge pages.  It is populated in
virtual mode and get_page is called just once per a huge page.
Real mode handlers check if the requested page is huge and in the list,
then no reference counting is done, otherwise an exit to virtual mode
happens.  The list is released at KVM exit.  At the moment the fastest
card available for tests uses up to 9 huge pages so walking through this
list is not very expensive.  However this can change and we may want
to optimize this.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

---

Changes:
2013/06/27:
* list of huge pages replaces with hashtable for better performance
* spinlock removed from real mode and only protects insertion of new
huge [ages descriptors into the hashtable

2013/06/05:
* fixed compile error when CONFIG_IOMMU_API=n

2013/05/20:
* the real mode handler now searches for a huge page by gpa (used to be pte)
* the virtual mode handler prints warning if it is called twice for the same
huge page as the real mode handler is expected to fail just once - when a huge
page is not in the list yet.
* the huge page is refcounted twice - when added to the hugepage list and
when used in the virtual mode hcall handler (can be optimized but it will
make the patch less nice).

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/kvm_host.h |  25 +++++++++
 arch/powerpc/kernel/iommu.c         |   6 ++-
 arch/powerpc/kvm/book3s_64_vio.c    | 104 +++++++++++++++++++++++++++++++++---
 arch/powerpc/kvm/book3s_64_vio_hv.c |  21 ++++++--
 4 files changed, 146 insertions(+), 10 deletions(-)
Alexander Graf - July 9, 2013, 5:32 p.m.
On 07/06/2013 05:07 PM, Alexey Kardashevskiy wrote:
> This adds special support for huge pages (16MB).  The reference
> counting cannot be easily done for such pages in real mode (when
> MMU is off) so we added a list of huge pages.  It is populated in
> virtual mode and get_page is called just once per a huge page.
> Real mode handlers check if the requested page is huge and in the list,
> then no reference counting is done, otherwise an exit to virtual mode
> happens.  The list is released at KVM exit.  At the moment the fastest
> card available for tests uses up to 9 huge pages so walking through this
> list is not very expensive.  However this can change and we may want
> to optimize this.
>
> Signed-off-by: Paul Mackerras<paulus@samba.org>
> Signed-off-by: Alexey Kardashevskiy<aik@ozlabs.ru>
>
> ---
>
> Changes:
> 2013/06/27:
> * list of huge pages replaces with hashtable for better performance

So the only thing your patch description really talks about is not true 
anymore?

> * spinlock removed from real mode and only protects insertion of new
> huge [ages descriptors into the hashtable
>
> 2013/06/05:
> * fixed compile error when CONFIG_IOMMU_API=n
>
> 2013/05/20:
> * the real mode handler now searches for a huge page by gpa (used to be pte)
> * the virtual mode handler prints warning if it is called twice for the same
> huge page as the real mode handler is expected to fail just once - when a huge
> page is not in the list yet.
> * the huge page is refcounted twice - when added to the hugepage list and
> when used in the virtual mode hcall handler (can be optimized but it will
> make the patch less nice).
>
> Signed-off-by: Alexey Kardashevskiy<aik@ozlabs.ru>
> ---
>   arch/powerpc/include/asm/kvm_host.h |  25 +++++++++
>   arch/powerpc/kernel/iommu.c         |   6 ++-
>   arch/powerpc/kvm/book3s_64_vio.c    | 104 +++++++++++++++++++++++++++++++++---
>   arch/powerpc/kvm/book3s_64_vio_hv.c |  21 ++++++--
>   4 files changed, 146 insertions(+), 10 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 53e61b2..a7508cf 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -30,6 +30,7 @@
>   #include<linux/kvm_para.h>
>   #include<linux/list.h>
>   #include<linux/atomic.h>
> +#include<linux/hashtable.h>
>   #include<asm/kvm_asm.h>
>   #include<asm/processor.h>
>   #include<asm/page.h>
> @@ -182,10 +183,34 @@ struct kvmppc_spapr_tce_table {
>   	u32 window_size;
>   	struct iommu_group *grp;		/* used for IOMMU groups */
>   	struct vfio_group *vfio_grp;		/* used for IOMMU groups */
> +	DECLARE_HASHTABLE(hash_tab, ilog2(64));	/* used for IOMMU groups */
> +	spinlock_t hugepages_write_lock;	/* used for IOMMU groups */
>   	struct { struct { unsigned long put, indir, stuff; } rm, vm; } stat;
>   	struct page *pages[0];
>   };
>
> +/*
> + * The KVM guest can be backed with 16MB pages.
> + * In this case, we cannot do page counting from the real mode
> + * as the compound pages are used - they are linked in a list
> + * with pointers as virtual addresses which are inaccessible
> + * in real mode.
> + *
> + * The code below keeps a 16MB pages list and uses page struct
> + * in real mode if it is already locked in RAM and inserted into
> + * the list or switches to the virtual mode where it can be
> + * handled in a usual manner.
> + */
> +#define KVMPPC_SPAPR_HUGEPAGE_HASH(gpa)	hash_32(gpa>>  24, 32)
> +
> +struct kvmppc_spapr_iommu_hugepage {
> +	struct hlist_node hash_node;
> +	unsigned long gpa;	/* Guest physical address */
> +	unsigned long hpa;	/* Host physical address */
> +	struct page *page;	/* page struct of the very first subpage */
> +	unsigned long size;	/* Huge page size (always 16MB at the moment) */
> +};
> +
>   struct kvmppc_linear_info {
>   	void		*base_virt;
>   	unsigned long	 base_pfn;
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 51678ec..e0b6eca 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -999,7 +999,8 @@ int iommu_free_tces(struct iommu_table *tbl, unsigned long entry,
>   			if (!pg) {
>   				ret = -EAGAIN;
>   			} else if (PageCompound(pg)) {
> -				ret = -EAGAIN;
> +				/* Hugepages will be released at KVM exit */
> +				ret = 0;
>   			} else {
>   				if (oldtce&  TCE_PCI_WRITE)
>   					SetPageDirty(pg);
> @@ -1009,6 +1010,9 @@ int iommu_free_tces(struct iommu_table *tbl, unsigned long entry,
>   			struct page *pg = pfn_to_page(oldtce>>  PAGE_SHIFT);
>   			if (!pg) {
>   				ret = -EAGAIN;
> +			} else if (PageCompound(pg)) {
> +				/* Hugepages will be released at KVM exit */
> +				ret = 0;
>   			} else {
>   				if (oldtce&  TCE_PCI_WRITE)
>   					SetPageDirty(pg);
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index 2b51f4a..c037219 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -46,6 +46,40 @@
>
>   #define ERROR_ADDR      ((void *)~(unsigned long)0x0)
>
> +#ifdef CONFIG_IOMMU_API

Can't you just make CONFIG_IOMMU_API mandatory in Kconfig?

> +static void kvmppc_iommu_hugepages_init(struct kvmppc_spapr_tce_table *tt)
> +{
> +	spin_lock_init(&tt->hugepages_write_lock);
> +	hash_init(tt->hash_tab);
> +}
> +
> +static void kvmppc_iommu_hugepages_cleanup(struct kvmppc_spapr_tce_table *tt)
> +{
> +	int bkt;
> +	struct kvmppc_spapr_iommu_hugepage *hp;
> +	struct hlist_node *tmp;
> +
> +	spin_lock(&tt->hugepages_write_lock);
> +	hash_for_each_safe(tt->hash_tab, bkt, tmp, hp, hash_node) {
> +		pr_debug("Release HP liobn=%llx #%u gpa=%lx hpa=%lx size=%ld\n",
> +				tt->liobn, bkt, hp->gpa, hp->hpa, hp->size);

trace point

> +		hlist_del_rcu(&hp->hash_node);
> +
> +		put_page(hp->page);

Don't you have to mark them dirty?

> +		kfree(hp);
> +	}
> +	spin_unlock(&tt->hugepages_write_lock);
> +}
> +#else
> +static void kvmppc_iommu_hugepages_init(struct kvmppc_spapr_tce_table *tt)
> +{
> +}
> +
> +static void kvmppc_iommu_hugepages_cleanup(struct kvmppc_spapr_tce_table *tt)
> +{
> +}
> +#endif /* CONFIG_IOMMU_API */
> +
>   static long kvmppc_stt_npages(unsigned long window_size)
>   {
>   	return ALIGN((window_size>>  SPAPR_TCE_SHIFT)
> @@ -112,6 +146,7 @@ static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
>
>   	mutex_lock(&kvm->lock);
>   	list_del(&stt->list);
> +	kvmppc_iommu_hugepages_cleanup(stt);
>
>   #ifdef CONFIG_IOMMU_API
>   	if (stt->grp) {
> @@ -200,6 +235,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>   	kvm_get_kvm(kvm);
>
>   	mutex_lock(&kvm->lock);
> +	kvmppc_iommu_hugepages_init(stt);
>   	list_add(&stt->list,&kvm->arch.spapr_tce_tables);
>
>   	mutex_unlock(&kvm->lock);
> @@ -283,6 +319,7 @@ long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
>
>   	kvm_get_kvm(kvm);
>   	mutex_lock(&kvm->lock);
> +	kvmppc_iommu_hugepages_init(tt);
>   	list_add(&tt->list,&kvm->arch.spapr_tce_tables);
>   	mutex_unlock(&kvm->lock);
>
> @@ -307,10 +344,17 @@ long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
>
>   /* Converts guest physical address to host virtual address */
>   static void __user *kvmppc_vm_gpa_to_hva_and_get(struct kvm_vcpu *vcpu,
> +		struct kvmppc_spapr_tce_table *tt,
>   		unsigned long gpa, struct page **pg, unsigned long *hpa)
>   {
>   	unsigned long hva, gfn = gpa>>  PAGE_SHIFT;
>   	struct kvm_memory_slot *memslot;
> +#ifdef CONFIG_IOMMU_API
> +	struct kvmppc_spapr_iommu_hugepage *hp;
> +	unsigned key = KVMPPC_SPAPR_HUGEPAGE_HASH(gpa);
> +	pte_t *ptep;
> +	unsigned int shift = 0;
> +#endif
>
>   	memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
>   	if (!memslot)
> @@ -325,6 +369,54 @@ static void __user *kvmppc_vm_gpa_to_hva_and_get(struct kvm_vcpu *vcpu,
>   		*hpa = __pa((unsigned long) page_address(*pg)) +
>   				(hva&  ~PAGE_MASK);
>
> +#ifdef CONFIG_IOMMU_API

This function is becoming incredibly large. Please split it up. Also 
please document the code.


Alex

> +	if (!PageCompound(*pg))
> +		return (void *) hva;
> +
> +	spin_lock(&tt->hugepages_write_lock);
> +	hash_for_each_possible_rcu(tt->hash_tab, hp, hash_node, key) {
> +		if ((gpa<  hp->gpa) || (gpa>= hp->gpa + hp->size))
> +			continue;
> +		if (hpa)
> +			*hpa = __pa((unsigned long) page_address(hp->page)) +
> +					(hva&  (hp->size - 1));
> +		goto unlock_exit;
> +	}
> +
> +	ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva,&shift);
> +	WARN_ON(!ptep);
> +
> +	if (!ptep || (shift<= PAGE_SHIFT)) {
> +		hva = (unsigned long) ERROR_ADDR;
> +		goto unlock_exit;
> +	}
> +
> +	hp = kzalloc(sizeof(*hp), GFP_KERNEL);
> +	if (!hp) {
> +		hva = (unsigned long) ERROR_ADDR;
> +		goto unlock_exit;
> +	}
> +
> +	hp->gpa = gpa&  ~((1<<  shift) - 1);
> +	hp->hpa = (pte_pfn(*ptep)<<  PAGE_SHIFT);
> +	hp->size = 1<<  shift;
> +
> +	if (get_user_pages_fast(hva&  ~(hp->size - 1), 1, 1,&hp->page) != 1) {
> +		hva = (unsigned long) ERROR_ADDR;
> +		kfree(hp);
> +		goto unlock_exit;
> +	}
> +	hash_add_rcu(tt->hash_tab,&hp->hash_node, key);
> +
> +	if (hpa)
> +		*hpa = __pa((unsigned long) page_address(hp->page)) +
> +				(hva&  (hp->size - 1));
> +unlock_exit:
> +	spin_unlock(&tt->hugepages_write_lock);
> +
> +	put_page(*pg);
> +	*pg = NULL;
> +#endif /* CONFIG_IOMMU_API */
>   	return (void *) hva;
>   }
>
> @@ -363,7 +455,7 @@ long kvmppc_vm_h_put_tce_iommu(struct kvm_vcpu *vcpu,
>   		if (iommu_tce_put_param_check(tbl, ioba, tce))
>   			return H_PARAMETER;
>
> -		hva = kvmppc_vm_gpa_to_hva_and_get(vcpu, tce,&pg,&hpa);
> +		hva = kvmppc_vm_gpa_to_hva_and_get(vcpu, tt, tce,&pg,&hpa);
>   		if (hva == ERROR_ADDR)
>   			return H_HARDWARE;
>   	}
> @@ -372,7 +464,7 @@ long kvmppc_vm_h_put_tce_iommu(struct kvm_vcpu *vcpu,
>   		return H_SUCCESS;
>
>   	pg = pfn_to_page(hpa>>  PAGE_SHIFT);
> -	if (pg)
> +	if (pg&&  !PageCompound(pg))
>   		put_page(pg);
>
>   	return H_HARDWARE;
> @@ -414,7 +506,7 @@ static long kvmppc_vm_h_put_tce_indirect_iommu(struct kvm_vcpu *vcpu,
>   					(i<<  IOMMU_PAGE_SHIFT), gpa))
>   			return H_PARAMETER;
>
> -		hva = kvmppc_vm_gpa_to_hva_and_get(vcpu, gpa,&pg,
> +		hva = kvmppc_vm_gpa_to_hva_and_get(vcpu, tt, gpa,&pg,
>   				&vcpu->arch.tce_tmp_hpas[i]);
>   		if (hva == ERROR_ADDR)
>   			goto putpages_flush_exit;
> @@ -429,7 +521,7 @@ putpages_flush_exit:
>   	for ( --i; i>= 0; --i) {
>   		struct page *pg;
>   		pg = pfn_to_page(vcpu->arch.tce_tmp_hpas[i]>>  PAGE_SHIFT);
> -		if (pg)
> +		if (pg&&  !PageCompound(pg))
>   			put_page(pg);
>   	}
>
> @@ -517,7 +609,7 @@ long kvmppc_vm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>   	if ((ioba + (npages<<  IOMMU_PAGE_SHIFT))>  tt->window_size)
>   		return H_PARAMETER;
>
> -	tces = kvmppc_vm_gpa_to_hva_and_get(vcpu, tce_list,&pg, NULL);
> +	tces = kvmppc_vm_gpa_to_hva_and_get(vcpu, tt, tce_list,&pg, NULL);
>   	if (tces == ERROR_ADDR)
>   		return H_TOO_HARD;
>
> @@ -547,7 +639,7 @@ long kvmppc_vm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>   		kvmppc_emulated_put_tce(tt, ioba + (i<<  IOMMU_PAGE_SHIFT),
>   				vcpu->arch.tce_tmp_hpas[i]);
>   put_list_page_exit:
> -	if (pg)
> +	if (pg&&  !PageCompound(pg))
>   		put_page(pg);
>
>   	if (vcpu->arch.tce_rm_fail != TCERM_NONE) {
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index f8103c6..8c6449f 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -132,6 +132,7 @@ EXPORT_SYMBOL_GPL(kvmppc_emulated_put_tce);
>    * returns ERROR_ADDR if failed.
>    */
>   static unsigned long kvmppc_rm_gpa_to_hpa_and_get(struct kvm_vcpu *vcpu,
> +		struct kvmppc_spapr_tce_table *tt,
>   		unsigned long gpa, struct page **pg)
>   {
>   	struct kvm_memory_slot *memslot;
> @@ -139,6 +140,20 @@ static unsigned long kvmppc_rm_gpa_to_hpa_and_get(struct kvm_vcpu *vcpu,
>   	unsigned long hva, hpa = ERROR_ADDR;
>   	unsigned long gfn = gpa>>  PAGE_SHIFT;
>   	unsigned shift = 0;
> +	struct kvmppc_spapr_iommu_hugepage *hp;
> +
> +	/* Try to find an already used hugepage */
> +	unsigned key = KVMPPC_SPAPR_HUGEPAGE_HASH(gpa);
> +
> +	hash_for_each_possible_rcu_notrace(tt->hash_tab, hp,
> +			hash_node, key) {
> +		if ((gpa<  hp->gpa) || (gpa>= hp->gpa + hp->size))
> +			continue;
> +
> +		*pg = NULL; /* Tell the caller not to put page */
> +
> +		return hp->hpa + (gpa&  (hp->size - 1));
> +	}
>
>   	memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
>   	if (!memslot)
> @@ -208,7 +223,7 @@ static long kvmppc_h_put_tce_iommu(struct kvm_vcpu *vcpu,
>   	if (iommu_tce_put_param_check(tbl, ioba, tce))
>   		return H_PARAMETER;
>
> -	hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tce,&pg);
> +	hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tt, tce,&pg);
>   	if (hpa == ERROR_ADDR)
>   		return H_TOO_HARD;
>
> @@ -247,7 +262,7 @@ static long kvmppc_h_put_tce_indirect_iommu(struct kvm_vcpu *vcpu,
>
>   	/* Translate TCEs and go get_page() */
>   	for (i = 0; i<  npages; ++i) {
> -		hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tces[i],&pg);
> +		hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tt, tces[i],&pg);
>   		if (hpa == ERROR_ADDR) {
>   			vcpu->arch.tce_tmp_num = i;
>   			vcpu->arch.tce_rm_fail = TCERM_GETPAGE;
> @@ -342,7 +357,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>   	if ((ioba + (npages<<  IOMMU_PAGE_SHIFT))>  tt->window_size)
>   		return H_PARAMETER;
>
> -	tces = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tce_list,&pg);
> +	tces = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tt, tce_list,&pg);
>   	if (tces == ERROR_ADDR)
>   		return H_TOO_HARD;
>
Alexey Kardashevskiy - July 9, 2013, 11:29 p.m.
On 07/10/2013 03:32 AM, Alexander Graf wrote:
> On 07/06/2013 05:07 PM, Alexey Kardashevskiy wrote:
>> This adds special support for huge pages (16MB).  The reference
>> counting cannot be easily done for such pages in real mode (when
>> MMU is off) so we added a list of huge pages.  It is populated in
>> virtual mode and get_page is called just once per a huge page.
>> Real mode handlers check if the requested page is huge and in the list,
>> then no reference counting is done, otherwise an exit to virtual mode
>> happens.  The list is released at KVM exit.  At the moment the fastest
>> card available for tests uses up to 9 huge pages so walking through this
>> list is not very expensive.  However this can change and we may want
>> to optimize this.
>>
>> Signed-off-by: Paul Mackerras<paulus@samba.org>
>> Signed-off-by: Alexey Kardashevskiy<aik@ozlabs.ru>
>>
>> ---
>>
>> Changes:
>> 2013/06/27:
>> * list of huge pages replaces with hashtable for better performance
> 
> So the only thing your patch description really talks about is not true
> anymore?
> 
>> * spinlock removed from real mode and only protects insertion of new
>> huge [ages descriptors into the hashtable
>>
>> 2013/06/05:
>> * fixed compile error when CONFIG_IOMMU_API=n
>>
>> 2013/05/20:
>> * the real mode handler now searches for a huge page by gpa (used to be pte)
>> * the virtual mode handler prints warning if it is called twice for the same
>> huge page as the real mode handler is expected to fail just once - when a
>> huge
>> page is not in the list yet.
>> * the huge page is refcounted twice - when added to the hugepage list and
>> when used in the virtual mode hcall handler (can be optimized but it will
>> make the patch less nice).
>>
>> Signed-off-by: Alexey Kardashevskiy<aik@ozlabs.ru>
>> ---
>>   arch/powerpc/include/asm/kvm_host.h |  25 +++++++++
>>   arch/powerpc/kernel/iommu.c         |   6 ++-
>>   arch/powerpc/kvm/book3s_64_vio.c    | 104
>> +++++++++++++++++++++++++++++++++---
>>   arch/powerpc/kvm/book3s_64_vio_hv.c |  21 ++++++--
>>   4 files changed, 146 insertions(+), 10 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/kvm_host.h
>> b/arch/powerpc/include/asm/kvm_host.h
>> index 53e61b2..a7508cf 100644
>> --- a/arch/powerpc/include/asm/kvm_host.h
>> +++ b/arch/powerpc/include/asm/kvm_host.h
>> @@ -30,6 +30,7 @@
>>   #include<linux/kvm_para.h>
>>   #include<linux/list.h>
>>   #include<linux/atomic.h>
>> +#include<linux/hashtable.h>
>>   #include<asm/kvm_asm.h>
>>   #include<asm/processor.h>
>>   #include<asm/page.h>
>> @@ -182,10 +183,34 @@ struct kvmppc_spapr_tce_table {
>>       u32 window_size;
>>       struct iommu_group *grp;        /* used for IOMMU groups */
>>       struct vfio_group *vfio_grp;        /* used for IOMMU groups */
>> +    DECLARE_HASHTABLE(hash_tab, ilog2(64));    /* used for IOMMU groups */
>> +    spinlock_t hugepages_write_lock;    /* used for IOMMU groups */
>>       struct { struct { unsigned long put, indir, stuff; } rm, vm; } stat;
>>       struct page *pages[0];
>>   };
>>
>> +/*
>> + * The KVM guest can be backed with 16MB pages.
>> + * In this case, we cannot do page counting from the real mode
>> + * as the compound pages are used - they are linked in a list
>> + * with pointers as virtual addresses which are inaccessible
>> + * in real mode.
>> + *
>> + * The code below keeps a 16MB pages list and uses page struct
>> + * in real mode if it is already locked in RAM and inserted into
>> + * the list or switches to the virtual mode where it can be
>> + * handled in a usual manner.
>> + */
>> +#define KVMPPC_SPAPR_HUGEPAGE_HASH(gpa)    hash_32(gpa>>  24, 32)
>> +
>> +struct kvmppc_spapr_iommu_hugepage {
>> +    struct hlist_node hash_node;
>> +    unsigned long gpa;    /* Guest physical address */
>> +    unsigned long hpa;    /* Host physical address */
>> +    struct page *page;    /* page struct of the very first subpage */
>> +    unsigned long size;    /* Huge page size (always 16MB at the moment) */
>> +};
>> +
>>   struct kvmppc_linear_info {
>>       void        *base_virt;
>>       unsigned long     base_pfn;
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index 51678ec..e0b6eca 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -999,7 +999,8 @@ int iommu_free_tces(struct iommu_table *tbl, unsigned
>> long entry,
>>               if (!pg) {
>>                   ret = -EAGAIN;
>>               } else if (PageCompound(pg)) {
>> -                ret = -EAGAIN;
>> +                /* Hugepages will be released at KVM exit */
>> +                ret = 0;
>>               } else {
>>                   if (oldtce&  TCE_PCI_WRITE)
>>                       SetPageDirty(pg);
>> @@ -1009,6 +1010,9 @@ int iommu_free_tces(struct iommu_table *tbl,
>> unsigned long entry,
>>               struct page *pg = pfn_to_page(oldtce>>  PAGE_SHIFT);
>>               if (!pg) {
>>                   ret = -EAGAIN;
>> +            } else if (PageCompound(pg)) {
>> +                /* Hugepages will be released at KVM exit */
>> +                ret = 0;
>>               } else {
>>                   if (oldtce&  TCE_PCI_WRITE)
>>                       SetPageDirty(pg);
>> diff --git a/arch/powerpc/kvm/book3s_64_vio.c
>> b/arch/powerpc/kvm/book3s_64_vio.c
>> index 2b51f4a..c037219 100644
>> --- a/arch/powerpc/kvm/book3s_64_vio.c
>> +++ b/arch/powerpc/kvm/book3s_64_vio.c
>> @@ -46,6 +46,40 @@
>>
>>   #define ERROR_ADDR      ((void *)~(unsigned long)0x0)
>>
>> +#ifdef CONFIG_IOMMU_API
> 
> Can't you just make CONFIG_IOMMU_API mandatory in Kconfig?


Sure I can. I can do anything. Why should I? Do I have to do that to get
this accepted? I do not understand this comment. It has already been
discussed how to enable this option.


>> +static void kvmppc_iommu_hugepages_init(struct kvmppc_spapr_tce_table *tt)
>> +{
>> +    spin_lock_init(&tt->hugepages_write_lock);
>> +    hash_init(tt->hash_tab);
>> +}
>> +
>> +static void kvmppc_iommu_hugepages_cleanup(struct kvmppc_spapr_tce_table
>> *tt)
>> +{
>> +    int bkt;
>> +    struct kvmppc_spapr_iommu_hugepage *hp;
>> +    struct hlist_node *tmp;
>> +
>> +    spin_lock(&tt->hugepages_write_lock);
>> +    hash_for_each_safe(tt->hash_tab, bkt, tmp, hp, hash_node) {
>> +        pr_debug("Release HP liobn=%llx #%u gpa=%lx hpa=%lx size=%ld\n",
>> +                tt->liobn, bkt, hp->gpa, hp->hpa, hp->size);
> 
> trace point
> 
>> +        hlist_del_rcu(&hp->hash_node);
>> +
>> +        put_page(hp->page);
> 
> Don't you have to mark them dirty?


get_user_pages_fast is called with writing==1. Does not it do the same?

> 
>> +        kfree(hp);
>> +    }
>> +    spin_unlock(&tt->hugepages_write_lock);
>> +}
>> +#else
>> +static void kvmppc_iommu_hugepages_init(struct kvmppc_spapr_tce_table *tt)
>> +{
>> +}
>> +
>> +static void kvmppc_iommu_hugepages_cleanup(struct kvmppc_spapr_tce_table
>> *tt)
>> +{
>> +}
>> +#endif /* CONFIG_IOMMU_API */
>> +
>>   static long kvmppc_stt_npages(unsigned long window_size)
>>   {
>>       return ALIGN((window_size>>  SPAPR_TCE_SHIFT)
>> @@ -112,6 +146,7 @@ static void release_spapr_tce_table(struct
>> kvmppc_spapr_tce_table *stt)
>>
>>       mutex_lock(&kvm->lock);
>>       list_del(&stt->list);
>> +    kvmppc_iommu_hugepages_cleanup(stt);
>>
>>   #ifdef CONFIG_IOMMU_API
>>       if (stt->grp) {
>> @@ -200,6 +235,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>>       kvm_get_kvm(kvm);
>>
>>       mutex_lock(&kvm->lock);
>> +    kvmppc_iommu_hugepages_init(stt);
>>       list_add(&stt->list,&kvm->arch.spapr_tce_tables);
>>
>>       mutex_unlock(&kvm->lock);
>> @@ -283,6 +319,7 @@ long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm
>> *kvm,
>>
>>       kvm_get_kvm(kvm);
>>       mutex_lock(&kvm->lock);
>> +    kvmppc_iommu_hugepages_init(tt);
>>       list_add(&tt->list,&kvm->arch.spapr_tce_tables);
>>       mutex_unlock(&kvm->lock);
>>
>> @@ -307,10 +344,17 @@ long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm
>> *kvm,
>>
>>   /* Converts guest physical address to host virtual address */
>>   static void __user *kvmppc_vm_gpa_to_hva_and_get(struct kvm_vcpu *vcpu,
>> +        struct kvmppc_spapr_tce_table *tt,
>>           unsigned long gpa, struct page **pg, unsigned long *hpa)
>>   {
>>       unsigned long hva, gfn = gpa>>  PAGE_SHIFT;
>>       struct kvm_memory_slot *memslot;
>> +#ifdef CONFIG_IOMMU_API
>> +    struct kvmppc_spapr_iommu_hugepage *hp;
>> +    unsigned key = KVMPPC_SPAPR_HUGEPAGE_HASH(gpa);
>> +    pte_t *ptep;
>> +    unsigned int shift = 0;
>> +#endif
>>
>>       memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
>>       if (!memslot)
>> @@ -325,6 +369,54 @@ static void __user
>> *kvmppc_vm_gpa_to_hva_and_get(struct kvm_vcpu *vcpu,
>>           *hpa = __pa((unsigned long) page_address(*pg)) +
>>                   (hva&  ~PAGE_MASK);
>>
>> +#ifdef CONFIG_IOMMU_API
> 
> This function is becoming incredibly large. Please split it up. Also please
> document the code.


Less than 100 lines is incredibly large? There are _many_ functions bigger
than that. I do not really see the point in making a separate function
which is going to be called only once.
Alexander Graf - July 10, 2013, 10:33 a.m.
On 10.07.2013, at 01:29, Alexey Kardashevskiy wrote:

> On 07/10/2013 03:32 AM, Alexander Graf wrote:
>> On 07/06/2013 05:07 PM, Alexey Kardashevskiy wrote:
>>> This adds special support for huge pages (16MB).  The reference
>>> counting cannot be easily done for such pages in real mode (when
>>> MMU is off) so we added a list of huge pages.  It is populated in
>>> virtual mode and get_page is called just once per a huge page.
>>> Real mode handlers check if the requested page is huge and in the list,
>>> then no reference counting is done, otherwise an exit to virtual mode
>>> happens.  The list is released at KVM exit.  At the moment the fastest
>>> card available for tests uses up to 9 huge pages so walking through this
>>> list is not very expensive.  However this can change and we may want
>>> to optimize this.
>>> 
>>> Signed-off-by: Paul Mackerras<paulus@samba.org>
>>> Signed-off-by: Alexey Kardashevskiy<aik@ozlabs.ru>
>>> 
>>> ---
>>> 
>>> Changes:
>>> 2013/06/27:
>>> * list of huge pages replaces with hashtable for better performance
>> 
>> So the only thing your patch description really talks about is not true
>> anymore?
>> 
>>> * spinlock removed from real mode and only protects insertion of new
>>> huge [ages descriptors into the hashtable
>>> 
>>> 2013/06/05:
>>> * fixed compile error when CONFIG_IOMMU_API=n
>>> 
>>> 2013/05/20:
>>> * the real mode handler now searches for a huge page by gpa (used to be pte)
>>> * the virtual mode handler prints warning if it is called twice for the same
>>> huge page as the real mode handler is expected to fail just once - when a
>>> huge
>>> page is not in the list yet.
>>> * the huge page is refcounted twice - when added to the hugepage list and
>>> when used in the virtual mode hcall handler (can be optimized but it will
>>> make the patch less nice).
>>> 
>>> Signed-off-by: Alexey Kardashevskiy<aik@ozlabs.ru>
>>> ---
>>>  arch/powerpc/include/asm/kvm_host.h |  25 +++++++++
>>>  arch/powerpc/kernel/iommu.c         |   6 ++-
>>>  arch/powerpc/kvm/book3s_64_vio.c    | 104
>>> +++++++++++++++++++++++++++++++++---
>>>  arch/powerpc/kvm/book3s_64_vio_hv.c |  21 ++++++--
>>>  4 files changed, 146 insertions(+), 10 deletions(-)
>>> 
>>> diff --git a/arch/powerpc/include/asm/kvm_host.h
>>> b/arch/powerpc/include/asm/kvm_host.h
>>> index 53e61b2..a7508cf 100644
>>> --- a/arch/powerpc/include/asm/kvm_host.h
>>> +++ b/arch/powerpc/include/asm/kvm_host.h
>>> @@ -30,6 +30,7 @@
>>>  #include<linux/kvm_para.h>
>>>  #include<linux/list.h>
>>>  #include<linux/atomic.h>
>>> +#include<linux/hashtable.h>
>>>  #include<asm/kvm_asm.h>
>>>  #include<asm/processor.h>
>>>  #include<asm/page.h>
>>> @@ -182,10 +183,34 @@ struct kvmppc_spapr_tce_table {
>>>      u32 window_size;
>>>      struct iommu_group *grp;        /* used for IOMMU groups */
>>>      struct vfio_group *vfio_grp;        /* used for IOMMU groups */
>>> +    DECLARE_HASHTABLE(hash_tab, ilog2(64));    /* used for IOMMU groups */
>>> +    spinlock_t hugepages_write_lock;    /* used for IOMMU groups */
>>>      struct { struct { unsigned long put, indir, stuff; } rm, vm; } stat;
>>>      struct page *pages[0];
>>>  };
>>> 
>>> +/*
>>> + * The KVM guest can be backed with 16MB pages.
>>> + * In this case, we cannot do page counting from the real mode
>>> + * as the compound pages are used - they are linked in a list
>>> + * with pointers as virtual addresses which are inaccessible
>>> + * in real mode.
>>> + *
>>> + * The code below keeps a 16MB pages list and uses page struct
>>> + * in real mode if it is already locked in RAM and inserted into
>>> + * the list or switches to the virtual mode where it can be
>>> + * handled in a usual manner.
>>> + */
>>> +#define KVMPPC_SPAPR_HUGEPAGE_HASH(gpa)    hash_32(gpa>>  24, 32)
>>> +
>>> +struct kvmppc_spapr_iommu_hugepage {
>>> +    struct hlist_node hash_node;
>>> +    unsigned long gpa;    /* Guest physical address */
>>> +    unsigned long hpa;    /* Host physical address */
>>> +    struct page *page;    /* page struct of the very first subpage */
>>> +    unsigned long size;    /* Huge page size (always 16MB at the moment) */
>>> +};
>>> +
>>>  struct kvmppc_linear_info {
>>>      void        *base_virt;
>>>      unsigned long     base_pfn;
>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>>> index 51678ec..e0b6eca 100644
>>> --- a/arch/powerpc/kernel/iommu.c
>>> +++ b/arch/powerpc/kernel/iommu.c
>>> @@ -999,7 +999,8 @@ int iommu_free_tces(struct iommu_table *tbl, unsigned
>>> long entry,
>>>              if (!pg) {
>>>                  ret = -EAGAIN;
>>>              } else if (PageCompound(pg)) {
>>> -                ret = -EAGAIN;
>>> +                /* Hugepages will be released at KVM exit */
>>> +                ret = 0;
>>>              } else {
>>>                  if (oldtce&  TCE_PCI_WRITE)
>>>                      SetPageDirty(pg);
>>> @@ -1009,6 +1010,9 @@ int iommu_free_tces(struct iommu_table *tbl,
>>> unsigned long entry,
>>>              struct page *pg = pfn_to_page(oldtce>>  PAGE_SHIFT);
>>>              if (!pg) {
>>>                  ret = -EAGAIN;
>>> +            } else if (PageCompound(pg)) {
>>> +                /* Hugepages will be released at KVM exit */
>>> +                ret = 0;
>>>              } else {
>>>                  if (oldtce&  TCE_PCI_WRITE)
>>>                      SetPageDirty(pg);
>>> diff --git a/arch/powerpc/kvm/book3s_64_vio.c
>>> b/arch/powerpc/kvm/book3s_64_vio.c
>>> index 2b51f4a..c037219 100644
>>> --- a/arch/powerpc/kvm/book3s_64_vio.c
>>> +++ b/arch/powerpc/kvm/book3s_64_vio.c
>>> @@ -46,6 +46,40 @@
>>> 
>>>  #define ERROR_ADDR      ((void *)~(unsigned long)0x0)
>>> 
>>> +#ifdef CONFIG_IOMMU_API
>> 
>> Can't you just make CONFIG_IOMMU_API mandatory in Kconfig?
> 
> 
> Sure I can. I can do anything. Why should I?

To get rid of #ifdef's. They make code hard to maintain.

> Do I have to do that to get
> this accepted? I do not understand this comment. It has already been
> discussed how to enable this option.
> 
> 
>>> +static void kvmppc_iommu_hugepages_init(struct kvmppc_spapr_tce_table *tt)
>>> +{
>>> +    spin_lock_init(&tt->hugepages_write_lock);
>>> +    hash_init(tt->hash_tab);
>>> +}
>>> +
>>> +static void kvmppc_iommu_hugepages_cleanup(struct kvmppc_spapr_tce_table
>>> *tt)
>>> +{
>>> +    int bkt;
>>> +    struct kvmppc_spapr_iommu_hugepage *hp;
>>> +    struct hlist_node *tmp;
>>> +
>>> +    spin_lock(&tt->hugepages_write_lock);
>>> +    hash_for_each_safe(tt->hash_tab, bkt, tmp, hp, hash_node) {
>>> +        pr_debug("Release HP liobn=%llx #%u gpa=%lx hpa=%lx size=%ld\n",
>>> +                tt->liobn, bkt, hp->gpa, hp->hpa, hp->size);
>> 
>> trace point
>> 
>>> +        hlist_del_rcu(&hp->hash_node);
>>> +
>>> +        put_page(hp->page);
>> 
>> Don't you have to mark them dirty?
> 
> 
> get_user_pages_fast is called with writing==1. Does not it do the same?

It's not exactly obvious that you're calling it with writing == 1 :). Can you create a new local variable "is_write" in the calling function, set that to 1 before the call to get_user_pages_fast and pass it in instead of the 1? The compiler should easily optimize all of that away, but it makes the code by far easier to read.

> 
>> 
>>> +        kfree(hp);
>>> +    }
>>> +    spin_unlock(&tt->hugepages_write_lock);
>>> +}
>>> +#else
>>> +static void kvmppc_iommu_hugepages_init(struct kvmppc_spapr_tce_table *tt)
>>> +{
>>> +}
>>> +
>>> +static void kvmppc_iommu_hugepages_cleanup(struct kvmppc_spapr_tce_table
>>> *tt)
>>> +{
>>> +}
>>> +#endif /* CONFIG_IOMMU_API */
>>> +
>>>  static long kvmppc_stt_npages(unsigned long window_size)
>>>  {
>>>      return ALIGN((window_size>>  SPAPR_TCE_SHIFT)
>>> @@ -112,6 +146,7 @@ static void release_spapr_tce_table(struct
>>> kvmppc_spapr_tce_table *stt)
>>> 
>>>      mutex_lock(&kvm->lock);
>>>      list_del(&stt->list);
>>> +    kvmppc_iommu_hugepages_cleanup(stt);
>>> 
>>>  #ifdef CONFIG_IOMMU_API
>>>      if (stt->grp) {
>>> @@ -200,6 +235,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>>>      kvm_get_kvm(kvm);
>>> 
>>>      mutex_lock(&kvm->lock);
>>> +    kvmppc_iommu_hugepages_init(stt);
>>>      list_add(&stt->list,&kvm->arch.spapr_tce_tables);
>>> 
>>>      mutex_unlock(&kvm->lock);
>>> @@ -283,6 +319,7 @@ long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm
>>> *kvm,
>>> 
>>>      kvm_get_kvm(kvm);
>>>      mutex_lock(&kvm->lock);
>>> +    kvmppc_iommu_hugepages_init(tt);
>>>      list_add(&tt->list,&kvm->arch.spapr_tce_tables);
>>>      mutex_unlock(&kvm->lock);
>>> 
>>> @@ -307,10 +344,17 @@ long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm
>>> *kvm,
>>> 
>>>  /* Converts guest physical address to host virtual address */
>>>  static void __user *kvmppc_vm_gpa_to_hva_and_get(struct kvm_vcpu *vcpu,
>>> +        struct kvmppc_spapr_tce_table *tt,
>>>          unsigned long gpa, struct page **pg, unsigned long *hpa)
>>>  {
>>>      unsigned long hva, gfn = gpa>>  PAGE_SHIFT;
>>>      struct kvm_memory_slot *memslot;
>>> +#ifdef CONFIG_IOMMU_API
>>> +    struct kvmppc_spapr_iommu_hugepage *hp;
>>> +    unsigned key = KVMPPC_SPAPR_HUGEPAGE_HASH(gpa);
>>> +    pte_t *ptep;
>>> +    unsigned int shift = 0;
>>> +#endif
>>> 
>>>      memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
>>>      if (!memslot)
>>> @@ -325,6 +369,54 @@ static void __user
>>> *kvmppc_vm_gpa_to_hva_and_get(struct kvm_vcpu *vcpu,
>>>          *hpa = __pa((unsigned long) page_address(*pg)) +
>>>                  (hva&  ~PAGE_MASK);
>>> 
>>> +#ifdef CONFIG_IOMMU_API
>> 
>> This function is becoming incredibly large. Please split it up. Also please
>> document the code.
> 
> 
> Less than 100 lines is incredibly large? There are _many_ functions bigger
> than that. I do not really see the point in making a separate function
> which is going to be called only once.

Anything above 20 lines is too big usually, with very few exceptions. As mnemonic you can always imagine Linus sitting there with an 80x25 UNIX terminal, reading your code ;).


Alex
Benjamin Herrenschmidt - July 10, 2013, 10:39 a.m.
On Wed, 2013-07-10 at 12:33 +0200, Alexander Graf wrote:
> 
> It's not exactly obvious that you're calling it with writing == 1 :).
> Can you create a new local variable "is_write" in the calling
> function, set that to 1 before the call to get_user_pages_fast and
> pass it in instead of the 1? The compiler should easily optimize all
> of that away, but it makes the code by far easier to read.

Ugh ?

Nobody else does that .... (look at futex :-)

Ben.
Alexander Graf - July 10, 2013, 10:40 a.m.
On 10.07.2013, at 12:39, Benjamin Herrenschmidt wrote:

> On Wed, 2013-07-10 at 12:33 +0200, Alexander Graf wrote:
>> 
>> It's not exactly obvious that you're calling it with writing == 1 :).
>> Can you create a new local variable "is_write" in the calling
>> function, set that to 1 before the call to get_user_pages_fast and
>> pass it in instead of the 1? The compiler should easily optimize all
>> of that away, but it makes the code by far easier to read.
> 
> Ugh ?
> 
> Nobody else does that .... (look at futex :-)

Yeah, that's fortunately code that I don't have to read :).


Alex
Alexander Graf - July 10, 2013, 10:42 a.m.
On 10.07.2013, at 12:40, Alexander Graf wrote:

> 
> On 10.07.2013, at 12:39, Benjamin Herrenschmidt wrote:
> 
>> On Wed, 2013-07-10 at 12:33 +0200, Alexander Graf wrote:
>>> 
>>> It's not exactly obvious that you're calling it with writing == 1 :).
>>> Can you create a new local variable "is_write" in the calling
>>> function, set that to 1 before the call to get_user_pages_fast and
>>> pass it in instead of the 1? The compiler should easily optimize all
>>> of that away, but it makes the code by far easier to read.
>> 
>> Ugh ?
>> 
>> Nobody else does that .... (look at futex :-)
> 
> Yeah, that's fortunately code that I don't have to read :).

The "proper" alternative would be to pass an enum for read/write into the function rather than an int. But that'd be a pretty controversial, big change that I'd rather not put on Alexey. With a local variable we're nicely self-contained readable ;)


Alex
Alexey Kardashevskiy - July 11, 2013, 8:57 a.m.
On 07/10/2013 03:32 AM, Alexander Graf wrote:
> On 07/06/2013 05:07 PM, Alexey Kardashevskiy wrote:
>> This adds special support for huge pages (16MB).  The reference
>> counting cannot be easily done for such pages in real mode (when
>> MMU is off) so we added a list of huge pages.  It is populated in
>> virtual mode and get_page is called just once per a huge page.
>> Real mode handlers check if the requested page is huge and in the list,
>> then no reference counting is done, otherwise an exit to virtual mode
>> happens.  The list is released at KVM exit.  At the moment the fastest
>> card available for tests uses up to 9 huge pages so walking through this
>> list is not very expensive.  However this can change and we may want
>> to optimize this.
>>
>> Signed-off-by: Paul Mackerras<paulus@samba.org>
>> Signed-off-by: Alexey Kardashevskiy<aik@ozlabs.ru>
>>
>> ---
>>
>> Changes:
>> 2013/06/27:
>> * list of huge pages replaces with hashtable for better performance
> 
> So the only thing your patch description really talks about is not true
> anymore?
> 
>> * spinlock removed from real mode and only protects insertion of new
>> huge [ages descriptors into the hashtable
>>
>> 2013/06/05:
>> * fixed compile error when CONFIG_IOMMU_API=n
>>
>> 2013/05/20:
>> * the real mode handler now searches for a huge page by gpa (used to be pte)
>> * the virtual mode handler prints warning if it is called twice for the same
>> huge page as the real mode handler is expected to fail just once - when a
>> huge
>> page is not in the list yet.
>> * the huge page is refcounted twice - when added to the hugepage list and
>> when used in the virtual mode hcall handler (can be optimized but it will
>> make the patch less nice).
>>
>> Signed-off-by: Alexey Kardashevskiy<aik@ozlabs.ru>
>> ---
>>   arch/powerpc/include/asm/kvm_host.h |  25 +++++++++
>>   arch/powerpc/kernel/iommu.c         |   6 ++-
>>   arch/powerpc/kvm/book3s_64_vio.c    | 104
>> +++++++++++++++++++++++++++++++++---
>>   arch/powerpc/kvm/book3s_64_vio_hv.c |  21 ++++++--
>>   4 files changed, 146 insertions(+), 10 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/kvm_host.h
>> b/arch/powerpc/include/asm/kvm_host.h
>> index 53e61b2..a7508cf 100644
>> --- a/arch/powerpc/include/asm/kvm_host.h
>> +++ b/arch/powerpc/include/asm/kvm_host.h
>> @@ -30,6 +30,7 @@
>>   #include<linux/kvm_para.h>
>>   #include<linux/list.h>
>>   #include<linux/atomic.h>
>> +#include<linux/hashtable.h>
>>   #include<asm/kvm_asm.h>
>>   #include<asm/processor.h>
>>   #include<asm/page.h>
>> @@ -182,10 +183,34 @@ struct kvmppc_spapr_tce_table {
>>       u32 window_size;
>>       struct iommu_group *grp;        /* used for IOMMU groups */
>>       struct vfio_group *vfio_grp;        /* used for IOMMU groups */
>> +    DECLARE_HASHTABLE(hash_tab, ilog2(64));    /* used for IOMMU groups */
>> +    spinlock_t hugepages_write_lock;    /* used for IOMMU groups */
>>       struct { struct { unsigned long put, indir, stuff; } rm, vm; } stat;
>>       struct page *pages[0];
>>   };
>>
>> +/*
>> + * The KVM guest can be backed with 16MB pages.
>> + * In this case, we cannot do page counting from the real mode
>> + * as the compound pages are used - they are linked in a list
>> + * with pointers as virtual addresses which are inaccessible
>> + * in real mode.
>> + *
>> + * The code below keeps a 16MB pages list and uses page struct
>> + * in real mode if it is already locked in RAM and inserted into
>> + * the list or switches to the virtual mode where it can be
>> + * handled in a usual manner.
>> + */
>> +#define KVMPPC_SPAPR_HUGEPAGE_HASH(gpa)    hash_32(gpa>>  24, 32)
>> +
>> +struct kvmppc_spapr_iommu_hugepage {
>> +    struct hlist_node hash_node;
>> +    unsigned long gpa;    /* Guest physical address */
>> +    unsigned long hpa;    /* Host physical address */
>> +    struct page *page;    /* page struct of the very first subpage */
>> +    unsigned long size;    /* Huge page size (always 16MB at the moment) */
>> +};
>> +
>>   struct kvmppc_linear_info {
>>       void        *base_virt;
>>       unsigned long     base_pfn;
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index 51678ec..e0b6eca 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -999,7 +999,8 @@ int iommu_free_tces(struct iommu_table *tbl, unsigned
>> long entry,
>>               if (!pg) {
>>                   ret = -EAGAIN;
>>               } else if (PageCompound(pg)) {
>> -                ret = -EAGAIN;
>> +                /* Hugepages will be released at KVM exit */
>> +                ret = 0;
>>               } else {
>>                   if (oldtce&  TCE_PCI_WRITE)
>>                       SetPageDirty(pg);
>> @@ -1009,6 +1010,9 @@ int iommu_free_tces(struct iommu_table *tbl,
>> unsigned long entry,
>>               struct page *pg = pfn_to_page(oldtce>>  PAGE_SHIFT);
>>               if (!pg) {
>>                   ret = -EAGAIN;
>> +            } else if (PageCompound(pg)) {
>> +                /* Hugepages will be released at KVM exit */
>> +                ret = 0;
>>               } else {
>>                   if (oldtce&  TCE_PCI_WRITE)
>>                       SetPageDirty(pg);
>> diff --git a/arch/powerpc/kvm/book3s_64_vio.c
>> b/arch/powerpc/kvm/book3s_64_vio.c
>> index 2b51f4a..c037219 100644
>> --- a/arch/powerpc/kvm/book3s_64_vio.c
>> +++ b/arch/powerpc/kvm/book3s_64_vio.c
>> @@ -46,6 +46,40 @@
>>
>>   #define ERROR_ADDR      ((void *)~(unsigned long)0x0)
>>
>> +#ifdef CONFIG_IOMMU_API
> 
> Can't you just make CONFIG_IOMMU_API mandatory in Kconfig?


Where exactly (it is rather SPAPR_TCE_IOMMU but does not really matter)?
Select it on KVM_BOOK3S_64? CONFIG_KVM_BOOK3S_64_HV?
CONFIG_KVM_BOOK3S_64_PR? PPC_BOOK3S_64?

I am trying to imagine a configuration where we really do not want
IOMMU_API. Ben mentioned PPC32 and embedded PPC64 and that's it so any of
BOOK3S (KVM_BOOK3S_64 is the best) should be fine, no?
Alexander Graf - July 11, 2013, 9:52 a.m.
On 11.07.2013, at 10:57, Alexey Kardashevskiy wrote:

> On 07/10/2013 03:32 AM, Alexander Graf wrote:
>> On 07/06/2013 05:07 PM, Alexey Kardashevskiy wrote:
>>> This adds special support for huge pages (16MB).  The reference
>>> counting cannot be easily done for such pages in real mode (when
>>> MMU is off) so we added a list of huge pages.  It is populated in
>>> virtual mode and get_page is called just once per a huge page.
>>> Real mode handlers check if the requested page is huge and in the list,
>>> then no reference counting is done, otherwise an exit to virtual mode
>>> happens.  The list is released at KVM exit.  At the moment the fastest
>>> card available for tests uses up to 9 huge pages so walking through this
>>> list is not very expensive.  However this can change and we may want
>>> to optimize this.
>>> 
>>> Signed-off-by: Paul Mackerras<paulus@samba.org>
>>> Signed-off-by: Alexey Kardashevskiy<aik@ozlabs.ru>
>>> 
>>> ---
>>> 
>>> Changes:
>>> 2013/06/27:
>>> * list of huge pages replaces with hashtable for better performance
>> 
>> So the only thing your patch description really talks about is not true
>> anymore?
>> 
>>> * spinlock removed from real mode and only protects insertion of new
>>> huge [ages descriptors into the hashtable
>>> 
>>> 2013/06/05:
>>> * fixed compile error when CONFIG_IOMMU_API=n
>>> 
>>> 2013/05/20:
>>> * the real mode handler now searches for a huge page by gpa (used to be pte)
>>> * the virtual mode handler prints warning if it is called twice for the same
>>> huge page as the real mode handler is expected to fail just once - when a
>>> huge
>>> page is not in the list yet.
>>> * the huge page is refcounted twice - when added to the hugepage list and
>>> when used in the virtual mode hcall handler (can be optimized but it will
>>> make the patch less nice).
>>> 
>>> Signed-off-by: Alexey Kardashevskiy<aik@ozlabs.ru>
>>> ---
>>>  arch/powerpc/include/asm/kvm_host.h |  25 +++++++++
>>>  arch/powerpc/kernel/iommu.c         |   6 ++-
>>>  arch/powerpc/kvm/book3s_64_vio.c    | 104
>>> +++++++++++++++++++++++++++++++++---
>>>  arch/powerpc/kvm/book3s_64_vio_hv.c |  21 ++++++--
>>>  4 files changed, 146 insertions(+), 10 deletions(-)
>>> 
>>> diff --git a/arch/powerpc/include/asm/kvm_host.h
>>> b/arch/powerpc/include/asm/kvm_host.h
>>> index 53e61b2..a7508cf 100644
>>> --- a/arch/powerpc/include/asm/kvm_host.h
>>> +++ b/arch/powerpc/include/asm/kvm_host.h
>>> @@ -30,6 +30,7 @@
>>>  #include<linux/kvm_para.h>
>>>  #include<linux/list.h>
>>>  #include<linux/atomic.h>
>>> +#include<linux/hashtable.h>
>>>  #include<asm/kvm_asm.h>
>>>  #include<asm/processor.h>
>>>  #include<asm/page.h>
>>> @@ -182,10 +183,34 @@ struct kvmppc_spapr_tce_table {
>>>      u32 window_size;
>>>      struct iommu_group *grp;        /* used for IOMMU groups */
>>>      struct vfio_group *vfio_grp;        /* used for IOMMU groups */
>>> +    DECLARE_HASHTABLE(hash_tab, ilog2(64));    /* used for IOMMU groups */
>>> +    spinlock_t hugepages_write_lock;    /* used for IOMMU groups */
>>>      struct { struct { unsigned long put, indir, stuff; } rm, vm; } stat;
>>>      struct page *pages[0];
>>>  };
>>> 
>>> +/*
>>> + * The KVM guest can be backed with 16MB pages.
>>> + * In this case, we cannot do page counting from the real mode
>>> + * as the compound pages are used - they are linked in a list
>>> + * with pointers as virtual addresses which are inaccessible
>>> + * in real mode.
>>> + *
>>> + * The code below keeps a 16MB pages list and uses page struct
>>> + * in real mode if it is already locked in RAM and inserted into
>>> + * the list or switches to the virtual mode where it can be
>>> + * handled in a usual manner.
>>> + */
>>> +#define KVMPPC_SPAPR_HUGEPAGE_HASH(gpa)    hash_32(gpa>>  24, 32)
>>> +
>>> +struct kvmppc_spapr_iommu_hugepage {
>>> +    struct hlist_node hash_node;
>>> +    unsigned long gpa;    /* Guest physical address */
>>> +    unsigned long hpa;    /* Host physical address */
>>> +    struct page *page;    /* page struct of the very first subpage */
>>> +    unsigned long size;    /* Huge page size (always 16MB at the moment) */
>>> +};
>>> +
>>>  struct kvmppc_linear_info {
>>>      void        *base_virt;
>>>      unsigned long     base_pfn;
>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>>> index 51678ec..e0b6eca 100644
>>> --- a/arch/powerpc/kernel/iommu.c
>>> +++ b/arch/powerpc/kernel/iommu.c
>>> @@ -999,7 +999,8 @@ int iommu_free_tces(struct iommu_table *tbl, unsigned
>>> long entry,
>>>              if (!pg) {
>>>                  ret = -EAGAIN;
>>>              } else if (PageCompound(pg)) {
>>> -                ret = -EAGAIN;
>>> +                /* Hugepages will be released at KVM exit */
>>> +                ret = 0;
>>>              } else {
>>>                  if (oldtce&  TCE_PCI_WRITE)
>>>                      SetPageDirty(pg);
>>> @@ -1009,6 +1010,9 @@ int iommu_free_tces(struct iommu_table *tbl,
>>> unsigned long entry,
>>>              struct page *pg = pfn_to_page(oldtce>>  PAGE_SHIFT);
>>>              if (!pg) {
>>>                  ret = -EAGAIN;
>>> +            } else if (PageCompound(pg)) {
>>> +                /* Hugepages will be released at KVM exit */
>>> +                ret = 0;
>>>              } else {
>>>                  if (oldtce&  TCE_PCI_WRITE)
>>>                      SetPageDirty(pg);
>>> diff --git a/arch/powerpc/kvm/book3s_64_vio.c
>>> b/arch/powerpc/kvm/book3s_64_vio.c
>>> index 2b51f4a..c037219 100644
>>> --- a/arch/powerpc/kvm/book3s_64_vio.c
>>> +++ b/arch/powerpc/kvm/book3s_64_vio.c
>>> @@ -46,6 +46,40 @@
>>> 
>>>  #define ERROR_ADDR      ((void *)~(unsigned long)0x0)
>>> 
>>> +#ifdef CONFIG_IOMMU_API
>> 
>> Can't you just make CONFIG_IOMMU_API mandatory in Kconfig?
> 
> 
> Where exactly (it is rather SPAPR_TCE_IOMMU but does not really matter)?
> Select it on KVM_BOOK3S_64? CONFIG_KVM_BOOK3S_64_HV?
> CONFIG_KVM_BOOK3S_64_PR? PPC_BOOK3S_64?

I'd say the most logical choice would be to check the Makefile and see when it gets compiled. For those cases we want it enabled.

> I am trying to imagine a configuration where we really do not want
> IOMMU_API. Ben mentioned PPC32 and embedded PPC64 and that's it so any of
> BOOK3S (KVM_BOOK3S_64 is the best) should be fine, no?

book3s_32 doesn't want this, but any book3s_64 implementation could potentially use it, yes. That's pretty much what the Makefile tells you too :).


Alex
Benjamin Herrenschmidt - July 11, 2013, 12:37 p.m.
On Thu, 2013-07-11 at 11:52 +0200, Alexander Graf wrote:
> > Where exactly (it is rather SPAPR_TCE_IOMMU but does not really
> matter)?
> > Select it on KVM_BOOK3S_64? CONFIG_KVM_BOOK3S_64_HV?
> > CONFIG_KVM_BOOK3S_64_PR? PPC_BOOK3S_64?
> 
> I'd say the most logical choice would be to check the Makefile and see
> when it gets compiled. For those cases we want it enabled.

What *what* gets compiled ? You know our Makefile, it's crap :-)

We enable built-in things when CONFIG_KVM=m (which means you cannot take
a kernel build with CONFIG_KVM not set, enable CONFIG_KVM=m, and just
build the module, it won't work).

We could use KVM_BOOK3S_64 maybe ?

> > I am trying to imagine a configuration where we really do not want
> > IOMMU_API. Ben mentioned PPC32 and embedded PPC64 and that's it so
> any of
> > BOOK3S (KVM_BOOK3S_64 is the best) should be fine, no?
> 
> book3s_32 doesn't want this, but any book3s_64 implementation could
> potentially use it, yes. That's pretty much what the Makefile tells
> you too :).

Not really no. But that would do. You could have give a more useful
answer in the first place though rather than stringing him along.

Cheers,
Ben.
Alexander Graf - July 11, 2013, 12:50 p.m.
On 11.07.2013, at 14:37, Benjamin Herrenschmidt wrote:

> On Thu, 2013-07-11 at 11:52 +0200, Alexander Graf wrote:
>>> Where exactly (it is rather SPAPR_TCE_IOMMU but does not really
>> matter)?
>>> Select it on KVM_BOOK3S_64? CONFIG_KVM_BOOK3S_64_HV?
>>> CONFIG_KVM_BOOK3S_64_PR? PPC_BOOK3S_64?
>> 
>> I'd say the most logical choice would be to check the Makefile and see
>> when it gets compiled. For those cases we want it enabled.
> 
> What *what* gets compiled ? You know our Makefile, it's crap :-)
> 
> We enable built-in things when CONFIG_KVM=m (which means you cannot take
> a kernel build with CONFIG_KVM not set, enable CONFIG_KVM=m, and just
> build the module, it won't work).
> 
> We could use KVM_BOOK3S_64 maybe ?

If either a =m or a =y option selects a =y option, it gets selected regardless, no? So it shouldn't really matter where we attach it FWIW.

> 
>>> I am trying to imagine a configuration where we really do not want
>>> IOMMU_API. Ben mentioned PPC32 and embedded PPC64 and that's it so
>> any of
>>> BOOK3S (KVM_BOOK3S_64 is the best) should be fine, no?
>> 
>> book3s_32 doesn't want this, but any book3s_64 implementation could
>> potentially use it, yes. That's pretty much what the Makefile tells
>> you too :).
> 
> Not really no. But that would do. You could have give a more useful
> answer in the first place though rather than stringing him along.

Sorry, I figured it was obvious.


Alex
Benjamin Herrenschmidt - July 11, 2013, 12:56 p.m.
On Thu, 2013-07-11 at 14:50 +0200, Alexander Graf wrote:
> > Not really no. But that would do. You could have give a more useful
> > answer in the first place though rather than stringing him along.
> 
> Sorry, I figured it was obvious.

It wasn't no, because of the mess with modules and the nasty Makefile we
have in there. Even I had to scratch my head for a bit :-)

Ben.
chandrashekar shastri - July 11, 2013, 1:41 p.m.
Hi All,

I complied the latest kernel 3.10.0+ pulled from the git on top of  
3.10.0-rc5+ by enabling the new Virtualiztaion features. The compliation 
was sucessfull, when I rebooted the machine it fails to boot with error 
as " systemd [1] : Failed to mount /dev : no such device.

Is it problem with the KVM module?

Thanks,
Shastri

On 07/11/2013 06:26 PM, Benjamin Herrenschmidt wrote:
> On Thu, 2013-07-11 at 14:50 +0200, Alexander Graf wrote:
>>> Not really no. But that would do. You could have give a more useful
>>> answer in the first place though rather than stringing him along.
>> Sorry, I figured it was obvious.
> It wasn't no, because of the mess with modules and the nasty Makefile we
> have in there. Even I had to scratch my head for a bit :-)
>
> Ben.
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>
Alexander Graf - July 11, 2013, 1:44 p.m.
On 11.07.2013, at 15:41, chandrashekar shastri wrote:

> Hi All,
> 
> I complied the latest kernel 3.10.0+ pulled from the git on top of  3.10.0-rc5+ by enabling the new Virtualiztaion features. The compliation was sucessfull, when I rebooted the machine it fails to boot with error as " systemd [1] : Failed to mount /dev : no such device.
> 
> Is it problem with the KVM module?

Very unlikely. You're probably missing generic config options in your .config file. But this is very off topic for a) this thread and b) these mailing lists.


Alex
Alexey Kardashevskiy - July 11, 2013, 1:46 p.m.
On 07/11/2013 11:41 PM, chandrashekar shastri wrote:
> Hi All,
> 
> I complied the latest kernel 3.10.0+ pulled from the git on top of 
> 3.10.0-rc5+ by enabling the new Virtualiztaion features. The compliation
> was sucessfull, when I rebooted the machine it fails to boot with error as
> " systemd [1] : Failed to mount /dev : no such device.
> 
> Is it problem with the KVM module?


Wrong thread actually, would be better if you started the new one.

And you may want to try this - http://patchwork.ozlabs.org/patch/256027/

Patch

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 53e61b2..a7508cf 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -30,6 +30,7 @@ 
 #include <linux/kvm_para.h>
 #include <linux/list.h>
 #include <linux/atomic.h>
+#include <linux/hashtable.h>
 #include <asm/kvm_asm.h>
 #include <asm/processor.h>
 #include <asm/page.h>
@@ -182,10 +183,34 @@  struct kvmppc_spapr_tce_table {
 	u32 window_size;
 	struct iommu_group *grp;		/* used for IOMMU groups */
 	struct vfio_group *vfio_grp;		/* used for IOMMU groups */
+	DECLARE_HASHTABLE(hash_tab, ilog2(64));	/* used for IOMMU groups */
+	spinlock_t hugepages_write_lock;	/* used for IOMMU groups */
 	struct { struct { unsigned long put, indir, stuff; } rm, vm; } stat;
 	struct page *pages[0];
 };
 
+/*
+ * The KVM guest can be backed with 16MB pages.
+ * In this case, we cannot do page counting from the real mode
+ * as the compound pages are used - they are linked in a list
+ * with pointers as virtual addresses which are inaccessible
+ * in real mode.
+ *
+ * The code below keeps a 16MB pages list and uses page struct
+ * in real mode if it is already locked in RAM and inserted into
+ * the list or switches to the virtual mode where it can be
+ * handled in a usual manner.
+ */
+#define KVMPPC_SPAPR_HUGEPAGE_HASH(gpa)	hash_32(gpa >> 24, 32)
+
+struct kvmppc_spapr_iommu_hugepage {
+	struct hlist_node hash_node;
+	unsigned long gpa;	/* Guest physical address */
+	unsigned long hpa;	/* Host physical address */
+	struct page *page;	/* page struct of the very first subpage */
+	unsigned long size;	/* Huge page size (always 16MB at the moment) */
+};
+
 struct kvmppc_linear_info {
 	void		*base_virt;
 	unsigned long	 base_pfn;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 51678ec..e0b6eca 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -999,7 +999,8 @@  int iommu_free_tces(struct iommu_table *tbl, unsigned long entry,
 			if (!pg) {
 				ret = -EAGAIN;
 			} else if (PageCompound(pg)) {
-				ret = -EAGAIN;
+				/* Hugepages will be released at KVM exit */
+				ret = 0;
 			} else {
 				if (oldtce & TCE_PCI_WRITE)
 					SetPageDirty(pg);
@@ -1009,6 +1010,9 @@  int iommu_free_tces(struct iommu_table *tbl, unsigned long entry,
 			struct page *pg = pfn_to_page(oldtce >> PAGE_SHIFT);
 			if (!pg) {
 				ret = -EAGAIN;
+			} else if (PageCompound(pg)) {
+				/* Hugepages will be released at KVM exit */
+				ret = 0;
 			} else {
 				if (oldtce & TCE_PCI_WRITE)
 					SetPageDirty(pg);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 2b51f4a..c037219 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -46,6 +46,40 @@ 
 
 #define ERROR_ADDR      ((void *)~(unsigned long)0x0)
 
+#ifdef CONFIG_IOMMU_API
+static void kvmppc_iommu_hugepages_init(struct kvmppc_spapr_tce_table *tt)
+{
+	spin_lock_init(&tt->hugepages_write_lock);
+	hash_init(tt->hash_tab);
+}
+
+static void kvmppc_iommu_hugepages_cleanup(struct kvmppc_spapr_tce_table *tt)
+{
+	int bkt;
+	struct kvmppc_spapr_iommu_hugepage *hp;
+	struct hlist_node *tmp;
+
+	spin_lock(&tt->hugepages_write_lock);
+	hash_for_each_safe(tt->hash_tab, bkt, tmp, hp, hash_node) {
+		pr_debug("Release HP liobn=%llx #%u gpa=%lx hpa=%lx size=%ld\n",
+				tt->liobn, bkt, hp->gpa, hp->hpa, hp->size);
+		hlist_del_rcu(&hp->hash_node);
+
+		put_page(hp->page);
+		kfree(hp);
+	}
+	spin_unlock(&tt->hugepages_write_lock);
+}
+#else
+static void kvmppc_iommu_hugepages_init(struct kvmppc_spapr_tce_table *tt)
+{
+}
+
+static void kvmppc_iommu_hugepages_cleanup(struct kvmppc_spapr_tce_table *tt)
+{
+}
+#endif /* CONFIG_IOMMU_API */
+
 static long kvmppc_stt_npages(unsigned long window_size)
 {
 	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
@@ -112,6 +146,7 @@  static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
 
 	mutex_lock(&kvm->lock);
 	list_del(&stt->list);
+	kvmppc_iommu_hugepages_cleanup(stt);
 
 #ifdef CONFIG_IOMMU_API
 	if (stt->grp) {
@@ -200,6 +235,7 @@  long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 	kvm_get_kvm(kvm);
 
 	mutex_lock(&kvm->lock);
+	kvmppc_iommu_hugepages_init(stt);
 	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
 
 	mutex_unlock(&kvm->lock);
@@ -283,6 +319,7 @@  long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
 
 	kvm_get_kvm(kvm);
 	mutex_lock(&kvm->lock);
+	kvmppc_iommu_hugepages_init(tt);
 	list_add(&tt->list, &kvm->arch.spapr_tce_tables);
 	mutex_unlock(&kvm->lock);
 
@@ -307,10 +344,17 @@  long kvm_vm_ioctl_create_spapr_tce_iommu(struct kvm *kvm,
 
 /* Converts guest physical address to host virtual address */
 static void __user *kvmppc_vm_gpa_to_hva_and_get(struct kvm_vcpu *vcpu,
+		struct kvmppc_spapr_tce_table *tt,
 		unsigned long gpa, struct page **pg, unsigned long *hpa)
 {
 	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
 	struct kvm_memory_slot *memslot;
+#ifdef CONFIG_IOMMU_API
+	struct kvmppc_spapr_iommu_hugepage *hp;
+	unsigned key = KVMPPC_SPAPR_HUGEPAGE_HASH(gpa);
+	pte_t *ptep;
+	unsigned int shift = 0;
+#endif
 
 	memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
 	if (!memslot)
@@ -325,6 +369,54 @@  static void __user *kvmppc_vm_gpa_to_hva_and_get(struct kvm_vcpu *vcpu,
 		*hpa = __pa((unsigned long) page_address(*pg)) +
 				(hva & ~PAGE_MASK);
 
+#ifdef CONFIG_IOMMU_API
+	if (!PageCompound(*pg))
+		return (void *) hva;
+
+	spin_lock(&tt->hugepages_write_lock);
+	hash_for_each_possible_rcu(tt->hash_tab, hp, hash_node, key) {
+		if ((gpa < hp->gpa) || (gpa >= hp->gpa + hp->size))
+			continue;
+		if (hpa)
+			*hpa = __pa((unsigned long) page_address(hp->page)) +
+					(hva & (hp->size - 1));
+		goto unlock_exit;
+	}
+
+	ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift);
+	WARN_ON(!ptep);
+
+	if (!ptep || (shift <= PAGE_SHIFT)) {
+		hva = (unsigned long) ERROR_ADDR;
+		goto unlock_exit;
+	}
+
+	hp = kzalloc(sizeof(*hp), GFP_KERNEL);
+	if (!hp) {
+		hva = (unsigned long) ERROR_ADDR;
+		goto unlock_exit;
+	}
+
+	hp->gpa = gpa & ~((1 << shift) - 1);
+	hp->hpa = (pte_pfn(*ptep) << PAGE_SHIFT);
+	hp->size = 1 << shift;
+
+	if (get_user_pages_fast(hva & ~(hp->size - 1), 1, 1, &hp->page) != 1) {
+		hva = (unsigned long) ERROR_ADDR;
+		kfree(hp);
+		goto unlock_exit;
+	}
+	hash_add_rcu(tt->hash_tab, &hp->hash_node, key);
+
+	if (hpa)
+		*hpa = __pa((unsigned long) page_address(hp->page)) +
+				(hva & (hp->size - 1));
+unlock_exit:
+	spin_unlock(&tt->hugepages_write_lock);
+
+	put_page(*pg);
+	*pg = NULL;
+#endif /* CONFIG_IOMMU_API */
 	return (void *) hva;
 }
 
@@ -363,7 +455,7 @@  long kvmppc_vm_h_put_tce_iommu(struct kvm_vcpu *vcpu,
 		if (iommu_tce_put_param_check(tbl, ioba, tce))
 			return H_PARAMETER;
 
-		hva = kvmppc_vm_gpa_to_hva_and_get(vcpu, tce, &pg, &hpa);
+		hva = kvmppc_vm_gpa_to_hva_and_get(vcpu, tt, tce, &pg, &hpa);
 		if (hva == ERROR_ADDR)
 			return H_HARDWARE;
 	}
@@ -372,7 +464,7 @@  long kvmppc_vm_h_put_tce_iommu(struct kvm_vcpu *vcpu,
 		return H_SUCCESS;
 
 	pg = pfn_to_page(hpa >> PAGE_SHIFT);
-	if (pg)
+	if (pg && !PageCompound(pg))
 		put_page(pg);
 
 	return H_HARDWARE;
@@ -414,7 +506,7 @@  static long kvmppc_vm_h_put_tce_indirect_iommu(struct kvm_vcpu *vcpu,
 					(i << IOMMU_PAGE_SHIFT), gpa))
 			return H_PARAMETER;
 
-		hva = kvmppc_vm_gpa_to_hva_and_get(vcpu, gpa, &pg,
+		hva = kvmppc_vm_gpa_to_hva_and_get(vcpu, tt, gpa, &pg,
 				&vcpu->arch.tce_tmp_hpas[i]);
 		if (hva == ERROR_ADDR)
 			goto putpages_flush_exit;
@@ -429,7 +521,7 @@  putpages_flush_exit:
 	for ( --i; i >= 0; --i) {
 		struct page *pg;
 		pg = pfn_to_page(vcpu->arch.tce_tmp_hpas[i] >> PAGE_SHIFT);
-		if (pg)
+		if (pg && !PageCompound(pg))
 			put_page(pg);
 	}
 
@@ -517,7 +609,7 @@  long kvmppc_vm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
 		return H_PARAMETER;
 
-	tces = kvmppc_vm_gpa_to_hva_and_get(vcpu, tce_list, &pg, NULL);
+	tces = kvmppc_vm_gpa_to_hva_and_get(vcpu, tt, tce_list, &pg, NULL);
 	if (tces == ERROR_ADDR)
 		return H_TOO_HARD;
 
@@ -547,7 +639,7 @@  long kvmppc_vm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 		kvmppc_emulated_put_tce(tt, ioba + (i << IOMMU_PAGE_SHIFT),
 				vcpu->arch.tce_tmp_hpas[i]);
 put_list_page_exit:
-	if (pg)
+	if (pg && !PageCompound(pg))
 		put_page(pg);
 
 	if (vcpu->arch.tce_rm_fail != TCERM_NONE) {
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index f8103c6..8c6449f 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -132,6 +132,7 @@  EXPORT_SYMBOL_GPL(kvmppc_emulated_put_tce);
  * returns ERROR_ADDR if failed.
  */
 static unsigned long kvmppc_rm_gpa_to_hpa_and_get(struct kvm_vcpu *vcpu,
+		struct kvmppc_spapr_tce_table *tt,
 		unsigned long gpa, struct page **pg)
 {
 	struct kvm_memory_slot *memslot;
@@ -139,6 +140,20 @@  static unsigned long kvmppc_rm_gpa_to_hpa_and_get(struct kvm_vcpu *vcpu,
 	unsigned long hva, hpa = ERROR_ADDR;
 	unsigned long gfn = gpa >> PAGE_SHIFT;
 	unsigned shift = 0;
+	struct kvmppc_spapr_iommu_hugepage *hp;
+
+	/* Try to find an already used hugepage */
+	unsigned key = KVMPPC_SPAPR_HUGEPAGE_HASH(gpa);
+
+	hash_for_each_possible_rcu_notrace(tt->hash_tab, hp,
+			hash_node, key) {
+		if ((gpa < hp->gpa) || (gpa >= hp->gpa + hp->size))
+			continue;
+
+		*pg = NULL; /* Tell the caller not to put page */
+
+		return hp->hpa + (gpa & (hp->size - 1));
+	}
 
 	memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
 	if (!memslot)
@@ -208,7 +223,7 @@  static long kvmppc_h_put_tce_iommu(struct kvm_vcpu *vcpu,
 	if (iommu_tce_put_param_check(tbl, ioba, tce))
 		return H_PARAMETER;
 
-	hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tce, &pg);
+	hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tt, tce, &pg);
 	if (hpa == ERROR_ADDR)
 		return H_TOO_HARD;
 
@@ -247,7 +262,7 @@  static long kvmppc_h_put_tce_indirect_iommu(struct kvm_vcpu *vcpu,
 
 	/* Translate TCEs and go get_page() */
 	for (i = 0; i < npages; ++i) {
-		hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tces[i], &pg);
+		hpa = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tt, tces[i], &pg);
 		if (hpa == ERROR_ADDR) {
 			vcpu->arch.tce_tmp_num = i;
 			vcpu->arch.tce_rm_fail = TCERM_GETPAGE;
@@ -342,7 +357,7 @@  long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
 		return H_PARAMETER;
 
-	tces = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tce_list, &pg);
+	tces = kvmppc_rm_gpa_to_hpa_and_get(vcpu, tt, tce_list, &pg);
 	if (tces == ERROR_ADDR)
 		return H_TOO_HARD;