[RFC,v1,1/4] kvmppc: HMM backend driver to manage pages of secure guest

Message ID 20181022051837.1165-2-bharata@linux.ibm.com
State New
Headers show
Series
  • kvmppc: HMM backend driver to manage pages of secure guest
Related show

Commit Message

Bharata B Rao Oct. 22, 2018, 5:18 a.m.
HMM driver for KVM PPC to manage page transitions of
secure guest via H_SVM_PAGE_IN and H_SVM_PAGE_OUT hcalls.

H_SVM_PAGE_IN: Move the content of a normal page to secure page
H_SVM_PAGE_OUT: Move the content of a secure page to normal page

Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
---
 arch/powerpc/include/asm/hvcall.h    |   7 +-
 arch/powerpc/include/asm/kvm_host.h  |  15 +
 arch/powerpc/include/asm/kvm_ppc.h   |  28 ++
 arch/powerpc/include/asm/ucall-api.h |  20 ++
 arch/powerpc/kvm/Makefile            |   3 +
 arch/powerpc/kvm/book3s_hv.c         |  38 ++
 arch/powerpc/kvm/book3s_hv_hmm.c     | 514 +++++++++++++++++++++++++++
 7 files changed, 624 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/ucall-api.h
 create mode 100644 arch/powerpc/kvm/book3s_hv_hmm.c

Comments

Paul Mackerras Oct. 30, 2018, 5:03 a.m. | #1
On Mon, Oct 22, 2018 at 10:48:34AM +0530, Bharata B Rao wrote:
> HMM driver for KVM PPC to manage page transitions of
> secure guest via H_SVM_PAGE_IN and H_SVM_PAGE_OUT hcalls.
> 
> H_SVM_PAGE_IN: Move the content of a normal page to secure page
> H_SVM_PAGE_OUT: Move the content of a secure page to normal page

Comments below...

> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/hvcall.h    |   7 +-
>  arch/powerpc/include/asm/kvm_host.h  |  15 +
>  arch/powerpc/include/asm/kvm_ppc.h   |  28 ++
>  arch/powerpc/include/asm/ucall-api.h |  20 ++
>  arch/powerpc/kvm/Makefile            |   3 +
>  arch/powerpc/kvm/book3s_hv.c         |  38 ++
>  arch/powerpc/kvm/book3s_hv_hmm.c     | 514 +++++++++++++++++++++++++++
>  7 files changed, 624 insertions(+), 1 deletion(-)
>  create mode 100644 arch/powerpc/include/asm/ucall-api.h
>  create mode 100644 arch/powerpc/kvm/book3s_hv_hmm.c
> 
> diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
> index a0b17f9f1ea4..89e6b70c1857 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -158,6 +158,9 @@
>  /* Each control block has to be on a 4K boundary */
>  #define H_CB_ALIGNMENT          4096
>  
> +/* Flags for H_SVM_PAGE_IN */
> +#define H_PAGE_IN_SHARED	0x1
> +
>  /* pSeries hypervisor opcodes */
>  #define H_REMOVE		0x04
>  #define H_ENTER			0x08
> @@ -295,7 +298,9 @@
>  #define H_INT_ESB               0x3C8
>  #define H_INT_SYNC              0x3CC
>  #define H_INT_RESET             0x3D0
> -#define MAX_HCALL_OPCODE	H_INT_RESET
> +#define H_SVM_PAGE_IN		0x3D4
> +#define H_SVM_PAGE_OUT		0x3D8
> +#define MAX_HCALL_OPCODE	H_SVM_PAGE_OUT

We should define hcall numbers in the implementation-specific range.
We can't use numbers in this range without first getting them
standardized in PAPR.  Since these hcalls are not actually used by
the guest but are just a private interface between KVM and the
ultravisor, it's probably not worth putting them in PAPR.  We should
pick a range somewhere in the 0xf000 - 0xfffc area and use that.

>  /* H_VIOCTL functions */
>  #define H_GET_VIOA_DUMP_SIZE	0x01
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 906bcbdfd2a1..194e6e0ff239 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -310,6 +310,9 @@ struct kvm_arch {
>  	struct kvmppc_passthru_irqmap *pimap;
>  #endif
>  	struct kvmppc_ops *kvm_ops;
> +#ifdef CONFIG_PPC_SVM
> +	struct hlist_head *hmm_hash;
> +#endif

I'd suggest not having the #ifdef here, given it's only 8 bytes and
the type (struct hlist_head) doesn't depend on CONFIG_PPC_SVM.

>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  	/* This array can grow quite large, keep it at the end */
>  	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
> @@ -830,4 +833,16 @@ static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
>  static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
>  static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
>  
> +#ifdef CONFIG_PPC_SVM
> +struct kvmppc_hmm_device {
> +	struct hmm_device *device;
> +	struct hmm_devmem *devmem;
> +	unsigned long *pfn_bitmap;
> +};
> +
> +extern int kvmppc_hmm_init(void);
> +extern void kvmppc_hmm_free(void);
> +extern int kvmppc_hmm_hash_create(struct kvm *kvm);
> +extern void kvmppc_hmm_hash_destroy(struct kvm *kvm);
> +#endif
>  #endif /* __POWERPC_KVM_HOST_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index e991821dd7fa..ba81a07e2bdf 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -906,4 +906,32 @@ static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
>  
>  extern void xics_wake_cpu(int cpu);
>  
> +#ifdef CONFIG_PPC_SVM
> +extern unsigned long kvmppc_h_svm_page_in(struct kvm *kvm,
> +					  unsigned int lpid,
> +					  unsigned long gra,
> +					  unsigned long flags,
> +					  unsigned long page_shift);
> +extern unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
> +					  unsigned int lpid,
> +					  unsigned long gra,
> +					  unsigned long flags,
> +					  unsigned long page_shift);
> +#else
> +static inline unsigned long
> +kvmppc_h_svm_page_in(struct kvm *kvm, unsigned int lpid,
> +		     unsigned long gra, unsigned long flags,
> +		     unsigned long page_shift)
> +{
> +	return H_UNSUPPORTED;
> +}
> +
> +static inline unsigned long
> +kvmppc_h_svm_page_out(struct kvm *kvm, unsigned int lpid,
> +		      unsigned long gra, unsigned long flags,
> +		      unsigned long page_shift)
> +{
> +	return H_UNSUPPORTED;
> +}
> +#endif
>  #endif /* __POWERPC_KVM_PPC_H__ */
> diff --git a/arch/powerpc/include/asm/ucall-api.h b/arch/powerpc/include/asm/ucall-api.h
> new file mode 100644
> index 000000000000..2c12f514f8ab
> --- /dev/null
> +++ b/arch/powerpc/include/asm/ucall-api.h
> @@ -0,0 +1,20 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_POWERPC_UCALL_API_H
> +#define _ASM_POWERPC_UCALL_API_H
> +
> +#define U_SUCCESS 0
> +
> +/*
> + * TODO: Dummy uvcalls, will be replaced by real calls
> + */
> +static inline int uv_page_in(u64 lpid, u64 dw0, u64 dw1, u64 dw2, u64 dw3)
> +{
> +	return U_SUCCESS;
> +}
> +
> +static inline int uv_page_out(u64 lpid, u64 dw0, u64 dw1, u64 dw2, u64 dw3)
> +{
> +	return U_SUCCESS;
> +}

Please give these real parameter names, even if the implementations
are dummy implementations for now.

> +
> +#endif	/* _ASM_POWERPC_UCALL_API_H */
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index f872c04bb5b1..6945ffc18679 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -77,6 +77,9 @@ kvm-hv-y += \
>  	book3s_64_mmu_hv.o \
>  	book3s_64_mmu_radix.o
>  
> +kvm-hv-$(CONFIG_PPC_SVM) += \
> +	book3s_hv_hmm.o
> +
>  kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
>  	book3s_hv_tm.o
>  
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 3e3a71594e63..05084eb8aadd 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -73,6 +73,7 @@
>  #include <asm/opal.h>
>  #include <asm/xics.h>
>  #include <asm/xive.h>
> +#include <asm/kvm_host.h>
>  
>  #include "book3s.h"
>  
> @@ -935,6 +936,20 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>  		if (ret == H_TOO_HARD)
>  			return RESUME_HOST;
>  		break;
> +	case H_SVM_PAGE_IN:
> +		ret = kvmppc_h_svm_page_in(vcpu->kvm,
> +					   kvmppc_get_gpr(vcpu, 4),
> +					   kvmppc_get_gpr(vcpu, 5),
> +					   kvmppc_get_gpr(vcpu, 6),
> +					   kvmppc_get_gpr(vcpu, 7));
> +		break;
> +	case H_SVM_PAGE_OUT:
> +		ret = kvmppc_h_svm_page_out(vcpu->kvm,
> +					    kvmppc_get_gpr(vcpu, 4),
> +					    kvmppc_get_gpr(vcpu, 5),
> +					    kvmppc_get_gpr(vcpu, 6),
> +					    kvmppc_get_gpr(vcpu, 7));
> +		break;
>  	default:
>  		return RESUME_HOST;
>  	}
> @@ -961,6 +976,8 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
>  	case H_IPOLL:
>  	case H_XIRR_X:
>  #endif
> +	case H_SVM_PAGE_IN:
> +	case H_SVM_PAGE_OUT:
>  		return 1;
>  	}

You don't need this if you use hcall numbers in the implementation
specific range.

>  
> @@ -3938,6 +3955,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
>  		return -ENOMEM;
>  	kvm->arch.lpid = lpid;
>  
> +#ifdef CONFIG_PPC_SVM
> +	ret = kvmppc_hmm_hash_create(kvm);
> +	if (ret) {
> +		kvmppc_free_lpid(kvm->arch.lpid);
> +		return ret;
> +	}
> +#endif

Please don't put #ifdefs inside function definitions.  Instead,
arrange to define dummy/null implementations of
kvmppc_hmm_hash_create() etc. when CONFIG_PPC_SVM=n.

>  	kvmppc_alloc_host_rm_ops();
>  
>  	/*
> @@ -4073,6 +4097,9 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
>  
>  	kvmppc_free_vcores(kvm);
>  
> +#ifdef CONFIG_PPC_SVM
> +	kvmppc_hmm_hash_destroy(kvm);
> +#endif

Ditto.

>  	kvmppc_free_lpid(kvm->arch.lpid);
>  
>  	if (kvm_is_radix(kvm))
> @@ -4384,6 +4411,8 @@ static unsigned int default_hcall_list[] = {
>  	H_XIRR,
>  	H_XIRR_X,
>  #endif
> +	H_SVM_PAGE_IN,
> +	H_SVM_PAGE_OUT,

Once again, this goes away if these are in the implementation specific
range.

>  	0
>  };
>  
> @@ -4596,11 +4625,20 @@ static int kvmppc_book3s_init_hv(void)
>  			no_mixing_hpt_and_radix = true;
>  	}
>  
> +#ifdef CONFIG_PPC_SVM
> +	r = kvmppc_hmm_init();
> +	if (r < 0)
> +		pr_err("KVM-HV: kvmppc_hmm_init failed %d\n", r);
> +#endif

Same comment about #ifdefs.

> +
>  	return r;
>  }
>  
>  static void kvmppc_book3s_exit_hv(void)
>  {
> +#ifdef CONFIG_PPC_SVM
> +	kvmppc_hmm_free();
> +#endif

Ditto.

>  	kvmppc_free_host_rm_ops();
>  	if (kvmppc_radix_possible())
>  		kvmppc_radix_exit();
> diff --git a/arch/powerpc/kvm/book3s_hv_hmm.c b/arch/powerpc/kvm/book3s_hv_hmm.c
> new file mode 100644
> index 000000000000..a2ee3163a312
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_hmm.c
> @@ -0,0 +1,514 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * HMM driver to manage page migration between normal and secure
> + * memory.
> + *
> + * Based on Jérôme Glisse's HMM dummy driver.
> + *
> + * Copyright 2018 Bharata B Rao, IBM Corp. <bharata@linux.ibm.com>
> + */
> +
> +/*
> + * A pseries guest can be run as a secure guest on Ultravisor-enabled
> + * POWER platforms. On such platforms, this driver will be used to manage
> + * the movement of guest pages between the normal memory managed by
> + * hypervisor (HV) and secure memory managed by Ultravisor (UV).
> + *
> + * Private ZONE_DEVICE memory equal to the amount of secure memory
> + * available in the platform for running secure guests is created
> + * via a HMM device. The movement of pages between normal and secure
> + * memory is done by ->alloc_and_copy() callback routine of migrate_vma().
> + *
> + * The page-in or page-out requests from UV will come to HV as hcalls and
> + * HV will call back into UV via uvcalls to satisfy these page requests.
> + *
> + * For each page that gets moved into secure memory, a HMM PFN is used
> + * on the HV side and HMM migration PTE corresponding to that PFN would be
> + * populated in the QEMU page tables. A per-guest hash table is created to
> + * manage the pool of HMM PFNs. Guest real address is used as key to index
> + * into the hash table and choose a free HMM PFN.
> + */
> +
> +#include <linux/hmm.h>
> +#include <linux/kvm_host.h>
> +#include <linux/sched/mm.h>
> +#include <asm/ucall-api.h>
> +
> +static struct kvmppc_hmm_device *kvmppc_hmm;
> +spinlock_t kvmppc_hmm_lock;
> +
> +#define KVMPPC_HMM_HASH_BITS    10
> +#define KVMPPC_HMM_HASH_SIZE   (1 << KVMPPC_HMM_HASH_BITS)
> +
> +struct kvmppc_hmm_pfn_entry {
> +	struct hlist_node hlist;
> +	unsigned long addr;
> +	unsigned long hmm_pfn;
> +};
> +
> +struct kvmppc_hmm_page_pvt {
> +	struct hlist_head *hmm_hash;
> +	unsigned int lpid;
> +	unsigned long gpa;
> +};
> +
> +struct kvmppc_hmm_migrate_args {
> +	struct hlist_head *hmm_hash;
> +	unsigned int lpid;
> +	unsigned long gpa;
> +	unsigned long page_shift;
> +};
> +
> +int kvmppc_hmm_hash_create(struct kvm *kvm)
> +{
> +	int i;
> +
> +	kvm->arch.hmm_hash = kzalloc(KVMPPC_HMM_HASH_SIZE *
> +				     sizeof(struct hlist_head), GFP_KERNEL);
> +	if (!kvm->arch.hmm_hash)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < KVMPPC_HMM_HASH_SIZE; i++)
> +		INIT_HLIST_HEAD(&kvm->arch.hmm_hash[i]);
> +	return 0;
> +}

I gather that this hash table maps a guest real address (or pfn) to a
hmm_pfn, is that right?  I think that mapping could be stored in the
rmap arrays attached to the memslots, since there won't be any
hypervisor-side mappings of the page while it is over in the secure
space.  But maybe that could be a future optimization.

> +/*
> + * Cleanup the HMM pages hash table when guest terminates
> + *
> + * Iterate over all the HMM pages hash list entries and release
> + * reference on them. The actual freeing of the entry happens
> + * via hmm_devmem_ops.free path.
> + */
> +void kvmppc_hmm_hash_destroy(struct kvm *kvm)
> +{
> +	int i;
> +	struct kvmppc_hmm_pfn_entry *p;
> +	struct page *hmm_page;
> +
> +	for (i = 0; i < KVMPPC_HMM_HASH_SIZE; i++) {
> +		while (!hlist_empty(&kvm->arch.hmm_hash[i])) {
> +			p = hlist_entry(kvm->arch.hmm_hash[i].first,
> +					struct kvmppc_hmm_pfn_entry,
> +					hlist);
> +			hmm_page = pfn_to_page(p->hmm_pfn);
> +			put_page(hmm_page);
> +		}
> +	}
> +	kfree(kvm->arch.hmm_hash);
> +}
> +
> +static u64 kvmppc_hmm_pfn_hash_fn(u64 addr)
> +{
> +	return hash_64(addr, KVMPPC_HMM_HASH_BITS);
> +}
> +
> +static void
> +kvmppc_hmm_hash_free_pfn(struct hlist_head *hmm_hash, unsigned long gpa)
> +{
> +	struct kvmppc_hmm_pfn_entry *p;
> +	struct hlist_head *list;
> +
> +	list = &hmm_hash[kvmppc_hmm_pfn_hash_fn(gpa)];
> +	hlist_for_each_entry(p, list, hlist) {
> +		if (p->addr == gpa) {
> +			hlist_del(&p->hlist);
> +			kfree(p);
> +			return;
> +		}
> +	}
> +}
> +
> +/*
> + * Get a free HMM PFN from the pool
> + *
> + * Called when a normal page is moved to secure memory (UV_PAGE_IN). HMM
> + * PFN will be used to keep track of the secure page on HV side.
> + */
> +static struct page *kvmppc_hmm_get_page(struct hlist_head *hmm_hash,
> +					unsigned long gpa, unsigned int lpid)
> +{
> +	struct page *dpage = NULL;
> +	unsigned long bit;
> +	unsigned long nr_pfns = kvmppc_hmm->devmem->pfn_last -
> +				kvmppc_hmm->devmem->pfn_first;
> +	struct hlist_head *list;
> +	struct kvmppc_hmm_pfn_entry *p;
> +	bool found = false;
> +	unsigned long flags;
> +	struct kvmppc_hmm_page_pvt *pvt;
> +
> +	spin_lock_irqsave(&kvmppc_hmm_lock, flags);
> +	list = &hmm_hash[kvmppc_hmm_pfn_hash_fn(gpa)];
> +	hlist_for_each_entry(p, list, hlist) {
> +		if (p->addr == gpa) {
> +			found = true;
> +			break;
> +		}
> +	}
> +	if (!found) {
> +		p = kzalloc(sizeof(struct kvmppc_hmm_pfn_entry), GFP_ATOMIC);
> +		if (!p) {
> +			spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> +			return NULL;
> +		}
> +		p->addr = gpa;
> +		bit = find_first_zero_bit(kvmppc_hmm->pfn_bitmap, nr_pfns);
> +		if (bit >= nr_pfns) {
> +			kfree(p);
> +			spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> +			return NULL;
> +		}
> +		bitmap_set(kvmppc_hmm->pfn_bitmap, bit, 1);
> +		p->hmm_pfn = bit + kvmppc_hmm->devmem->pfn_first;
> +		INIT_HLIST_NODE(&p->hlist);
> +		hlist_add_head(&p->hlist, list);
> +	} else {
> +		spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> +		return NULL;
> +	}

I suggest you redo that if statement as:

	if (found) {
		spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
		return NULL;
	}

and then have the main path (the code starting with p = kzalloc...) at
a single level of indentation, so the normal/successful path is more
obvious.

> +	dpage = pfn_to_page(p->hmm_pfn);
> +
> +	if (!trylock_page(dpage)) {

Where does this page get unlocked?

> +		bitmap_clear(kvmppc_hmm->pfn_bitmap,
> +			     p->hmm_pfn - kvmppc_hmm->devmem->pfn_first, 1);
> +		hlist_del(&p->hlist);
> +		kfree(p);
> +		spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> +		return NULL;
> +	}
> +	spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> +
> +	pvt = kzalloc(sizeof(*pvt), GFP_ATOMIC);
> +	pvt->hmm_hash = hmm_hash;
> +	pvt->gpa = gpa;
> +	pvt->lpid = lpid;
> +	hmm_devmem_page_set_drvdata(dpage, (unsigned long)pvt);
> +
> +	get_page(dpage);
> +	return dpage;
> +}
> +
> +/*
> + * Release the HMM PFN back to the pool
> + *
> + * Called when secure page becomes a normal page during UV_PAGE_OUT.
> + */
> +static void kvmppc_hmm_put_page(struct page *page)
> +{
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned long flags;
> +	struct kvmppc_hmm_page_pvt *pvt;
> +
> +	pvt = (struct kvmppc_hmm_page_pvt *)hmm_devmem_page_get_drvdata(page);
> +	hmm_devmem_page_set_drvdata(page, 0);
> +
> +	spin_lock_irqsave(&kvmppc_hmm_lock, flags);
> +	bitmap_clear(kvmppc_hmm->pfn_bitmap,
> +		     pfn - kvmppc_hmm->devmem->pfn_first, 1);
> +	kvmppc_hmm_hash_free_pfn(pvt->hmm_hash, pvt->gpa);
> +	spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> +	kfree(pvt);
> +}
> +
> +static void
> +kvmppc_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma,
> +				  const unsigned long *src_pfns,
> +				  unsigned long *dst_pfns,
> +				  unsigned long start,
> +				  unsigned long end,
> +				  void *private)

A comment above this function telling the reader when it gets called
and what it is supposed to do would be helpful.  For example, which
way is it migrating pages - secure to normal, or normal to secure, or
either depending on how it's called?

> +{
> +	unsigned long addr;
> +	struct kvmppc_hmm_migrate_args *args = private;
> +	unsigned long page_size = 1UL << args->page_shift;
> +
> +	for (addr = start; addr < end;
> +		addr += page_size, src_pfns++, dst_pfns++) {
> +		struct page *spage = migrate_pfn_to_page(*src_pfns);
> +		struct page *dpage;
> +		unsigned long pfn = *src_pfns >> MIGRATE_PFN_SHIFT;
> +
> +		*dst_pfns = 0;
> +		if (!spage && !(*src_pfns & MIGRATE_PFN_MIGRATE))
> +			continue;
> +
> +		if (spage && !(*src_pfns & MIGRATE_PFN_MIGRATE))
> +			continue;

Don't those two statements above boil down to:

		if (!(*src_pfns & MIGRATE_PFN_MIGRATE))
			continue;

? Why two separate statements?

> +
> +		dpage = kvmppc_hmm_get_page(args->hmm_hash, args->gpa,
> +					    args->lpid);
> +		if (!dpage)
> +			continue;
> +
> +		if (spage)
> +			uv_page_in(args->lpid, pfn << args->page_shift,
> +				   args->gpa, 0, args->page_shift);
> +
> +		*dst_pfns = migrate_pfn(page_to_pfn(dpage)) |
> +			    MIGRATE_PFN_DEVICE | MIGRATE_PFN_LOCKED;
> +	}
> +}
> +
> +static void
> +kvmppc_hmm_migrate_finalize_and_map(struct vm_area_struct *vma,
> +				    const unsigned long *src_pfns,
> +				    const unsigned long *dst_pfns,
> +				    unsigned long start,
> +				    unsigned long end,
> +				    void *private)
> +{
> +}

A comment indicating when this is called and why we don't have to do
anything would be helpful.

> +static const struct migrate_vma_ops kvmppc_hmm_migrate_ops = {
> +	.alloc_and_copy = kvmppc_hmm_migrate_alloc_and_copy,
> +	.finalize_and_map = kvmppc_hmm_migrate_finalize_and_map,
> +};
> +
> +static unsigned long kvmppc_gpa_to_hva(struct kvm *kvm, unsigned long gpa,
> +				       unsigned long page_shift)
> +{
> +	unsigned long gfn, hva;
> +	struct kvm_memory_slot *memslot;
> +
> +	gfn = gpa >> page_shift;
> +	memslot = gfn_to_memslot(kvm, gfn);
> +	hva = gfn_to_hva_memslot(memslot, gfn);
> +
> +	return hva;
> +}

Isn't this just gfn_to_hva(kvm, gpa >> PAGE_SHIFT)?  Why do we need
our own implementation of gfn_to_hva()?

Also, I believe it's necessary to hold the srcu read lock for the VM
when looking up memslots. 

> +/*
> + * Move page from normal memory to secure memory.
> + */
> +unsigned long
> +kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
> +		     unsigned long flags, unsigned long page_shift)
> +{
> +	unsigned long addr, end;
> +	unsigned long src_pfn, dst_pfn;
> +	struct kvmppc_hmm_migrate_args args;
> +	struct mm_struct *mm = get_task_mm(current);
> +	struct vm_area_struct *vma;
> +	int ret = H_SUCCESS;
> +
> +	if (page_shift != PAGE_SHIFT)
> +		return H_P3;
> +
> +	addr = kvmppc_gpa_to_hva(kvm, gpa, page_shift);
> +	if (!addr)
> +		return H_PARAMETER;
> +	end = addr + (1UL << page_shift);
> +
> +	if (flags)
> +		return H_P2;
> +
> +	args.hmm_hash = kvm->arch.hmm_hash;
> +	args.lpid = kvm->arch.lpid;
> +	args.gpa = gpa;
> +	args.page_shift = page_shift;
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_vma_intersection(mm, addr, end);
> +	if (!vma || vma->vm_start > addr || vma->vm_end < end) {
> +		ret = H_PARAMETER;
> +		goto out;
> +	}
> +	ret = migrate_vma(&kvmppc_hmm_migrate_ops, vma, addr, end,
> +			  &src_pfn, &dst_pfn, &args);
> +	if (ret < 0)
> +		ret = H_PARAMETER;
> +out:
> +	up_read(&mm->mmap_sem);
> +	return ret;
> +}
> +
> +static void
> +kvmppc_hmm_fault_migrate_alloc_and_copy(struct vm_area_struct *vma,
> +					const unsigned long *src_pfn,
> +					unsigned long *dst_pfn,
> +					unsigned long start,
> +					unsigned long end,
> +					void *private)
> +{
> +	struct page *dpage, *spage;
> +	struct kvmppc_hmm_page_pvt *pvt;
> +	unsigned long pfn;
> +	int ret = U_SUCCESS;
> +
> +	*dst_pfn = MIGRATE_PFN_ERROR;
> +	spage = migrate_pfn_to_page(*src_pfn);
> +	if (!spage || !(*src_pfn & MIGRATE_PFN_MIGRATE))
> +		return;
> +	if (!is_zone_device_page(spage))
> +		return;
> +	dpage = hmm_vma_alloc_locked_page(vma, start);
> +	if (!dpage)
> +		return;
> +	pvt = (struct kvmppc_hmm_page_pvt *)
> +	       hmm_devmem_page_get_drvdata(spage);
> +
> +	pfn = page_to_pfn(dpage);
> +	ret = uv_page_out(pvt->lpid, pfn << PAGE_SHIFT,
> +			  pvt->gpa, 0, PAGE_SHIFT);
> +	if (ret == U_SUCCESS)
> +		*dst_pfn = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
> +}
> +
> +static void
> +kvmppc_hmm_fault_migrate_finalize_and_map(struct vm_area_struct *vma,
> +					  const unsigned long *src_pfns,
> +					  const unsigned long *dst_pfns,
> +					  unsigned long start,
> +					  unsigned long end,
> +					  void *private)
> +{
> +}
> +
> +static const struct migrate_vma_ops kvmppc_hmm_fault_migrate_ops = {
> +	.alloc_and_copy = kvmppc_hmm_fault_migrate_alloc_and_copy,
> +	.finalize_and_map = kvmppc_hmm_fault_migrate_finalize_and_map,
> +};
> +
> +/*
> + * Fault handler callback when HV touches any page that has been
> + * moved to secure memory, we ask UV to give back the page by
> + * issuing a UV_PAGE_OUT uvcall.
> + */
> +static int kvmppc_hmm_devmem_fault(struct hmm_devmem *devmem,
> +				   struct vm_area_struct *vma,
> +				   unsigned long addr,
> +				   const struct page *page,
> +				   unsigned int flags,
> +				   pmd_t *pmdp)
> +{
> +	unsigned long end = addr + PAGE_SIZE;
> +	unsigned long src_pfn, dst_pfn = 0;
> +
> +	if (migrate_vma(&kvmppc_hmm_fault_migrate_ops, vma, addr, end,
> +			&src_pfn, &dst_pfn, NULL))
> +		return VM_FAULT_SIGBUS;
> +	if (dst_pfn == MIGRATE_PFN_ERROR)
> +		return VM_FAULT_SIGBUS;
> +	return 0;
> +}
> +
> +static void kvmppc_hmm_devmem_free(struct hmm_devmem *devmem,
> +				   struct page *page)
> +{
> +	kvmppc_hmm_put_page(page);
> +}
> +
> +static const struct hmm_devmem_ops kvmppc_hmm_devmem_ops = {
> +	.free = kvmppc_hmm_devmem_free,
> +	.fault = kvmppc_hmm_devmem_fault,
> +};
> +
> +/*
> + * Move page from secure memory to normal memory.
> + */
> +unsigned long
> +kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
> +		      unsigned long flags, unsigned long page_shift)
> +{
> +	unsigned long addr, end;
> +	struct mm_struct *mm = get_task_mm(current);
> +	struct vm_area_struct *vma;
> +	unsigned long src_pfn, dst_pfn = 0;
> +	int ret = H_SUCCESS;
> +
> +	if (page_shift != PAGE_SHIFT)
> +		return H_P4;
> +
> +	addr = kvmppc_gpa_to_hva(kvm, gpa, page_shift);
> +	if (!addr)
> +		return H_P2;
> +	end = addr + (1UL << page_shift);
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_vma_intersection(mm, addr, end);
> +	if (!vma || vma->vm_start > addr || vma->vm_end < end) {
> +		ret = H_PARAMETER;
> +		goto out;
> +	}
> +	ret = migrate_vma(&kvmppc_hmm_fault_migrate_ops, vma, addr, end,
> +			  &src_pfn, &dst_pfn, NULL);
> +	if (ret < 0)
> +		ret = H_PARAMETER;
> +out:
> +	up_read(&mm->mmap_sem);
> +	return ret;
> +}
> +
> +/*
> + * TODO: Number of secure pages and the page size order would probably come
> + * via DT or via some uvcall. Return 8G for now.
> + */
> +static unsigned long kvmppc_get_secmem_size(void)
> +{
> +	return (1UL << 33);
> +}
> +
> +static int kvmppc_hmm_pages_init(void)
> +{
> +	unsigned long nr_pfns = kvmppc_hmm->devmem->pfn_last -
> +				kvmppc_hmm->devmem->pfn_first;
> +
> +	kvmppc_hmm->pfn_bitmap = kcalloc(BITS_TO_LONGS(nr_pfns),
> +					 sizeof(unsigned long), GFP_KERNEL);
> +	if (!kvmppc_hmm->pfn_bitmap)
> +		return -ENOMEM;
> +
> +	spin_lock_init(&kvmppc_hmm_lock);
> +
> +	return 0;
> +}
> +
> +int kvmppc_hmm_init(void)
> +{
> +	int ret = 0;
> +	unsigned long size = kvmppc_get_secmem_size();
> +
> +	kvmppc_hmm = kzalloc(sizeof(*kvmppc_hmm), GFP_KERNEL);
> +	if (!kvmppc_hmm) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}

If there is only ever one of these kvmppc_hmm_device structs, and it's
only 24 bytes, why do we bother allocating it dynamically?  Why not
just have the single instance as a static variable?

> +
> +	kvmppc_hmm->device = hmm_device_new(NULL);
> +	if (IS_ERR(kvmppc_hmm->device)) {
> +		ret = PTR_ERR(kvmppc_hmm->device);
> +		goto out_free;
> +	}
> +
> +	kvmppc_hmm->devmem = hmm_devmem_add(&kvmppc_hmm_devmem_ops,
> +					    &kvmppc_hmm->device->device, size);
> +	if (IS_ERR(kvmppc_hmm->devmem)) {
> +		ret = PTR_ERR(kvmppc_hmm->devmem);
> +		goto out_device;
> +	}
> +	ret = kvmppc_hmm_pages_init();
> +	if (ret < 0)
> +		goto out_devmem;
> +
> +	return ret;
> +
> +out_devmem:
> +	hmm_devmem_remove(kvmppc_hmm->devmem);
> +out_device:
> +	hmm_device_put(kvmppc_hmm->device);
> +out_free:
> +	kfree(kvmppc_hmm);
> +	kvmppc_hmm = NULL;
> +out:
> +	return ret;
> +}
> +
> +void kvmppc_hmm_free(void)
> +{
> +	kfree(kvmppc_hmm->pfn_bitmap);
> +	hmm_devmem_remove(kvmppc_hmm->devmem);
> +	hmm_device_put(kvmppc_hmm->device);
> +	kfree(kvmppc_hmm);
> +	kvmppc_hmm = NULL;
> +}
> -- 
> 2.17.1

Paul.
Ram Pai Oct. 30, 2018, 6:31 a.m. | #2
On Tue, Oct 30, 2018 at 04:03:00PM +1100, Paul Mackerras wrote:
> On Mon, Oct 22, 2018 at 10:48:34AM +0530, Bharata B Rao wrote:
> > HMM driver for KVM PPC to manage page transitions of
> > secure guest via H_SVM_PAGE_IN and H_SVM_PAGE_OUT hcalls.
> > 
> > H_SVM_PAGE_IN: Move the content of a normal page to secure page
> > H_SVM_PAGE_OUT: Move the content of a secure page to normal page
> 
> Comments below...
> 
> > Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> > ---
> >  /* pSeries hypervisor opcodes */
....
> >  #define H_REMOVE		0x04
> >  #define H_ENTER			0x08
> > @@ -295,7 +298,9 @@
> >  #define H_INT_ESB               0x3C8
> >  #define H_INT_SYNC              0x3CC
> >  #define H_INT_RESET             0x3D0
> > -#define MAX_HCALL_OPCODE	H_INT_RESET
> > +#define H_SVM_PAGE_IN		0x3D4
> > +#define H_SVM_PAGE_OUT		0x3D8
> > +#define MAX_HCALL_OPCODE	H_SVM_PAGE_OUT
> 
> We should define hcall numbers in the implementation-specific range.
> We can't use numbers in this range without first getting them
> standardized in PAPR.  Since these hcalls are not actually used by
> the guest but are just a private interface between KVM and the
> ultravisor, it's probably not worth putting them in PAPR.  We should
> pick a range somewhere in the 0xf000 - 0xfffc area and use that.

We are using that range for Ucalls.  For hcalls we were told to reserve
a range between 1024(0x400) to  2047(0x7FF). Have to reserve them in the
appropriate database.


RP
Paul Mackerras Oct. 30, 2018, 6:32 a.m. | #3
On Mon, Oct 29, 2018 at 11:31:55PM -0700, Ram Pai wrote:
> On Tue, Oct 30, 2018 at 04:03:00PM +1100, Paul Mackerras wrote:
> > On Mon, Oct 22, 2018 at 10:48:34AM +0530, Bharata B Rao wrote:
> > > HMM driver for KVM PPC to manage page transitions of
> > > secure guest via H_SVM_PAGE_IN and H_SVM_PAGE_OUT hcalls.
> > > 
> > > H_SVM_PAGE_IN: Move the content of a normal page to secure page
> > > H_SVM_PAGE_OUT: Move the content of a secure page to normal page
> > 
> > Comments below...
> > 
> > > Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> > > ---
> > >  /* pSeries hypervisor opcodes */
> ....
> > >  #define H_REMOVE		0x04
> > >  #define H_ENTER			0x08
> > > @@ -295,7 +298,9 @@
> > >  #define H_INT_ESB               0x3C8
> > >  #define H_INT_SYNC              0x3CC
> > >  #define H_INT_RESET             0x3D0
> > > -#define MAX_HCALL_OPCODE	H_INT_RESET
> > > +#define H_SVM_PAGE_IN		0x3D4
> > > +#define H_SVM_PAGE_OUT		0x3D8
> > > +#define MAX_HCALL_OPCODE	H_SVM_PAGE_OUT
> > 
> > We should define hcall numbers in the implementation-specific range.
> > We can't use numbers in this range without first getting them
> > standardized in PAPR.  Since these hcalls are not actually used by
> > the guest but are just a private interface between KVM and the
> > ultravisor, it's probably not worth putting them in PAPR.  We should
> > pick a range somewhere in the 0xf000 - 0xfffc area and use that.
> 
> We are using that range for Ucalls.  For hcalls we were told to reserve
> a range between 1024(0x400) to  2047(0x7FF). Have to reserve them in the
> appropriate database.

Who gave you that advice?

Paul.
Balbir Singh Nov. 1, 2018, 6:43 a.m. | #4
On Mon, Oct 22, 2018 at 10:48:34AM +0530, Bharata B Rao wrote:
> HMM driver for KVM PPC to manage page transitions of
> secure guest via H_SVM_PAGE_IN and H_SVM_PAGE_OUT hcalls.
> 
> H_SVM_PAGE_IN: Move the content of a normal page to secure page
> H_SVM_PAGE_OUT: Move the content of a secure page to normal page
> 
> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/hvcall.h    |   7 +-
>  arch/powerpc/include/asm/kvm_host.h  |  15 +
>  arch/powerpc/include/asm/kvm_ppc.h   |  28 ++
>  arch/powerpc/include/asm/ucall-api.h |  20 ++
>  arch/powerpc/kvm/Makefile            |   3 +
>  arch/powerpc/kvm/book3s_hv.c         |  38 ++
>  arch/powerpc/kvm/book3s_hv_hmm.c     | 514 +++++++++++++++++++++++++++
>  7 files changed, 624 insertions(+), 1 deletion(-)
>  create mode 100644 arch/powerpc/include/asm/ucall-api.h
>  create mode 100644 arch/powerpc/kvm/book3s_hv_hmm.c
> 
> diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
> index a0b17f9f1ea4..89e6b70c1857 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -158,6 +158,9 @@
>  /* Each control block has to be on a 4K boundary */
>  #define H_CB_ALIGNMENT          4096
>  
> +/* Flags for H_SVM_PAGE_IN */
> +#define H_PAGE_IN_SHARED	0x1
> +
>  /* pSeries hypervisor opcodes */
>  #define H_REMOVE		0x04
>  #define H_ENTER			0x08
> @@ -295,7 +298,9 @@
>  #define H_INT_ESB               0x3C8
>  #define H_INT_SYNC              0x3CC
>  #define H_INT_RESET             0x3D0
> -#define MAX_HCALL_OPCODE	H_INT_RESET
> +#define H_SVM_PAGE_IN		0x3D4
> +#define H_SVM_PAGE_OUT		0x3D8
> +#define MAX_HCALL_OPCODE	H_SVM_PAGE_OUT
>  
>  /* H_VIOCTL functions */
>  #define H_GET_VIOA_DUMP_SIZE	0x01
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 906bcbdfd2a1..194e6e0ff239 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -310,6 +310,9 @@ struct kvm_arch {
>  	struct kvmppc_passthru_irqmap *pimap;
>  #endif
>  	struct kvmppc_ops *kvm_ops;
> +#ifdef CONFIG_PPC_SVM
> +	struct hlist_head *hmm_hash;
> +#endif
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  	/* This array can grow quite large, keep it at the end */
>  	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
> @@ -830,4 +833,16 @@ static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
>  static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
>  static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
>  
> +#ifdef CONFIG_PPC_SVM
> +struct kvmppc_hmm_device {
> +	struct hmm_device *device;
> +	struct hmm_devmem *devmem;
> +	unsigned long *pfn_bitmap;
> +};
> +
> +extern int kvmppc_hmm_init(void);
> +extern void kvmppc_hmm_free(void);
> +extern int kvmppc_hmm_hash_create(struct kvm *kvm);
> +extern void kvmppc_hmm_hash_destroy(struct kvm *kvm);
> +#endif
>  #endif /* __POWERPC_KVM_HOST_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index e991821dd7fa..ba81a07e2bdf 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -906,4 +906,32 @@ static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
>  
>  extern void xics_wake_cpu(int cpu);
>  
> +#ifdef CONFIG_PPC_SVM
> +extern unsigned long kvmppc_h_svm_page_in(struct kvm *kvm,
> +					  unsigned int lpid,
> +					  unsigned long gra,
> +					  unsigned long flags,
> +					  unsigned long page_shift);
> +extern unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
> +					  unsigned int lpid,
> +					  unsigned long gra,
> +					  unsigned long flags,
> +					  unsigned long page_shift);
> +#else
> +static inline unsigned long
> +kvmppc_h_svm_page_in(struct kvm *kvm, unsigned int lpid,
> +		     unsigned long gra, unsigned long flags,
> +		     unsigned long page_shift)
> +{
> +	return H_UNSUPPORTED;
> +}
> +
> +static inline unsigned long
> +kvmppc_h_svm_page_out(struct kvm *kvm, unsigned int lpid,
> +		      unsigned long gra, unsigned long flags,
> +		      unsigned long page_shift)
> +{
> +	return H_UNSUPPORTED;
> +}
> +#endif
>  #endif /* __POWERPC_KVM_PPC_H__ */
> diff --git a/arch/powerpc/include/asm/ucall-api.h b/arch/powerpc/include/asm/ucall-api.h
> new file mode 100644
> index 000000000000..2c12f514f8ab
> --- /dev/null
> +++ b/arch/powerpc/include/asm/ucall-api.h
> @@ -0,0 +1,20 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_POWERPC_UCALL_API_H
> +#define _ASM_POWERPC_UCALL_API_H
> +
> +#define U_SUCCESS 0
> +
> +/*
> + * TODO: Dummy uvcalls, will be replaced by real calls
> + */
> +static inline int uv_page_in(u64 lpid, u64 dw0, u64 dw1, u64 dw2, u64 dw3)
> +{
> +	return U_SUCCESS;
> +}
> +
> +static inline int uv_page_out(u64 lpid, u64 dw0, u64 dw1, u64 dw2, u64 dw3)
> +{
> +	return U_SUCCESS;
> +}
> +
> +#endif	/* _ASM_POWERPC_UCALL_API_H */
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index f872c04bb5b1..6945ffc18679 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -77,6 +77,9 @@ kvm-hv-y += \
>  	book3s_64_mmu_hv.o \
>  	book3s_64_mmu_radix.o
>  
> +kvm-hv-$(CONFIG_PPC_SVM) += \
> +	book3s_hv_hmm.o
> +
>  kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
>  	book3s_hv_tm.o
>  
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 3e3a71594e63..05084eb8aadd 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -73,6 +73,7 @@
>  #include <asm/opal.h>
>  #include <asm/xics.h>
>  #include <asm/xive.h>
> +#include <asm/kvm_host.h>
>  
>  #include "book3s.h"
>  
> @@ -935,6 +936,20 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>  		if (ret == H_TOO_HARD)
>  			return RESUME_HOST;
>  		break;
> +	case H_SVM_PAGE_IN:
> +		ret = kvmppc_h_svm_page_in(vcpu->kvm,
> +					   kvmppc_get_gpr(vcpu, 4),
> +					   kvmppc_get_gpr(vcpu, 5),
> +					   kvmppc_get_gpr(vcpu, 6),
> +					   kvmppc_get_gpr(vcpu, 7));
> +		break;
> +	case H_SVM_PAGE_OUT:
> +		ret = kvmppc_h_svm_page_out(vcpu->kvm,
> +					    kvmppc_get_gpr(vcpu, 4),
> +					    kvmppc_get_gpr(vcpu, 5),
> +					    kvmppc_get_gpr(vcpu, 6),
> +					    kvmppc_get_gpr(vcpu, 7));

Won't just touching the page cause page_out? I wonder if you could
avoid code duplication by touch addr. So under mmap_sem if you
did get_user_pages() and put_page(), it might simplify all the
additional code you've got.

> +		break;
>  	default:
>  		return RESUME_HOST;
>  	}
> @@ -961,6 +976,8 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
>  	case H_IPOLL:
>  	case H_XIRR_X:
>  #endif
> +	case H_SVM_PAGE_IN:
> +	case H_SVM_PAGE_OUT:
>  		return 1;
>  	}
>  
> @@ -3938,6 +3955,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
>  		return -ENOMEM;
>  	kvm->arch.lpid = lpid;
>  
> +#ifdef CONFIG_PPC_SVM
> +	ret = kvmppc_hmm_hash_create(kvm);
> +	if (ret) {
> +		kvmppc_free_lpid(kvm->arch.lpid);
> +		return ret;
> +	}
> +#endif
>  	kvmppc_alloc_host_rm_ops();
>  
>  	/*
> @@ -4073,6 +4097,9 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
>  
>  	kvmppc_free_vcores(kvm);
>  
> +#ifdef CONFIG_PPC_SVM
> +	kvmppc_hmm_hash_destroy(kvm);
> +#endif
>  	kvmppc_free_lpid(kvm->arch.lpid);
>  
>  	if (kvm_is_radix(kvm))
> @@ -4384,6 +4411,8 @@ static unsigned int default_hcall_list[] = {
>  	H_XIRR,
>  	H_XIRR_X,
>  #endif
> +	H_SVM_PAGE_IN,
> +	H_SVM_PAGE_OUT,
>  	0
>  };
>  
> @@ -4596,11 +4625,20 @@ static int kvmppc_book3s_init_hv(void)
>  			no_mixing_hpt_and_radix = true;
>  	}
>  
> +#ifdef CONFIG_PPC_SVM
> +	r = kvmppc_hmm_init();
> +	if (r < 0)
> +		pr_err("KVM-HV: kvmppc_hmm_init failed %d\n", r);
> +#endif
> +
>  	return r;
>  }
>  
>  static void kvmppc_book3s_exit_hv(void)
>  {
> +#ifdef CONFIG_PPC_SVM
> +	kvmppc_hmm_free();
> +#endif
>  	kvmppc_free_host_rm_ops();
>  	if (kvmppc_radix_possible())
>  		kvmppc_radix_exit();
> diff --git a/arch/powerpc/kvm/book3s_hv_hmm.c b/arch/powerpc/kvm/book3s_hv_hmm.c
> new file mode 100644
> index 000000000000..a2ee3163a312
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_hmm.c
> @@ -0,0 +1,514 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * HMM driver to manage page migration between normal and secure
> + * memory.
> + *
> + * Based on Jérôme Glisse's HMM dummy driver.
> + *
> + * Copyright 2018 Bharata B Rao, IBM Corp. <bharata@linux.ibm.com>
> + */
> +
> +/*
> + * A pseries guest can be run as a secure guest on Ultravisor-enabled
> + * POWER platforms. On such platforms, this driver will be used to manage
> + * the movement of guest pages between the normal memory managed by
> + * hypervisor (HV) and secure memory managed by Ultravisor (UV).
> + *
> + * Private ZONE_DEVICE memory equal to the amount of secure memory
> + * available in the platform for running secure guests is created
> + * via a HMM device. The movement of pages between normal and secure
> + * memory is done by ->alloc_and_copy() callback routine of migrate_vma().
> + *
> + * The page-in or page-out requests from UV will come to HV as hcalls and
> + * HV will call back into UV via uvcalls to satisfy these page requests.
> + *
> + * For each page that gets moved into secure memory, a HMM PFN is used
> + * on the HV side and HMM migration PTE corresponding to that PFN would be
> + * populated in the QEMU page tables. A per-guest hash table is created to
> + * manage the pool of HMM PFNs. Guest real address is used as key to index
> + * into the hash table and choose a free HMM PFN.
> + */
> +
> +#include <linux/hmm.h>
> +#include <linux/kvm_host.h>
> +#include <linux/sched/mm.h>
> +#include <asm/ucall-api.h>
> +
> +static struct kvmppc_hmm_device *kvmppc_hmm;
> +spinlock_t kvmppc_hmm_lock;
> +
> +#define KVMPPC_HMM_HASH_BITS    10
> +#define KVMPPC_HMM_HASH_SIZE   (1 << KVMPPC_HMM_HASH_BITS)
> +
> +struct kvmppc_hmm_pfn_entry {
> +	struct hlist_node hlist;
> +	unsigned long addr;
> +	unsigned long hmm_pfn;
> +};
> +
> +struct kvmppc_hmm_page_pvt {
> +	struct hlist_head *hmm_hash;
> +	unsigned int lpid;
> +	unsigned long gpa;
> +};
> +
> +struct kvmppc_hmm_migrate_args {
> +	struct hlist_head *hmm_hash;
> +	unsigned int lpid;
> +	unsigned long gpa;
> +	unsigned long page_shift;
> +};
> +
> +int kvmppc_hmm_hash_create(struct kvm *kvm)
> +{
> +	int i;
> +
> +	kvm->arch.hmm_hash = kzalloc(KVMPPC_HMM_HASH_SIZE *
> +				     sizeof(struct hlist_head), GFP_KERNEL);
> +	if (!kvm->arch.hmm_hash)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < KVMPPC_HMM_HASH_SIZE; i++)
> +		INIT_HLIST_HEAD(&kvm->arch.hmm_hash[i]);
> +	return 0;
> +}
> +
> +/*
> + * Cleanup the HMM pages hash table when guest terminates
> + *
> + * Iterate over all the HMM pages hash list entries and release
> + * reference on them. The actual freeing of the entry happens
> + * via hmm_devmem_ops.free path.
> + */
> +void kvmppc_hmm_hash_destroy(struct kvm *kvm)
> +{
> +	int i;
> +	struct kvmppc_hmm_pfn_entry *p;
> +	struct page *hmm_page;
> +
> +	for (i = 0; i < KVMPPC_HMM_HASH_SIZE; i++) {
> +		while (!hlist_empty(&kvm->arch.hmm_hash[i])) {
> +			p = hlist_entry(kvm->arch.hmm_hash[i].first,
> +					struct kvmppc_hmm_pfn_entry,
> +					hlist);
> +			hmm_page = pfn_to_page(p->hmm_pfn);
> +			put_page(hmm_page);
> +		}
> +	}
> +	kfree(kvm->arch.hmm_hash);
> +}
> +
> +static u64 kvmppc_hmm_pfn_hash_fn(u64 addr)
> +{
> +	return hash_64(addr, KVMPPC_HMM_HASH_BITS);
> +}
> +
> +static void
> +kvmppc_hmm_hash_free_pfn(struct hlist_head *hmm_hash, unsigned long gpa)
> +{
> +	struct kvmppc_hmm_pfn_entry *p;
> +	struct hlist_head *list;
> +
> +	list = &hmm_hash[kvmppc_hmm_pfn_hash_fn(gpa)];
> +	hlist_for_each_entry(p, list, hlist) {
> +		if (p->addr == gpa) {
> +			hlist_del(&p->hlist);
> +			kfree(p);
> +			return;
> +		}
> +	}
> +}
> +
> +/*
> + * Get a free HMM PFN from the pool
> + *
> + * Called when a normal page is moved to secure memory (UV_PAGE_IN). HMM
> + * PFN will be used to keep track of the secure page on HV side.
> + */
> +static struct page *kvmppc_hmm_get_page(struct hlist_head *hmm_hash,
> +					unsigned long gpa, unsigned int lpid)
> +{
> +	struct page *dpage = NULL;
> +	unsigned long bit;
> +	unsigned long nr_pfns = kvmppc_hmm->devmem->pfn_last -
> +				kvmppc_hmm->devmem->pfn_first;
> +	struct hlist_head *list;
> +	struct kvmppc_hmm_pfn_entry *p;
> +	bool found = false;
> +	unsigned long flags;
> +	struct kvmppc_hmm_page_pvt *pvt;
> +
> +	spin_lock_irqsave(&kvmppc_hmm_lock, flags);
> +	list = &hmm_hash[kvmppc_hmm_pfn_hash_fn(gpa)];
> +	hlist_for_each_entry(p, list, hlist) {
> +		if (p->addr == gpa) {
> +			found = true;
> +			break;
> +		}
> +	}
> +	if (!found) {
> +		p = kzalloc(sizeof(struct kvmppc_hmm_pfn_entry), GFP_ATOMIC);
> +		if (!p) {
> +			spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> +			return NULL;
> +		}
> +		p->addr = gpa;
> +		bit = find_first_zero_bit(kvmppc_hmm->pfn_bitmap, nr_pfns);
> +		if (bit >= nr_pfns) {
> +			kfree(p);

Why not consider resizing the bitmap, from the code below, it looks
like this should never happen - BUG_ON()?

> +			spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> +			return NULL;
> +		}
> +		bitmap_set(kvmppc_hmm->pfn_bitmap, bit, 1);
> +		p->hmm_pfn = bit + kvmppc_hmm->devmem->pfn_first;
> +		INIT_HLIST_NODE(&p->hlist);
> +		hlist_add_head(&p->hlist, list);
> +	} else {
> +		spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> +		return NULL;

For found=true, why do we return NULL?

> +	}
> +	dpage = pfn_to_page(p->hmm_pfn);
> +
> +	if (!trylock_page(dpage)) {
> +		bitmap_clear(kvmppc_hmm->pfn_bitmap,
> +			     p->hmm_pfn - kvmppc_hmm->devmem->pfn_first, 1);
> +		hlist_del(&p->hlist);
> +		kfree(p);
> +		spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> +		return NULL;

What race are you avoiding here?

> +	}
> +	spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> +
> +	pvt = kzalloc(sizeof(*pvt), GFP_ATOMIC);
> +	pvt->hmm_hash = hmm_hash;
> +	pvt->gpa = gpa;
> +	pvt->lpid = lpid;
> +	hmm_devmem_page_set_drvdata(dpage, (unsigned long)pvt);

Do we care about when this store is seen?

> +
> +	get_page(dpage);
> +	return dpage;
> +}
> +
> +/*
> + * Release the HMM PFN back to the pool
> + *
> + * Called when secure page becomes a normal page during UV_PAGE_OUT.
> + */
> +static void kvmppc_hmm_put_page(struct page *page)
> +{
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned long flags;
> +	struct kvmppc_hmm_page_pvt *pvt;
> +
> +	pvt = (struct kvmppc_hmm_page_pvt *)hmm_devmem_page_get_drvdata(page);
> +	hmm_devmem_page_set_drvdata(page, 0);
> +
> +	spin_lock_irqsave(&kvmppc_hmm_lock, flags);
> +	bitmap_clear(kvmppc_hmm->pfn_bitmap,
> +		     pfn - kvmppc_hmm->devmem->pfn_first, 1);
> +	kvmppc_hmm_hash_free_pfn(pvt->hmm_hash, pvt->gpa);

Don't we need a put_page here?

> +	spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> +	kfree(pvt);
> +}
> +
> +static void
> +kvmppc_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma,
> +				  const unsigned long *src_pfns,
> +				  unsigned long *dst_pfns,
> +				  unsigned long start,
> +				  unsigned long end,
> +				  void *private)
> +{
> +	unsigned long addr;
> +	struct kvmppc_hmm_migrate_args *args = private;
> +	unsigned long page_size = 1UL << args->page_shift;
> +
> +	for (addr = start; addr < end;
> +		addr += page_size, src_pfns++, dst_pfns++) {
> +		struct page *spage = migrate_pfn_to_page(*src_pfns);
> +		struct page *dpage;
> +		unsigned long pfn = *src_pfns >> MIGRATE_PFN_SHIFT;
> +
> +		*dst_pfns = 0;
> +		if (!spage && !(*src_pfns & MIGRATE_PFN_MIGRATE))
> +			continue;
> +
> +		if (spage && !(*src_pfns & MIGRATE_PFN_MIGRATE))
> +			continue;
> +

Isn't this the same as !(*src_pfns & MIGRATE_PFN_MIGRATE) continue?
That probably indicates the page is backed by a file. Do such pages
ever get migrated?

> +		dpage = kvmppc_hmm_get_page(args->hmm_hash, args->gpa,
> +					    args->lpid);
> +		if (!dpage)
> +			continue;

Aaah.. I see if the page is already in secure memory - kvmppc_hmm_get_page()
returns NULL and no migration is done! dpage is the struct page for the
secure memory.

> +
> +		if (spage)
> +			uv_page_in(args->lpid, pfn << args->page_shift,
> +				   args->gpa, 0, args->page_shift);
> +
> +		*dst_pfns = migrate_pfn(page_to_pfn(dpage)) |
> +			    MIGRATE_PFN_DEVICE | MIGRATE_PFN_LOCKED;
> +	}
> +}
> +
> +static void
> +kvmppc_hmm_migrate_finalize_and_map(struct vm_area_struct *vma,
> +				    const unsigned long *src_pfns,
> +				    const unsigned long *dst_pfns,
> +				    unsigned long start,
> +				    unsigned long end,
> +				    void *private)
> +{
> +}
> +
> +static const struct migrate_vma_ops kvmppc_hmm_migrate_ops = {
> +	.alloc_and_copy = kvmppc_hmm_migrate_alloc_and_copy,
> +	.finalize_and_map = kvmppc_hmm_migrate_finalize_and_map,
> +};
> +
> +static unsigned long kvmppc_gpa_to_hva(struct kvm *kvm, unsigned long gpa,
> +				       unsigned long page_shift)
> +{
> +	unsigned long gfn, hva;
> +	struct kvm_memory_slot *memslot;
> +
> +	gfn = gpa >> page_shift;
> +	memslot = gfn_to_memslot(kvm, gfn);
> +	hva = gfn_to_hva_memslot(memslot, gfn);
> +
> +	return hva;
> +}

This doesn't sound like the rightplace to have this function

> +
> +/*
> + * Move page from normal memory to secure memory.
> + */
> +unsigned long
> +kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
> +		     unsigned long flags, unsigned long page_shift)
> +{
> +	unsigned long addr, end;
> +	unsigned long src_pfn, dst_pfn;
> +	struct kvmppc_hmm_migrate_args args;
> +	struct mm_struct *mm = get_task_mm(current);
> +	struct vm_area_struct *vma;
> +	int ret = H_SUCCESS;
> +
> +	if (page_shift != PAGE_SHIFT)
> +		return H_P3;

So no large page support?

> +
> +	addr = kvmppc_gpa_to_hva(kvm, gpa, page_shift);
> +	if (!addr)
> +		return H_PARAMETER;

So addr == 0 is invalid?

> +	end = addr + (1UL << page_shift);
> +
> +	if (flags)
> +		return H_P2;
> +
> +	args.hmm_hash = kvm->arch.hmm_hash;
> +	args.lpid = kvm->arch.lpid;
> +	args.gpa = gpa;
> +	args.page_shift = page_shift;
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_vma_intersection(mm, addr, end);
> +	if (!vma || vma->vm_start > addr || vma->vm_end < end) {
> +		ret = H_PARAMETER;
> +		goto out;
> +	}
> +	ret = migrate_vma(&kvmppc_hmm_migrate_ops, vma, addr, end,
> +			  &src_pfn, &dst_pfn, &args);
> +	if (ret < 0)
> +		ret = H_PARAMETER;
> +out:
> +	up_read(&mm->mmap_sem);
> +	return ret;
> +}
> +
> +static void
> +kvmppc_hmm_fault_migrate_alloc_and_copy(struct vm_area_struct *vma,
> +					const unsigned long *src_pfn,
> +					unsigned long *dst_pfn,
> +					unsigned long start,
> +					unsigned long end,
> +					void *private)
> +{
> +	struct page *dpage, *spage;
> +	struct kvmppc_hmm_page_pvt *pvt;
> +	unsigned long pfn;
> +	int ret = U_SUCCESS;
> +
> +	*dst_pfn = MIGRATE_PFN_ERROR;
> +	spage = migrate_pfn_to_page(*src_pfn);
> +	if (!spage || !(*src_pfn & MIGRATE_PFN_MIGRATE))
> +		return;
> +	if (!is_zone_device_page(spage))
> +		return;
> +	dpage = hmm_vma_alloc_locked_page(vma, start);
> +	if (!dpage)

We've probably already OOM'd here :)

> +		return;
> +	pvt = (struct kvmppc_hmm_page_pvt *)
> +	       hmm_devmem_page_get_drvdata(spage);
> +
> +	pfn = page_to_pfn(dpage);
> +	ret = uv_page_out(pvt->lpid, pfn << PAGE_SHIFT,

Do we need pfn << PAGE_SHIFT?

> +			  pvt->gpa, 0, PAGE_SHIFT);
> +	if (ret == U_SUCCESS)
> +		*dst_pfn = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
> +}
> +
> +static void
> +kvmppc_hmm_fault_migrate_finalize_and_map(struct vm_area_struct *vma,
> +					  const unsigned long *src_pfns,
> +					  const unsigned long *dst_pfns,
> +					  unsigned long start,
> +					  unsigned long end,
> +					  void *private)
> +{
> +}
> +
> +static const struct migrate_vma_ops kvmppc_hmm_fault_migrate_ops = {
> +	.alloc_and_copy = kvmppc_hmm_fault_migrate_alloc_and_copy,
> +	.finalize_and_map = kvmppc_hmm_fault_migrate_finalize_and_map,
> +};
> +
> +/*
> + * Fault handler callback when HV touches any page that has been
> + * moved to secure memory, we ask UV to give back the page by
> + * issuing a UV_PAGE_OUT uvcall.
> + */
> +static int kvmppc_hmm_devmem_fault(struct hmm_devmem *devmem,
> +				   struct vm_area_struct *vma,
> +				   unsigned long addr,
> +				   const struct page *page,
> +				   unsigned int flags,
> +				   pmd_t *pmdp)
> +{
> +	unsigned long end = addr + PAGE_SIZE;
> +	unsigned long src_pfn, dst_pfn = 0;
> +
> +	if (migrate_vma(&kvmppc_hmm_fault_migrate_ops, vma, addr, end,
> +			&src_pfn, &dst_pfn, NULL))
> +		return VM_FAULT_SIGBUS;
> +	if (dst_pfn == MIGRATE_PFN_ERROR)
> +		return VM_FAULT_SIGBUS;
> +	return 0;
> +}
> +
> +static void kvmppc_hmm_devmem_free(struct hmm_devmem *devmem,
> +				   struct page *page)
> +{
> +	kvmppc_hmm_put_page(page);
> +}
> +
> +static const struct hmm_devmem_ops kvmppc_hmm_devmem_ops = {
> +	.free = kvmppc_hmm_devmem_free,
> +	.fault = kvmppc_hmm_devmem_fault,
> +};
> +
> +/*
> + * Move page from secure memory to normal memory.
> + */
> +unsigned long
> +kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
> +		      unsigned long flags, unsigned long page_shift)
> +{
> +	unsigned long addr, end;
> +	struct mm_struct *mm = get_task_mm(current);
> +	struct vm_area_struct *vma;
> +	unsigned long src_pfn, dst_pfn = 0;
> +	int ret = H_SUCCESS;
> +
> +	if (page_shift != PAGE_SHIFT)
> +		return H_P4;
> +
> +	addr = kvmppc_gpa_to_hva(kvm, gpa, page_shift);
> +	if (!addr)
> +		return H_P2;
> +	end = addr + (1UL << page_shift);
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_vma_intersection(mm, addr, end);
> +	if (!vma || vma->vm_start > addr || vma->vm_end < end) {
> +		ret = H_PARAMETER;
> +		goto out;
> +	}
> +	ret = migrate_vma(&kvmppc_hmm_fault_migrate_ops, vma, addr, end,
> +			  &src_pfn, &dst_pfn, NULL);
> +	if (ret < 0)
> +		ret = H_PARAMETER;
> +out:
> +	up_read(&mm->mmap_sem);
> +	return ret;
> +}
> +
> +/*
> + * TODO: Number of secure pages and the page size order would probably come
> + * via DT or via some uvcall. Return 8G for now.
> + */
> +static unsigned long kvmppc_get_secmem_size(void)
> +{
> +	return (1UL << 33);
> +}
> +
> +static int kvmppc_hmm_pages_init(void)
> +{
> +	unsigned long nr_pfns = kvmppc_hmm->devmem->pfn_last -
> +				kvmppc_hmm->devmem->pfn_first;
> +
> +	kvmppc_hmm->pfn_bitmap = kcalloc(BITS_TO_LONGS(nr_pfns),
> +					 sizeof(unsigned long), GFP_KERNEL);
> +	if (!kvmppc_hmm->pfn_bitmap)
> +		return -ENOMEM;
> +
> +	spin_lock_init(&kvmppc_hmm_lock);
> +
> +	return 0;
> +}
> +
> +int kvmppc_hmm_init(void)
> +{
> +	int ret = 0;
> +	unsigned long size = kvmppc_get_secmem_size();

Can you split secmem to secure_mem?

> +
> +	kvmppc_hmm = kzalloc(sizeof(*kvmppc_hmm), GFP_KERNEL);
> +	if (!kvmppc_hmm) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	kvmppc_hmm->device = hmm_device_new(NULL);
> +	if (IS_ERR(kvmppc_hmm->device)) {
> +		ret = PTR_ERR(kvmppc_hmm->device);
> +		goto out_free;
> +	}
> +
> +	kvmppc_hmm->devmem = hmm_devmem_add(&kvmppc_hmm_devmem_ops,
> +					    &kvmppc_hmm->device->device, size);

IIUC, there is just one HMM device for all the secure memory in the
system?

> +	if (IS_ERR(kvmppc_hmm->devmem)) {
> +		ret = PTR_ERR(kvmppc_hmm->devmem);
> +		goto out_device;
> +	}
> +	ret = kvmppc_hmm_pages_init();
> +	if (ret < 0)
> +		goto out_devmem;
> +
> +	return ret;
> +
> +out_devmem:
> +	hmm_devmem_remove(kvmppc_hmm->devmem);
> +out_device:
> +	hmm_device_put(kvmppc_hmm->device);
> +out_free:
> +	kfree(kvmppc_hmm);
> +	kvmppc_hmm = NULL;
> +out:
> +	return ret;
> +}
> +
> +void kvmppc_hmm_free(void)
> +{
> +	kfree(kvmppc_hmm->pfn_bitmap);
> +	hmm_devmem_remove(kvmppc_hmm->devmem);
> +	hmm_device_put(kvmppc_hmm->device);
> +	kfree(kvmppc_hmm);
> +	kvmppc_hmm = NULL;
> +}

Balbir Singh.
Bharata B Rao Nov. 12, 2018, 9:28 a.m. | #5
On Tue, Oct 30, 2018 at 04:03:00PM +1100, Paul Mackerras wrote:
> On Mon, Oct 22, 2018 at 10:48:34AM +0530, Bharata B Rao wrote:
> > HMM driver for KVM PPC to manage page transitions of
> > secure guest via H_SVM_PAGE_IN and H_SVM_PAGE_OUT hcalls.
> > 
> > H_SVM_PAGE_IN: Move the content of a normal page to secure page
> > H_SVM_PAGE_OUT: Move the content of a secure page to normal page
> 
> Comments below...

Thanks for your comments, I have addressed all of your comments in the
next version I am about to post out soon. I will reply only to the
questions here:

> > diff --git a/arch/powerpc/kvm/book3s_hv_hmm.c b/arch/powerpc/kvm/book3s_hv_hmm.c
> > new file mode 100644
> > index 000000000000..a2ee3163a312
> > --- /dev/null
> > +++ b/arch/powerpc/kvm/book3s_hv_hmm.c
> > @@ -0,0 +1,514 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * HMM driver to manage page migration between normal and secure
> > + * memory.
> > + *
> > + * Based on Jérôme Glisse's HMM dummy driver.
> > + *
> > + * Copyright 2018 Bharata B Rao, IBM Corp. <bharata@linux.ibm.com>
> > + */
> > +
> > +/*
> > + * A pseries guest can be run as a secure guest on Ultravisor-enabled
> > + * POWER platforms. On such platforms, this driver will be used to manage
> > + * the movement of guest pages between the normal memory managed by
> > + * hypervisor (HV) and secure memory managed by Ultravisor (UV).
> > + *
> > + * Private ZONE_DEVICE memory equal to the amount of secure memory
> > + * available in the platform for running secure guests is created
> > + * via a HMM device. The movement of pages between normal and secure
> > + * memory is done by ->alloc_and_copy() callback routine of migrate_vma().
> > + *
> > + * The page-in or page-out requests from UV will come to HV as hcalls and
> > + * HV will call back into UV via uvcalls to satisfy these page requests.
> > + *
> > + * For each page that gets moved into secure memory, a HMM PFN is used
> > + * on the HV side and HMM migration PTE corresponding to that PFN would be
> > + * populated in the QEMU page tables. A per-guest hash table is created to
> > + * manage the pool of HMM PFNs. Guest real address is used as key to index
> > + * into the hash table and choose a free HMM PFN.
> > + */
> > +
> > +#include <linux/hmm.h>
> > +#include <linux/kvm_host.h>
> > +#include <linux/sched/mm.h>
> > +#include <asm/ucall-api.h>
> > +
> > +static struct kvmppc_hmm_device *kvmppc_hmm;
> > +spinlock_t kvmppc_hmm_lock;
> > +
> > +#define KVMPPC_HMM_HASH_BITS    10
> > +#define KVMPPC_HMM_HASH_SIZE   (1 << KVMPPC_HMM_HASH_BITS)
> > +
> > +struct kvmppc_hmm_pfn_entry {
> > +	struct hlist_node hlist;
> > +	unsigned long addr;
> > +	unsigned long hmm_pfn;
> > +};
> > +
> > +struct kvmppc_hmm_page_pvt {
> > +	struct hlist_head *hmm_hash;
> > +	unsigned int lpid;
> > +	unsigned long gpa;
> > +};
> > +
> > +struct kvmppc_hmm_migrate_args {
> > +	struct hlist_head *hmm_hash;
> > +	unsigned int lpid;
> > +	unsigned long gpa;
> > +	unsigned long page_shift;
> > +};
> > +
> > +int kvmppc_hmm_hash_create(struct kvm *kvm)
> > +{
> > +	int i;
> > +
> > +	kvm->arch.hmm_hash = kzalloc(KVMPPC_HMM_HASH_SIZE *
> > +				     sizeof(struct hlist_head), GFP_KERNEL);
> > +	if (!kvm->arch.hmm_hash)
> > +		return -ENOMEM;
> > +
> > +	for (i = 0; i < KVMPPC_HMM_HASH_SIZE; i++)
> > +		INIT_HLIST_HEAD(&kvm->arch.hmm_hash[i]);
> > +	return 0;
> > +}
> 
> I gather that this hash table maps a guest real address (or pfn) to a
> hmm_pfn, is that right?  I think that mapping could be stored in the
> rmap arrays attached to the memslots, since there won't be any
> hypervisor-side mappings of the page while it is over in the secure
> space.  But maybe that could be a future optimization.

Hash table exists for two reasons:

- When guest/UV requests for a shared page-in, we need to quickly lookup
  if HV has ever given that page to UV as secure page and hence maintains
  only HMM PFN for it. We can walk the linux page table to figure this
  out and since only a small fraction of guest's pages are shared, the
  cost of lookup shouldn't cause much overhead.
- When the guest is destroyed, we need release all HMM pages that HV
  is using to keep track of secure pages. So it becomes easier if there
  is one centralized place (hash table in this case) where we can release
  all pages. I think using rmap arrays as you suggest should cover this
  case.

> 
> > +/*
> > + * Cleanup the HMM pages hash table when guest terminates
> > + *
> > + * Iterate over all the HMM pages hash list entries and release
> > + * reference on them. The actual freeing of the entry happens
> > + * via hmm_devmem_ops.free path.
> > + */
> > +void kvmppc_hmm_hash_destroy(struct kvm *kvm)
> > +{
> > +	int i;
> > +	struct kvmppc_hmm_pfn_entry *p;
> > +	struct page *hmm_page;
> > +
> > +	for (i = 0; i < KVMPPC_HMM_HASH_SIZE; i++) {
> > +		while (!hlist_empty(&kvm->arch.hmm_hash[i])) {
> > +			p = hlist_entry(kvm->arch.hmm_hash[i].first,
> > +					struct kvmppc_hmm_pfn_entry,
> > +					hlist);
> > +			hmm_page = pfn_to_page(p->hmm_pfn);
> > +			put_page(hmm_page);
> > +		}
> > +	}
> > +	kfree(kvm->arch.hmm_hash);
> > +}
> > +
> > +static u64 kvmppc_hmm_pfn_hash_fn(u64 addr)
> > +{
> > +	return hash_64(addr, KVMPPC_HMM_HASH_BITS);
> > +}
> > +
> > +static void
> > +kvmppc_hmm_hash_free_pfn(struct hlist_head *hmm_hash, unsigned long gpa)
> > +{
> > +	struct kvmppc_hmm_pfn_entry *p;
> > +	struct hlist_head *list;
> > +
> > +	list = &hmm_hash[kvmppc_hmm_pfn_hash_fn(gpa)];
> > +	hlist_for_each_entry(p, list, hlist) {
> > +		if (p->addr == gpa) {
> > +			hlist_del(&p->hlist);
> > +			kfree(p);
> > +			return;
> > +		}
> > +	}
> > +}
> > +
> > +/*
> > + * Get a free HMM PFN from the pool
> > + *
> > + * Called when a normal page is moved to secure memory (UV_PAGE_IN). HMM
> > + * PFN will be used to keep track of the secure page on HV side.
> > + */
> > +static struct page *kvmppc_hmm_get_page(struct hlist_head *hmm_hash,
> > +					unsigned long gpa, unsigned int lpid)
> > +{
> > +	struct page *dpage = NULL;
> > +	unsigned long bit;
> > +	unsigned long nr_pfns = kvmppc_hmm->devmem->pfn_last -
> > +				kvmppc_hmm->devmem->pfn_first;
> > +	struct hlist_head *list;
> > +	struct kvmppc_hmm_pfn_entry *p;
> > +	bool found = false;
> > +	unsigned long flags;
> > +	struct kvmppc_hmm_page_pvt *pvt;
> > +
> > +	spin_lock_irqsave(&kvmppc_hmm_lock, flags);
> > +	list = &hmm_hash[kvmppc_hmm_pfn_hash_fn(gpa)];
> > +	hlist_for_each_entry(p, list, hlist) {
> > +		if (p->addr == gpa) {
> > +			found = true;
> > +			break;
> > +		}
> > +	}
> > +	if (!found) {
> > +		p = kzalloc(sizeof(struct kvmppc_hmm_pfn_entry), GFP_ATOMIC);
> > +		if (!p) {
> > +			spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> > +			return NULL;
> > +		}
> > +		p->addr = gpa;
> > +		bit = find_first_zero_bit(kvmppc_hmm->pfn_bitmap, nr_pfns);
> > +		if (bit >= nr_pfns) {
> > +			kfree(p);
> > +			spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> > +			return NULL;
> > +		}
> > +		bitmap_set(kvmppc_hmm->pfn_bitmap, bit, 1);
> > +		p->hmm_pfn = bit + kvmppc_hmm->devmem->pfn_first;
> > +		INIT_HLIST_NODE(&p->hlist);
> > +		hlist_add_head(&p->hlist, list);
> > +	} else {
> > +		spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> > +		return NULL;
> > +	}
> 
> I suggest you redo that if statement as:
> 
> 	if (found) {
> 		spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> 		return NULL;
> 	}
> 
> and then have the main path (the code starting with p = kzalloc...) at
> a single level of indentation, so the normal/successful path is more
> obvious.
> 
> > +	dpage = pfn_to_page(p->hmm_pfn);
> > +
> > +	if (!trylock_page(dpage)) {
> 
> Where does this page get unlocked?

At the end of migrate_vma() in whose alloc_and_copy() callback this
lock_page() is done.

Regards,
Bharata.
Bharata B Rao Nov. 12, 2018, 9:59 a.m. | #6
On Thu, Nov 01, 2018 at 05:43:39PM +1100, Balbir Singh wrote:
> On Mon, Oct 22, 2018 at 10:48:34AM +0530, Bharata B Rao wrote:
> > HMM driver for KVM PPC to manage page transitions of
> > secure guest via H_SVM_PAGE_IN and H_SVM_PAGE_OUT hcalls.
> > 
> > H_SVM_PAGE_IN: Move the content of a normal page to secure page
> > H_SVM_PAGE_OUT: Move the content of a secure page to normal page
> > 
> > Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> > ---
> >  arch/powerpc/include/asm/hvcall.h    |   7 +-
> >  arch/powerpc/include/asm/kvm_host.h  |  15 +
> >  arch/powerpc/include/asm/kvm_ppc.h   |  28 ++
> >  arch/powerpc/include/asm/ucall-api.h |  20 ++
> >  arch/powerpc/kvm/Makefile            |   3 +
> >  arch/powerpc/kvm/book3s_hv.c         |  38 ++
> >  arch/powerpc/kvm/book3s_hv_hmm.c     | 514 +++++++++++++++++++++++++++
> >  7 files changed, 624 insertions(+), 1 deletion(-)
> >  create mode 100644 arch/powerpc/include/asm/ucall-api.h
> >  create mode 100644 arch/powerpc/kvm/book3s_hv_hmm.c
> > 
> > diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
> > index a0b17f9f1ea4..89e6b70c1857 100644
> > --- a/arch/powerpc/include/asm/hvcall.h
> > +++ b/arch/powerpc/include/asm/hvcall.h
> > @@ -158,6 +158,9 @@
> >  /* Each control block has to be on a 4K boundary */
> >  #define H_CB_ALIGNMENT          4096
> >  
> > +/* Flags for H_SVM_PAGE_IN */
> > +#define H_PAGE_IN_SHARED	0x1
> > +
> >  /* pSeries hypervisor opcodes */
> >  #define H_REMOVE		0x04
> >  #define H_ENTER			0x08
> > @@ -295,7 +298,9 @@
> >  #define H_INT_ESB               0x3C8
> >  #define H_INT_SYNC              0x3CC
> >  #define H_INT_RESET             0x3D0
> > -#define MAX_HCALL_OPCODE	H_INT_RESET
> > +#define H_SVM_PAGE_IN		0x3D4
> > +#define H_SVM_PAGE_OUT		0x3D8
> > +#define MAX_HCALL_OPCODE	H_SVM_PAGE_OUT
> >  
> >  /* H_VIOCTL functions */
> >  #define H_GET_VIOA_DUMP_SIZE	0x01
> > diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> > index 906bcbdfd2a1..194e6e0ff239 100644
> > --- a/arch/powerpc/include/asm/kvm_host.h
> > +++ b/arch/powerpc/include/asm/kvm_host.h
> > @@ -310,6 +310,9 @@ struct kvm_arch {
> >  	struct kvmppc_passthru_irqmap *pimap;
> >  #endif
> >  	struct kvmppc_ops *kvm_ops;
> > +#ifdef CONFIG_PPC_SVM
> > +	struct hlist_head *hmm_hash;
> > +#endif
> >  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> >  	/* This array can grow quite large, keep it at the end */
> >  	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
> > @@ -830,4 +833,16 @@ static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
> >  static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
> >  static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
> >  
> > +#ifdef CONFIG_PPC_SVM
> > +struct kvmppc_hmm_device {
> > +	struct hmm_device *device;
> > +	struct hmm_devmem *devmem;
> > +	unsigned long *pfn_bitmap;
> > +};
> > +
> > +extern int kvmppc_hmm_init(void);
> > +extern void kvmppc_hmm_free(void);
> > +extern int kvmppc_hmm_hash_create(struct kvm *kvm);
> > +extern void kvmppc_hmm_hash_destroy(struct kvm *kvm);
> > +#endif
> >  #endif /* __POWERPC_KVM_HOST_H__ */
> > diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> > index e991821dd7fa..ba81a07e2bdf 100644
> > --- a/arch/powerpc/include/asm/kvm_ppc.h
> > +++ b/arch/powerpc/include/asm/kvm_ppc.h
> > @@ -906,4 +906,32 @@ static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
> >  
> >  extern void xics_wake_cpu(int cpu);
> >  
> > +#ifdef CONFIG_PPC_SVM
> > +extern unsigned long kvmppc_h_svm_page_in(struct kvm *kvm,
> > +					  unsigned int lpid,
> > +					  unsigned long gra,
> > +					  unsigned long flags,
> > +					  unsigned long page_shift);
> > +extern unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
> > +					  unsigned int lpid,
> > +					  unsigned long gra,
> > +					  unsigned long flags,
> > +					  unsigned long page_shift);
> > +#else
> > +static inline unsigned long
> > +kvmppc_h_svm_page_in(struct kvm *kvm, unsigned int lpid,
> > +		     unsigned long gra, unsigned long flags,
> > +		     unsigned long page_shift)
> > +{
> > +	return H_UNSUPPORTED;
> > +}
> > +
> > +static inline unsigned long
> > +kvmppc_h_svm_page_out(struct kvm *kvm, unsigned int lpid,
> > +		      unsigned long gra, unsigned long flags,
> > +		      unsigned long page_shift)
> > +{
> > +	return H_UNSUPPORTED;
> > +}
> > +#endif
> >  #endif /* __POWERPC_KVM_PPC_H__ */
> > diff --git a/arch/powerpc/include/asm/ucall-api.h b/arch/powerpc/include/asm/ucall-api.h
> > new file mode 100644
> > index 000000000000..2c12f514f8ab
> > --- /dev/null
> > +++ b/arch/powerpc/include/asm/ucall-api.h
> > @@ -0,0 +1,20 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _ASM_POWERPC_UCALL_API_H
> > +#define _ASM_POWERPC_UCALL_API_H
> > +
> > +#define U_SUCCESS 0
> > +
> > +/*
> > + * TODO: Dummy uvcalls, will be replaced by real calls
> > + */
> > +static inline int uv_page_in(u64 lpid, u64 dw0, u64 dw1, u64 dw2, u64 dw3)
> > +{
> > +	return U_SUCCESS;
> > +}
> > +
> > +static inline int uv_page_out(u64 lpid, u64 dw0, u64 dw1, u64 dw2, u64 dw3)
> > +{
> > +	return U_SUCCESS;
> > +}
> > +
> > +#endif	/* _ASM_POWERPC_UCALL_API_H */
> > diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> > index f872c04bb5b1..6945ffc18679 100644
> > --- a/arch/powerpc/kvm/Makefile
> > +++ b/arch/powerpc/kvm/Makefile
> > @@ -77,6 +77,9 @@ kvm-hv-y += \
> >  	book3s_64_mmu_hv.o \
> >  	book3s_64_mmu_radix.o
> >  
> > +kvm-hv-$(CONFIG_PPC_SVM) += \
> > +	book3s_hv_hmm.o
> > +
> >  kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
> >  	book3s_hv_tm.o
> >  
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index 3e3a71594e63..05084eb8aadd 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -73,6 +73,7 @@
> >  #include <asm/opal.h>
> >  #include <asm/xics.h>
> >  #include <asm/xive.h>
> > +#include <asm/kvm_host.h>
> >  
> >  #include "book3s.h"
> >  
> > @@ -935,6 +936,20 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
> >  		if (ret == H_TOO_HARD)
> >  			return RESUME_HOST;
> >  		break;
> > +	case H_SVM_PAGE_IN:
> > +		ret = kvmppc_h_svm_page_in(vcpu->kvm,
> > +					   kvmppc_get_gpr(vcpu, 4),
> > +					   kvmppc_get_gpr(vcpu, 5),
> > +					   kvmppc_get_gpr(vcpu, 6),
> > +					   kvmppc_get_gpr(vcpu, 7));
> > +		break;
> > +	case H_SVM_PAGE_OUT:
> > +		ret = kvmppc_h_svm_page_out(vcpu->kvm,
> > +					    kvmppc_get_gpr(vcpu, 4),
> > +					    kvmppc_get_gpr(vcpu, 5),
> > +					    kvmppc_get_gpr(vcpu, 6),
> > +					    kvmppc_get_gpr(vcpu, 7));
> 
> Won't just touching the page cause page_out? I wonder if you could
> avoid code duplication by touch addr. So under mmap_sem if you
> did get_user_pages() and put_page(), it might simplify all the
> additional code you've got.

Yes touching the page from HV will case the page out, but this is to
support the hcall which UV can use to cause page out. And this in fact
shares migrate_vma() callbacks with fault handler callbacks.

> 
> > +		break;
> >  	default:
> >  		return RESUME_HOST;
> >  	}
> > @@ -961,6 +976,8 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
> >  	case H_IPOLL:
> >  	case H_XIRR_X:
> >  #endif
> > +	case H_SVM_PAGE_IN:
> > +	case H_SVM_PAGE_OUT:
> >  		return 1;
> >  	}
> >  
> > @@ -3938,6 +3955,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
> >  		return -ENOMEM;
> >  	kvm->arch.lpid = lpid;
> >  
> > +#ifdef CONFIG_PPC_SVM
> > +	ret = kvmppc_hmm_hash_create(kvm);
> > +	if (ret) {
> > +		kvmppc_free_lpid(kvm->arch.lpid);
> > +		return ret;
> > +	}
> > +#endif
> >  	kvmppc_alloc_host_rm_ops();
> >  
> >  	/*
> > @@ -4073,6 +4097,9 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
> >  
> >  	kvmppc_free_vcores(kvm);
> >  
> > +#ifdef CONFIG_PPC_SVM
> > +	kvmppc_hmm_hash_destroy(kvm);
> > +#endif
> >  	kvmppc_free_lpid(kvm->arch.lpid);
> >  
> >  	if (kvm_is_radix(kvm))
> > @@ -4384,6 +4411,8 @@ static unsigned int default_hcall_list[] = {
> >  	H_XIRR,
> >  	H_XIRR_X,
> >  #endif
> > +	H_SVM_PAGE_IN,
> > +	H_SVM_PAGE_OUT,
> >  	0
> >  };
> >  
> > @@ -4596,11 +4625,20 @@ static int kvmppc_book3s_init_hv(void)
> >  			no_mixing_hpt_and_radix = true;
> >  	}
> >  
> > +#ifdef CONFIG_PPC_SVM
> > +	r = kvmppc_hmm_init();
> > +	if (r < 0)
> > +		pr_err("KVM-HV: kvmppc_hmm_init failed %d\n", r);
> > +#endif
> > +
> >  	return r;
> >  }
> >  
> >  static void kvmppc_book3s_exit_hv(void)
> >  {
> > +#ifdef CONFIG_PPC_SVM
> > +	kvmppc_hmm_free();
> > +#endif
> >  	kvmppc_free_host_rm_ops();
> >  	if (kvmppc_radix_possible())
> >  		kvmppc_radix_exit();
> > diff --git a/arch/powerpc/kvm/book3s_hv_hmm.c b/arch/powerpc/kvm/book3s_hv_hmm.c
> > new file mode 100644
> > index 000000000000..a2ee3163a312
> > --- /dev/null
> > +++ b/arch/powerpc/kvm/book3s_hv_hmm.c
> > @@ -0,0 +1,514 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * HMM driver to manage page migration between normal and secure
> > + * memory.
> > + *
> > + * Based on Jérôme Glisse's HMM dummy driver.
> > + *
> > + * Copyright 2018 Bharata B Rao, IBM Corp. <bharata@linux.ibm.com>
> > + */
> > +
> > +/*
> > + * A pseries guest can be run as a secure guest on Ultravisor-enabled
> > + * POWER platforms. On such platforms, this driver will be used to manage
> > + * the movement of guest pages between the normal memory managed by
> > + * hypervisor (HV) and secure memory managed by Ultravisor (UV).
> > + *
> > + * Private ZONE_DEVICE memory equal to the amount of secure memory
> > + * available in the platform for running secure guests is created
> > + * via a HMM device. The movement of pages between normal and secure
> > + * memory is done by ->alloc_and_copy() callback routine of migrate_vma().
> > + *
> > + * The page-in or page-out requests from UV will come to HV as hcalls and
> > + * HV will call back into UV via uvcalls to satisfy these page requests.
> > + *
> > + * For each page that gets moved into secure memory, a HMM PFN is used
> > + * on the HV side and HMM migration PTE corresponding to that PFN would be
> > + * populated in the QEMU page tables. A per-guest hash table is created to
> > + * manage the pool of HMM PFNs. Guest real address is used as key to index
> > + * into the hash table and choose a free HMM PFN.
> > + */
> > +
> > +#include <linux/hmm.h>
> > +#include <linux/kvm_host.h>
> > +#include <linux/sched/mm.h>
> > +#include <asm/ucall-api.h>
> > +
> > +static struct kvmppc_hmm_device *kvmppc_hmm;
> > +spinlock_t kvmppc_hmm_lock;
> > +
> > +#define KVMPPC_HMM_HASH_BITS    10
> > +#define KVMPPC_HMM_HASH_SIZE   (1 << KVMPPC_HMM_HASH_BITS)
> > +
> > +struct kvmppc_hmm_pfn_entry {
> > +	struct hlist_node hlist;
> > +	unsigned long addr;
> > +	unsigned long hmm_pfn;
> > +};
> > +
> > +struct kvmppc_hmm_page_pvt {
> > +	struct hlist_head *hmm_hash;
> > +	unsigned int lpid;
> > +	unsigned long gpa;
> > +};
> > +
> > +struct kvmppc_hmm_migrate_args {
> > +	struct hlist_head *hmm_hash;
> > +	unsigned int lpid;
> > +	unsigned long gpa;
> > +	unsigned long page_shift;
> > +};
> > +
> > +int kvmppc_hmm_hash_create(struct kvm *kvm)
> > +{
> > +	int i;
> > +
> > +	kvm->arch.hmm_hash = kzalloc(KVMPPC_HMM_HASH_SIZE *
> > +				     sizeof(struct hlist_head), GFP_KERNEL);
> > +	if (!kvm->arch.hmm_hash)
> > +		return -ENOMEM;
> > +
> > +	for (i = 0; i < KVMPPC_HMM_HASH_SIZE; i++)
> > +		INIT_HLIST_HEAD(&kvm->arch.hmm_hash[i]);
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Cleanup the HMM pages hash table when guest terminates
> > + *
> > + * Iterate over all the HMM pages hash list entries and release
> > + * reference on them. The actual freeing of the entry happens
> > + * via hmm_devmem_ops.free path.
> > + */
> > +void kvmppc_hmm_hash_destroy(struct kvm *kvm)
> > +{
> > +	int i;
> > +	struct kvmppc_hmm_pfn_entry *p;
> > +	struct page *hmm_page;
> > +
> > +	for (i = 0; i < KVMPPC_HMM_HASH_SIZE; i++) {
> > +		while (!hlist_empty(&kvm->arch.hmm_hash[i])) {
> > +			p = hlist_entry(kvm->arch.hmm_hash[i].first,
> > +					struct kvmppc_hmm_pfn_entry,
> > +					hlist);
> > +			hmm_page = pfn_to_page(p->hmm_pfn);
> > +			put_page(hmm_page);
> > +		}
> > +	}
> > +	kfree(kvm->arch.hmm_hash);
> > +}
> > +
> > +static u64 kvmppc_hmm_pfn_hash_fn(u64 addr)
> > +{
> > +	return hash_64(addr, KVMPPC_HMM_HASH_BITS);
> > +}
> > +
> > +static void
> > +kvmppc_hmm_hash_free_pfn(struct hlist_head *hmm_hash, unsigned long gpa)
> > +{
> > +	struct kvmppc_hmm_pfn_entry *p;
> > +	struct hlist_head *list;
> > +
> > +	list = &hmm_hash[kvmppc_hmm_pfn_hash_fn(gpa)];
> > +	hlist_for_each_entry(p, list, hlist) {
> > +		if (p->addr == gpa) {
> > +			hlist_del(&p->hlist);
> > +			kfree(p);
> > +			return;
> > +		}
> > +	}
> > +}
> > +
> > +/*
> > + * Get a free HMM PFN from the pool
> > + *
> > + * Called when a normal page is moved to secure memory (UV_PAGE_IN). HMM
> > + * PFN will be used to keep track of the secure page on HV side.
> > + */
> > +static struct page *kvmppc_hmm_get_page(struct hlist_head *hmm_hash,
> > +					unsigned long gpa, unsigned int lpid)
> > +{
> > +	struct page *dpage = NULL;
> > +	unsigned long bit;
> > +	unsigned long nr_pfns = kvmppc_hmm->devmem->pfn_last -
> > +				kvmppc_hmm->devmem->pfn_first;
> > +	struct hlist_head *list;
> > +	struct kvmppc_hmm_pfn_entry *p;
> > +	bool found = false;
> > +	unsigned long flags;
> > +	struct kvmppc_hmm_page_pvt *pvt;
> > +
> > +	spin_lock_irqsave(&kvmppc_hmm_lock, flags);
> > +	list = &hmm_hash[kvmppc_hmm_pfn_hash_fn(gpa)];
> > +	hlist_for_each_entry(p, list, hlist) {
> > +		if (p->addr == gpa) {
> > +			found = true;
> > +			break;
> > +		}
> > +	}
> > +	if (!found) {
> > +		p = kzalloc(sizeof(struct kvmppc_hmm_pfn_entry), GFP_ATOMIC);
> > +		if (!p) {
> > +			spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> > +			return NULL;
> > +		}
> > +		p->addr = gpa;
> > +		bit = find_first_zero_bit(kvmppc_hmm->pfn_bitmap, nr_pfns);
> > +		if (bit >= nr_pfns) {
> > +			kfree(p);
> 
> Why not consider resizing the bitmap, from the code below, it looks
> like this should never happen - BUG_ON()?

Yes this should never happen. This means UV is asking for a page-in
and HV doesn't have any HMM PFNs left to use. We refuse to migrate
anything and UV can take a call based on that.

> 
> > +			spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> > +			return NULL;
> > +		}
> > +		bitmap_set(kvmppc_hmm->pfn_bitmap, bit, 1);
> > +		p->hmm_pfn = bit + kvmppc_hmm->devmem->pfn_first;
> > +		INIT_HLIST_NODE(&p->hlist);
> > +		hlist_add_head(&p->hlist, list);
> > +	} else {
> > +		spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> > +		return NULL;
> 
> For found=true, why do we return NULL?
> 
> > +	}
> > +	dpage = pfn_to_page(p->hmm_pfn);
> > +
> > +	if (!trylock_page(dpage)) {
> > +		bitmap_clear(kvmppc_hmm->pfn_bitmap,
> > +			     p->hmm_pfn - kvmppc_hmm->devmem->pfn_first, 1);
> > +		hlist_del(&p->hlist);
> > +		kfree(p);
> > +		spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> > +		return NULL;
> 
> What race are you avoiding here?

Based on Jerome's dummy driver, should check with him.

> 
> > +	}
> > +	spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> > +
> > +	pvt = kzalloc(sizeof(*pvt), GFP_ATOMIC);
> > +	pvt->hmm_hash = hmm_hash;
> > +	pvt->gpa = gpa;
> > +	pvt->lpid = lpid;
> > +	hmm_devmem_page_set_drvdata(dpage, (unsigned long)pvt);
> 
> Do we care about when this store is seen?

Can you explain what is the potential problem here ? Essentially we
use this pvt data later if the page participates in page-out or when
the page is being released.

> 
> > +
> > +	get_page(dpage);
> > +	return dpage;
> > +}
> > +
> > +/*
> > + * Release the HMM PFN back to the pool
> > + *
> > + * Called when secure page becomes a normal page during UV_PAGE_OUT.
> > + */
> > +static void kvmppc_hmm_put_page(struct page *page)
> > +{
> > +	unsigned long pfn = page_to_pfn(page);
> > +	unsigned long flags;
> > +	struct kvmppc_hmm_page_pvt *pvt;
> > +
> > +	pvt = (struct kvmppc_hmm_page_pvt *)hmm_devmem_page_get_drvdata(page);
> > +	hmm_devmem_page_set_drvdata(page, 0);
> > +
> > +	spin_lock_irqsave(&kvmppc_hmm_lock, flags);
> > +	bitmap_clear(kvmppc_hmm->pfn_bitmap,
> > +		     pfn - kvmppc_hmm->devmem->pfn_first, 1);
> > +	kvmppc_hmm_hash_free_pfn(pvt->hmm_hash, pvt->gpa);
> 
> Don't we need a put_page here?

In fact, we come here during last put_page().

> 
> > +	spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
> > +	kfree(pvt);
> > +}
> > +
> > +static void
> > +kvmppc_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma,
> > +				  const unsigned long *src_pfns,
> > +				  unsigned long *dst_pfns,
> > +				  unsigned long start,
> > +				  unsigned long end,
> > +				  void *private)
> > +{
> > +	unsigned long addr;
> > +	struct kvmppc_hmm_migrate_args *args = private;
> > +	unsigned long page_size = 1UL << args->page_shift;
> > +
> > +	for (addr = start; addr < end;
> > +		addr += page_size, src_pfns++, dst_pfns++) {
> > +		struct page *spage = migrate_pfn_to_page(*src_pfns);
> > +		struct page *dpage;
> > +		unsigned long pfn = *src_pfns >> MIGRATE_PFN_SHIFT;
> > +
> > +		*dst_pfns = 0;
> > +		if (!spage && !(*src_pfns & MIGRATE_PFN_MIGRATE))
> > +			continue;
> > +
> > +		if (spage && !(*src_pfns & MIGRATE_PFN_MIGRATE))
> > +			continue;
> > +
> 
> Isn't this the same as !(*src_pfns & MIGRATE_PFN_MIGRATE) continue?
> That probably indicates the page is backed by a file. Do such pages
> ever get migrated?

Will address.

> 
> > +		dpage = kvmppc_hmm_get_page(args->hmm_hash, args->gpa,
> > +					    args->lpid);
> > +		if (!dpage)
> > +			continue;
> 
> Aaah.. I see if the page is already in secure memory - kvmppc_hmm_get_page()
> returns NULL and no migration is done! dpage is the struct page for the
> secure memory.
> 
> > +
> > +		if (spage)
> > +			uv_page_in(args->lpid, pfn << args->page_shift,
> > +				   args->gpa, 0, args->page_shift);
> > +
> > +		*dst_pfns = migrate_pfn(page_to_pfn(dpage)) |
> > +			    MIGRATE_PFN_DEVICE | MIGRATE_PFN_LOCKED;
> > +	}
> > +}
> > +
> > +static void
> > +kvmppc_hmm_migrate_finalize_and_map(struct vm_area_struct *vma,
> > +				    const unsigned long *src_pfns,
> > +				    const unsigned long *dst_pfns,
> > +				    unsigned long start,
> > +				    unsigned long end,
> > +				    void *private)
> > +{
> > +}
> > +
> > +static const struct migrate_vma_ops kvmppc_hmm_migrate_ops = {
> > +	.alloc_and_copy = kvmppc_hmm_migrate_alloc_and_copy,
> > +	.finalize_and_map = kvmppc_hmm_migrate_finalize_and_map,
> > +};
> > +
> > +static unsigned long kvmppc_gpa_to_hva(struct kvm *kvm, unsigned long gpa,
> > +				       unsigned long page_shift)
> > +{
> > +	unsigned long gfn, hva;
> > +	struct kvm_memory_slot *memslot;
> > +
> > +	gfn = gpa >> page_shift;
> > +	memslot = gfn_to_memslot(kvm, gfn);
> > +	hva = gfn_to_hva_memslot(memslot, gfn);
> > +
> > +	return hva;
> > +}
> 
> This doesn't sound like the rightplace to have this function

Paul also pointed to the same, next version will have this replaced.

> 
> > +
> > +/*
> > + * Move page from normal memory to secure memory.
> > + */
> > +unsigned long
> > +kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
> > +		     unsigned long flags, unsigned long page_shift)
> > +{
> > +	unsigned long addr, end;
> > +	unsigned long src_pfn, dst_pfn;
> > +	struct kvmppc_hmm_migrate_args args;
> > +	struct mm_struct *mm = get_task_mm(current);
> > +	struct vm_area_struct *vma;
> > +	int ret = H_SUCCESS;
> > +
> > +	if (page_shift != PAGE_SHIFT)
> > +		return H_P3;
> 
> So no large page support?

No yet.

> 
> > +
> > +	addr = kvmppc_gpa_to_hva(kvm, gpa, page_shift);
> > +	if (!addr)
> > +		return H_PARAMETER;
> 
> So addr == 0 is invalid?

Switched to gfn_to_hva() and kvm_is_error_hva() which should cover all
the valid cases.

> 
> > +	end = addr + (1UL << page_shift);
> > +
> > +	if (flags)
> > +		return H_P2;
> > +
> > +	args.hmm_hash = kvm->arch.hmm_hash;
> > +	args.lpid = kvm->arch.lpid;
> > +	args.gpa = gpa;
> > +	args.page_shift = page_shift;
> > +
> > +	down_read(&mm->mmap_sem);
> > +	vma = find_vma_intersection(mm, addr, end);
> > +	if (!vma || vma->vm_start > addr || vma->vm_end < end) {
> > +		ret = H_PARAMETER;
> > +		goto out;
> > +	}
> > +	ret = migrate_vma(&kvmppc_hmm_migrate_ops, vma, addr, end,
> > +			  &src_pfn, &dst_pfn, &args);
> > +	if (ret < 0)
> > +		ret = H_PARAMETER;
> > +out:
> > +	up_read(&mm->mmap_sem);
> > +	return ret;
> > +}
> > +
> > +static void
> > +kvmppc_hmm_fault_migrate_alloc_and_copy(struct vm_area_struct *vma,
> > +					const unsigned long *src_pfn,
> > +					unsigned long *dst_pfn,
> > +					unsigned long start,
> > +					unsigned long end,
> > +					void *private)
> > +{
> > +	struct page *dpage, *spage;
> > +	struct kvmppc_hmm_page_pvt *pvt;
> > +	unsigned long pfn;
> > +	int ret = U_SUCCESS;
> > +
> > +	*dst_pfn = MIGRATE_PFN_ERROR;
> > +	spage = migrate_pfn_to_page(*src_pfn);
> > +	if (!spage || !(*src_pfn & MIGRATE_PFN_MIGRATE))
> > +		return;
> > +	if (!is_zone_device_page(spage))
> > +		return;
> > +	dpage = hmm_vma_alloc_locked_page(vma, start);
> > +	if (!dpage)
> 
> We've probably already OOM'd here :)
> 
> > +		return;
> > +	pvt = (struct kvmppc_hmm_page_pvt *)
> > +	       hmm_devmem_page_get_drvdata(spage);
> > +
> > +	pfn = page_to_pfn(dpage);
> > +	ret = uv_page_out(pvt->lpid, pfn << PAGE_SHIFT,
> 
> Do we need pfn << PAGE_SHIFT?

Yes, uv_page_out() needs physical address as argument.

> 
> > +			  pvt->gpa, 0, PAGE_SHIFT);
> > +	if (ret == U_SUCCESS)
> > +		*dst_pfn = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
> > +}
> > +
> > +static void
> > +kvmppc_hmm_fault_migrate_finalize_and_map(struct vm_area_struct *vma,
> > +					  const unsigned long *src_pfns,
> > +					  const unsigned long *dst_pfns,
> > +					  unsigned long start,
> > +					  unsigned long end,
> > +					  void *private)
> > +{
> > +}
> > +
> > +static const struct migrate_vma_ops kvmppc_hmm_fault_migrate_ops = {
> > +	.alloc_and_copy = kvmppc_hmm_fault_migrate_alloc_and_copy,
> > +	.finalize_and_map = kvmppc_hmm_fault_migrate_finalize_and_map,
> > +};
> > +
> > +/*
> > + * Fault handler callback when HV touches any page that has been
> > + * moved to secure memory, we ask UV to give back the page by
> > + * issuing a UV_PAGE_OUT uvcall.
> > + */
> > +static int kvmppc_hmm_devmem_fault(struct hmm_devmem *devmem,
> > +				   struct vm_area_struct *vma,
> > +				   unsigned long addr,
> > +				   const struct page *page,
> > +				   unsigned int flags,
> > +				   pmd_t *pmdp)
> > +{
> > +	unsigned long end = addr + PAGE_SIZE;
> > +	unsigned long src_pfn, dst_pfn = 0;
> > +
> > +	if (migrate_vma(&kvmppc_hmm_fault_migrate_ops, vma, addr, end,
> > +			&src_pfn, &dst_pfn, NULL))
> > +		return VM_FAULT_SIGBUS;
> > +	if (dst_pfn == MIGRATE_PFN_ERROR)
> > +		return VM_FAULT_SIGBUS;
> > +	return 0;
> > +}
> > +
> > +static void kvmppc_hmm_devmem_free(struct hmm_devmem *devmem,
> > +				   struct page *page)
> > +{
> > +	kvmppc_hmm_put_page(page);
> > +}
> > +
> > +static const struct hmm_devmem_ops kvmppc_hmm_devmem_ops = {
> > +	.free = kvmppc_hmm_devmem_free,
> > +	.fault = kvmppc_hmm_devmem_fault,
> > +};
> > +
> > +/*
> > + * Move page from secure memory to normal memory.
> > + */
> > +unsigned long
> > +kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
> > +		      unsigned long flags, unsigned long page_shift)
> > +{
> > +	unsigned long addr, end;
> > +	struct mm_struct *mm = get_task_mm(current);
> > +	struct vm_area_struct *vma;
> > +	unsigned long src_pfn, dst_pfn = 0;
> > +	int ret = H_SUCCESS;
> > +
> > +	if (page_shift != PAGE_SHIFT)
> > +		return H_P4;
> > +
> > +	addr = kvmppc_gpa_to_hva(kvm, gpa, page_shift);
> > +	if (!addr)
> > +		return H_P2;
> > +	end = addr + (1UL << page_shift);
> > +
> > +	down_read(&mm->mmap_sem);
> > +	vma = find_vma_intersection(mm, addr, end);
> > +	if (!vma || vma->vm_start > addr || vma->vm_end < end) {
> > +		ret = H_PARAMETER;
> > +		goto out;
> > +	}
> > +	ret = migrate_vma(&kvmppc_hmm_fault_migrate_ops, vma, addr, end,
> > +			  &src_pfn, &dst_pfn, NULL);
> > +	if (ret < 0)
> > +		ret = H_PARAMETER;
> > +out:
> > +	up_read(&mm->mmap_sem);
> > +	return ret;
> > +}
> > +
> > +/*
> > + * TODO: Number of secure pages and the page size order would probably come
> > + * via DT or via some uvcall. Return 8G for now.
> > + */
> > +static unsigned long kvmppc_get_secmem_size(void)
> > +{
> > +	return (1UL << 33);
> > +}
> > +
> > +static int kvmppc_hmm_pages_init(void)
> > +{
> > +	unsigned long nr_pfns = kvmppc_hmm->devmem->pfn_last -
> > +				kvmppc_hmm->devmem->pfn_first;
> > +
> > +	kvmppc_hmm->pfn_bitmap = kcalloc(BITS_TO_LONGS(nr_pfns),
> > +					 sizeof(unsigned long), GFP_KERNEL);
> > +	if (!kvmppc_hmm->pfn_bitmap)
> > +		return -ENOMEM;
> > +
> > +	spin_lock_init(&kvmppc_hmm_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +int kvmppc_hmm_init(void)
> > +{
> > +	int ret = 0;
> > +	unsigned long size = kvmppc_get_secmem_size();
> 
> Can you split secmem to secure_mem?
> 
> > +
> > +	kvmppc_hmm = kzalloc(sizeof(*kvmppc_hmm), GFP_KERNEL);
> > +	if (!kvmppc_hmm) {
> > +		ret = -ENOMEM;
> > +		goto out;
> > +	}
> > +
> > +	kvmppc_hmm->device = hmm_device_new(NULL);
> > +	if (IS_ERR(kvmppc_hmm->device)) {
> > +		ret = PTR_ERR(kvmppc_hmm->device);
> > +		goto out_free;
> > +	}
> > +
> > +	kvmppc_hmm->devmem = hmm_devmem_add(&kvmppc_hmm_devmem_ops,
> > +					    &kvmppc_hmm->device->device, size);
> 
> IIUC, there is just one HMM device for all the secure memory in the
> system?

Yes.

Thanks for your review.

Regards,
Bharata.

Patch

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index a0b17f9f1ea4..89e6b70c1857 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -158,6 +158,9 @@ 
 /* Each control block has to be on a 4K boundary */
 #define H_CB_ALIGNMENT          4096
 
+/* Flags for H_SVM_PAGE_IN */
+#define H_PAGE_IN_SHARED	0x1
+
 /* pSeries hypervisor opcodes */
 #define H_REMOVE		0x04
 #define H_ENTER			0x08
@@ -295,7 +298,9 @@ 
 #define H_INT_ESB               0x3C8
 #define H_INT_SYNC              0x3CC
 #define H_INT_RESET             0x3D0
-#define MAX_HCALL_OPCODE	H_INT_RESET
+#define H_SVM_PAGE_IN		0x3D4
+#define H_SVM_PAGE_OUT		0x3D8
+#define MAX_HCALL_OPCODE	H_SVM_PAGE_OUT
 
 /* H_VIOCTL functions */
 #define H_GET_VIOA_DUMP_SIZE	0x01
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 906bcbdfd2a1..194e6e0ff239 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -310,6 +310,9 @@  struct kvm_arch {
 	struct kvmppc_passthru_irqmap *pimap;
 #endif
 	struct kvmppc_ops *kvm_ops;
+#ifdef CONFIG_PPC_SVM
+	struct hlist_head *hmm_hash;
+#endif
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	/* This array can grow quite large, keep it at the end */
 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
@@ -830,4 +833,16 @@  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
+#ifdef CONFIG_PPC_SVM
+struct kvmppc_hmm_device {
+	struct hmm_device *device;
+	struct hmm_devmem *devmem;
+	unsigned long *pfn_bitmap;
+};
+
+extern int kvmppc_hmm_init(void);
+extern void kvmppc_hmm_free(void);
+extern int kvmppc_hmm_hash_create(struct kvm *kvm);
+extern void kvmppc_hmm_hash_destroy(struct kvm *kvm);
+#endif
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e991821dd7fa..ba81a07e2bdf 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -906,4 +906,32 @@  static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
 
 extern void xics_wake_cpu(int cpu);
 
+#ifdef CONFIG_PPC_SVM
+extern unsigned long kvmppc_h_svm_page_in(struct kvm *kvm,
+					  unsigned int lpid,
+					  unsigned long gra,
+					  unsigned long flags,
+					  unsigned long page_shift);
+extern unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
+					  unsigned int lpid,
+					  unsigned long gra,
+					  unsigned long flags,
+					  unsigned long page_shift);
+#else
+static inline unsigned long
+kvmppc_h_svm_page_in(struct kvm *kvm, unsigned int lpid,
+		     unsigned long gra, unsigned long flags,
+		     unsigned long page_shift)
+{
+	return H_UNSUPPORTED;
+}
+
+static inline unsigned long
+kvmppc_h_svm_page_out(struct kvm *kvm, unsigned int lpid,
+		      unsigned long gra, unsigned long flags,
+		      unsigned long page_shift)
+{
+	return H_UNSUPPORTED;
+}
+#endif
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/ucall-api.h b/arch/powerpc/include/asm/ucall-api.h
new file mode 100644
index 000000000000..2c12f514f8ab
--- /dev/null
+++ b/arch/powerpc/include/asm/ucall-api.h
@@ -0,0 +1,20 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_UCALL_API_H
+#define _ASM_POWERPC_UCALL_API_H
+
+#define U_SUCCESS 0
+
+/*
+ * TODO: Dummy uvcalls, will be replaced by real calls
+ */
+static inline int uv_page_in(u64 lpid, u64 dw0, u64 dw1, u64 dw2, u64 dw3)
+{
+	return U_SUCCESS;
+}
+
+static inline int uv_page_out(u64 lpid, u64 dw0, u64 dw1, u64 dw2, u64 dw3)
+{
+	return U_SUCCESS;
+}
+
+#endif	/* _ASM_POWERPC_UCALL_API_H */
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index f872c04bb5b1..6945ffc18679 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -77,6 +77,9 @@  kvm-hv-y += \
 	book3s_64_mmu_hv.o \
 	book3s_64_mmu_radix.o
 
+kvm-hv-$(CONFIG_PPC_SVM) += \
+	book3s_hv_hmm.o
+
 kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
 	book3s_hv_tm.o
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 3e3a71594e63..05084eb8aadd 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -73,6 +73,7 @@ 
 #include <asm/opal.h>
 #include <asm/xics.h>
 #include <asm/xive.h>
+#include <asm/kvm_host.h>
 
 #include "book3s.h"
 
@@ -935,6 +936,20 @@  int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 		if (ret == H_TOO_HARD)
 			return RESUME_HOST;
 		break;
+	case H_SVM_PAGE_IN:
+		ret = kvmppc_h_svm_page_in(vcpu->kvm,
+					   kvmppc_get_gpr(vcpu, 4),
+					   kvmppc_get_gpr(vcpu, 5),
+					   kvmppc_get_gpr(vcpu, 6),
+					   kvmppc_get_gpr(vcpu, 7));
+		break;
+	case H_SVM_PAGE_OUT:
+		ret = kvmppc_h_svm_page_out(vcpu->kvm,
+					    kvmppc_get_gpr(vcpu, 4),
+					    kvmppc_get_gpr(vcpu, 5),
+					    kvmppc_get_gpr(vcpu, 6),
+					    kvmppc_get_gpr(vcpu, 7));
+		break;
 	default:
 		return RESUME_HOST;
 	}
@@ -961,6 +976,8 @@  static int kvmppc_hcall_impl_hv(unsigned long cmd)
 	case H_IPOLL:
 	case H_XIRR_X:
 #endif
+	case H_SVM_PAGE_IN:
+	case H_SVM_PAGE_OUT:
 		return 1;
 	}
 
@@ -3938,6 +3955,13 @@  static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 		return -ENOMEM;
 	kvm->arch.lpid = lpid;
 
+#ifdef CONFIG_PPC_SVM
+	ret = kvmppc_hmm_hash_create(kvm);
+	if (ret) {
+		kvmppc_free_lpid(kvm->arch.lpid);
+		return ret;
+	}
+#endif
 	kvmppc_alloc_host_rm_ops();
 
 	/*
@@ -4073,6 +4097,9 @@  static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 
 	kvmppc_free_vcores(kvm);
 
+#ifdef CONFIG_PPC_SVM
+	kvmppc_hmm_hash_destroy(kvm);
+#endif
 	kvmppc_free_lpid(kvm->arch.lpid);
 
 	if (kvm_is_radix(kvm))
@@ -4384,6 +4411,8 @@  static unsigned int default_hcall_list[] = {
 	H_XIRR,
 	H_XIRR_X,
 #endif
+	H_SVM_PAGE_IN,
+	H_SVM_PAGE_OUT,
 	0
 };
 
@@ -4596,11 +4625,20 @@  static int kvmppc_book3s_init_hv(void)
 			no_mixing_hpt_and_radix = true;
 	}
 
+#ifdef CONFIG_PPC_SVM
+	r = kvmppc_hmm_init();
+	if (r < 0)
+		pr_err("KVM-HV: kvmppc_hmm_init failed %d\n", r);
+#endif
+
 	return r;
 }
 
 static void kvmppc_book3s_exit_hv(void)
 {
+#ifdef CONFIG_PPC_SVM
+	kvmppc_hmm_free();
+#endif
 	kvmppc_free_host_rm_ops();
 	if (kvmppc_radix_possible())
 		kvmppc_radix_exit();
diff --git a/arch/powerpc/kvm/book3s_hv_hmm.c b/arch/powerpc/kvm/book3s_hv_hmm.c
new file mode 100644
index 000000000000..a2ee3163a312
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_hmm.c
@@ -0,0 +1,514 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HMM driver to manage page migration between normal and secure
+ * memory.
+ *
+ * Based on Jérôme Glisse's HMM dummy driver.
+ *
+ * Copyright 2018 Bharata B Rao, IBM Corp. <bharata@linux.ibm.com>
+ */
+
+/*
+ * A pseries guest can be run as a secure guest on Ultravisor-enabled
+ * POWER platforms. On such platforms, this driver will be used to manage
+ * the movement of guest pages between the normal memory managed by
+ * hypervisor (HV) and secure memory managed by Ultravisor (UV).
+ *
+ * Private ZONE_DEVICE memory equal to the amount of secure memory
+ * available in the platform for running secure guests is created
+ * via a HMM device. The movement of pages between normal and secure
+ * memory is done by ->alloc_and_copy() callback routine of migrate_vma().
+ *
+ * The page-in or page-out requests from UV will come to HV as hcalls and
+ * HV will call back into UV via uvcalls to satisfy these page requests.
+ *
+ * For each page that gets moved into secure memory, a HMM PFN is used
+ * on the HV side and HMM migration PTE corresponding to that PFN would be
+ * populated in the QEMU page tables. A per-guest hash table is created to
+ * manage the pool of HMM PFNs. Guest real address is used as key to index
+ * into the hash table and choose a free HMM PFN.
+ */
+
+#include <linux/hmm.h>
+#include <linux/kvm_host.h>
+#include <linux/sched/mm.h>
+#include <asm/ucall-api.h>
+
+static struct kvmppc_hmm_device *kvmppc_hmm;
+spinlock_t kvmppc_hmm_lock;
+
+#define KVMPPC_HMM_HASH_BITS    10
+#define KVMPPC_HMM_HASH_SIZE   (1 << KVMPPC_HMM_HASH_BITS)
+
+struct kvmppc_hmm_pfn_entry {
+	struct hlist_node hlist;
+	unsigned long addr;
+	unsigned long hmm_pfn;
+};
+
+struct kvmppc_hmm_page_pvt {
+	struct hlist_head *hmm_hash;
+	unsigned int lpid;
+	unsigned long gpa;
+};
+
+struct kvmppc_hmm_migrate_args {
+	struct hlist_head *hmm_hash;
+	unsigned int lpid;
+	unsigned long gpa;
+	unsigned long page_shift;
+};
+
+int kvmppc_hmm_hash_create(struct kvm *kvm)
+{
+	int i;
+
+	kvm->arch.hmm_hash = kzalloc(KVMPPC_HMM_HASH_SIZE *
+				     sizeof(struct hlist_head), GFP_KERNEL);
+	if (!kvm->arch.hmm_hash)
+		return -ENOMEM;
+
+	for (i = 0; i < KVMPPC_HMM_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&kvm->arch.hmm_hash[i]);
+	return 0;
+}
+
+/*
+ * Cleanup the HMM pages hash table when guest terminates
+ *
+ * Iterate over all the HMM pages hash list entries and release
+ * reference on them. The actual freeing of the entry happens
+ * via hmm_devmem_ops.free path.
+ */
+void kvmppc_hmm_hash_destroy(struct kvm *kvm)
+{
+	int i;
+	struct kvmppc_hmm_pfn_entry *p;
+	struct page *hmm_page;
+
+	for (i = 0; i < KVMPPC_HMM_HASH_SIZE; i++) {
+		while (!hlist_empty(&kvm->arch.hmm_hash[i])) {
+			p = hlist_entry(kvm->arch.hmm_hash[i].first,
+					struct kvmppc_hmm_pfn_entry,
+					hlist);
+			hmm_page = pfn_to_page(p->hmm_pfn);
+			put_page(hmm_page);
+		}
+	}
+	kfree(kvm->arch.hmm_hash);
+}
+
+static u64 kvmppc_hmm_pfn_hash_fn(u64 addr)
+{
+	return hash_64(addr, KVMPPC_HMM_HASH_BITS);
+}
+
+static void
+kvmppc_hmm_hash_free_pfn(struct hlist_head *hmm_hash, unsigned long gpa)
+{
+	struct kvmppc_hmm_pfn_entry *p;
+	struct hlist_head *list;
+
+	list = &hmm_hash[kvmppc_hmm_pfn_hash_fn(gpa)];
+	hlist_for_each_entry(p, list, hlist) {
+		if (p->addr == gpa) {
+			hlist_del(&p->hlist);
+			kfree(p);
+			return;
+		}
+	}
+}
+
+/*
+ * Get a free HMM PFN from the pool
+ *
+ * Called when a normal page is moved to secure memory (UV_PAGE_IN). HMM
+ * PFN will be used to keep track of the secure page on HV side.
+ */
+static struct page *kvmppc_hmm_get_page(struct hlist_head *hmm_hash,
+					unsigned long gpa, unsigned int lpid)
+{
+	struct page *dpage = NULL;
+	unsigned long bit;
+	unsigned long nr_pfns = kvmppc_hmm->devmem->pfn_last -
+				kvmppc_hmm->devmem->pfn_first;
+	struct hlist_head *list;
+	struct kvmppc_hmm_pfn_entry *p;
+	bool found = false;
+	unsigned long flags;
+	struct kvmppc_hmm_page_pvt *pvt;
+
+	spin_lock_irqsave(&kvmppc_hmm_lock, flags);
+	list = &hmm_hash[kvmppc_hmm_pfn_hash_fn(gpa)];
+	hlist_for_each_entry(p, list, hlist) {
+		if (p->addr == gpa) {
+			found = true;
+			break;
+		}
+	}
+	if (!found) {
+		p = kzalloc(sizeof(struct kvmppc_hmm_pfn_entry), GFP_ATOMIC);
+		if (!p) {
+			spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
+			return NULL;
+		}
+		p->addr = gpa;
+		bit = find_first_zero_bit(kvmppc_hmm->pfn_bitmap, nr_pfns);
+		if (bit >= nr_pfns) {
+			kfree(p);
+			spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
+			return NULL;
+		}
+		bitmap_set(kvmppc_hmm->pfn_bitmap, bit, 1);
+		p->hmm_pfn = bit + kvmppc_hmm->devmem->pfn_first;
+		INIT_HLIST_NODE(&p->hlist);
+		hlist_add_head(&p->hlist, list);
+	} else {
+		spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
+		return NULL;
+	}
+	dpage = pfn_to_page(p->hmm_pfn);
+
+	if (!trylock_page(dpage)) {
+		bitmap_clear(kvmppc_hmm->pfn_bitmap,
+			     p->hmm_pfn - kvmppc_hmm->devmem->pfn_first, 1);
+		hlist_del(&p->hlist);
+		kfree(p);
+		spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
+		return NULL;
+	}
+	spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
+
+	pvt = kzalloc(sizeof(*pvt), GFP_ATOMIC);
+	pvt->hmm_hash = hmm_hash;
+	pvt->gpa = gpa;
+	pvt->lpid = lpid;
+	hmm_devmem_page_set_drvdata(dpage, (unsigned long)pvt);
+
+	get_page(dpage);
+	return dpage;
+}
+
+/*
+ * Release the HMM PFN back to the pool
+ *
+ * Called when secure page becomes a normal page during UV_PAGE_OUT.
+ */
+static void kvmppc_hmm_put_page(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long flags;
+	struct kvmppc_hmm_page_pvt *pvt;
+
+	pvt = (struct kvmppc_hmm_page_pvt *)hmm_devmem_page_get_drvdata(page);
+	hmm_devmem_page_set_drvdata(page, 0);
+
+	spin_lock_irqsave(&kvmppc_hmm_lock, flags);
+	bitmap_clear(kvmppc_hmm->pfn_bitmap,
+		     pfn - kvmppc_hmm->devmem->pfn_first, 1);
+	kvmppc_hmm_hash_free_pfn(pvt->hmm_hash, pvt->gpa);
+	spin_unlock_irqrestore(&kvmppc_hmm_lock, flags);
+	kfree(pvt);
+}
+
+static void
+kvmppc_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma,
+				  const unsigned long *src_pfns,
+				  unsigned long *dst_pfns,
+				  unsigned long start,
+				  unsigned long end,
+				  void *private)
+{
+	unsigned long addr;
+	struct kvmppc_hmm_migrate_args *args = private;
+	unsigned long page_size = 1UL << args->page_shift;
+
+	for (addr = start; addr < end;
+		addr += page_size, src_pfns++, dst_pfns++) {
+		struct page *spage = migrate_pfn_to_page(*src_pfns);
+		struct page *dpage;
+		unsigned long pfn = *src_pfns >> MIGRATE_PFN_SHIFT;
+
+		*dst_pfns = 0;
+		if (!spage && !(*src_pfns & MIGRATE_PFN_MIGRATE))
+			continue;
+
+		if (spage && !(*src_pfns & MIGRATE_PFN_MIGRATE))
+			continue;
+
+		dpage = kvmppc_hmm_get_page(args->hmm_hash, args->gpa,
+					    args->lpid);
+		if (!dpage)
+			continue;
+
+		if (spage)
+			uv_page_in(args->lpid, pfn << args->page_shift,
+				   args->gpa, 0, args->page_shift);
+
+		*dst_pfns = migrate_pfn(page_to_pfn(dpage)) |
+			    MIGRATE_PFN_DEVICE | MIGRATE_PFN_LOCKED;
+	}
+}
+
+static void
+kvmppc_hmm_migrate_finalize_and_map(struct vm_area_struct *vma,
+				    const unsigned long *src_pfns,
+				    const unsigned long *dst_pfns,
+				    unsigned long start,
+				    unsigned long end,
+				    void *private)
+{
+}
+
+static const struct migrate_vma_ops kvmppc_hmm_migrate_ops = {
+	.alloc_and_copy = kvmppc_hmm_migrate_alloc_and_copy,
+	.finalize_and_map = kvmppc_hmm_migrate_finalize_and_map,
+};
+
+static unsigned long kvmppc_gpa_to_hva(struct kvm *kvm, unsigned long gpa,
+				       unsigned long page_shift)
+{
+	unsigned long gfn, hva;
+	struct kvm_memory_slot *memslot;
+
+	gfn = gpa >> page_shift;
+	memslot = gfn_to_memslot(kvm, gfn);
+	hva = gfn_to_hva_memslot(memslot, gfn);
+
+	return hva;
+}
+
+/*
+ * Move page from normal memory to secure memory.
+ */
+unsigned long
+kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
+		     unsigned long flags, unsigned long page_shift)
+{
+	unsigned long addr, end;
+	unsigned long src_pfn, dst_pfn;
+	struct kvmppc_hmm_migrate_args args;
+	struct mm_struct *mm = get_task_mm(current);
+	struct vm_area_struct *vma;
+	int ret = H_SUCCESS;
+
+	if (page_shift != PAGE_SHIFT)
+		return H_P3;
+
+	addr = kvmppc_gpa_to_hva(kvm, gpa, page_shift);
+	if (!addr)
+		return H_PARAMETER;
+	end = addr + (1UL << page_shift);
+
+	if (flags)
+		return H_P2;
+
+	args.hmm_hash = kvm->arch.hmm_hash;
+	args.lpid = kvm->arch.lpid;
+	args.gpa = gpa;
+	args.page_shift = page_shift;
+
+	down_read(&mm->mmap_sem);
+	vma = find_vma_intersection(mm, addr, end);
+	if (!vma || vma->vm_start > addr || vma->vm_end < end) {
+		ret = H_PARAMETER;
+		goto out;
+	}
+	ret = migrate_vma(&kvmppc_hmm_migrate_ops, vma, addr, end,
+			  &src_pfn, &dst_pfn, &args);
+	if (ret < 0)
+		ret = H_PARAMETER;
+out:
+	up_read(&mm->mmap_sem);
+	return ret;
+}
+
+static void
+kvmppc_hmm_fault_migrate_alloc_and_copy(struct vm_area_struct *vma,
+					const unsigned long *src_pfn,
+					unsigned long *dst_pfn,
+					unsigned long start,
+					unsigned long end,
+					void *private)
+{
+	struct page *dpage, *spage;
+	struct kvmppc_hmm_page_pvt *pvt;
+	unsigned long pfn;
+	int ret = U_SUCCESS;
+
+	*dst_pfn = MIGRATE_PFN_ERROR;
+	spage = migrate_pfn_to_page(*src_pfn);
+	if (!spage || !(*src_pfn & MIGRATE_PFN_MIGRATE))
+		return;
+	if (!is_zone_device_page(spage))
+		return;
+	dpage = hmm_vma_alloc_locked_page(vma, start);
+	if (!dpage)
+		return;
+	pvt = (struct kvmppc_hmm_page_pvt *)
+	       hmm_devmem_page_get_drvdata(spage);
+
+	pfn = page_to_pfn(dpage);
+	ret = uv_page_out(pvt->lpid, pfn << PAGE_SHIFT,
+			  pvt->gpa, 0, PAGE_SHIFT);
+	if (ret == U_SUCCESS)
+		*dst_pfn = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
+}
+
+static void
+kvmppc_hmm_fault_migrate_finalize_and_map(struct vm_area_struct *vma,
+					  const unsigned long *src_pfns,
+					  const unsigned long *dst_pfns,
+					  unsigned long start,
+					  unsigned long end,
+					  void *private)
+{
+}
+
+static const struct migrate_vma_ops kvmppc_hmm_fault_migrate_ops = {
+	.alloc_and_copy = kvmppc_hmm_fault_migrate_alloc_and_copy,
+	.finalize_and_map = kvmppc_hmm_fault_migrate_finalize_and_map,
+};
+
+/*
+ * Fault handler callback when HV touches any page that has been
+ * moved to secure memory, we ask UV to give back the page by
+ * issuing a UV_PAGE_OUT uvcall.
+ */
+static int kvmppc_hmm_devmem_fault(struct hmm_devmem *devmem,
+				   struct vm_area_struct *vma,
+				   unsigned long addr,
+				   const struct page *page,
+				   unsigned int flags,
+				   pmd_t *pmdp)
+{
+	unsigned long end = addr + PAGE_SIZE;
+	unsigned long src_pfn, dst_pfn = 0;
+
+	if (migrate_vma(&kvmppc_hmm_fault_migrate_ops, vma, addr, end,
+			&src_pfn, &dst_pfn, NULL))
+		return VM_FAULT_SIGBUS;
+	if (dst_pfn == MIGRATE_PFN_ERROR)
+		return VM_FAULT_SIGBUS;
+	return 0;
+}
+
+static void kvmppc_hmm_devmem_free(struct hmm_devmem *devmem,
+				   struct page *page)
+{
+	kvmppc_hmm_put_page(page);
+}
+
+static const struct hmm_devmem_ops kvmppc_hmm_devmem_ops = {
+	.free = kvmppc_hmm_devmem_free,
+	.fault = kvmppc_hmm_devmem_fault,
+};
+
+/*
+ * Move page from secure memory to normal memory.
+ */
+unsigned long
+kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
+		      unsigned long flags, unsigned long page_shift)
+{
+	unsigned long addr, end;
+	struct mm_struct *mm = get_task_mm(current);
+	struct vm_area_struct *vma;
+	unsigned long src_pfn, dst_pfn = 0;
+	int ret = H_SUCCESS;
+
+	if (page_shift != PAGE_SHIFT)
+		return H_P4;
+
+	addr = kvmppc_gpa_to_hva(kvm, gpa, page_shift);
+	if (!addr)
+		return H_P2;
+	end = addr + (1UL << page_shift);
+
+	down_read(&mm->mmap_sem);
+	vma = find_vma_intersection(mm, addr, end);
+	if (!vma || vma->vm_start > addr || vma->vm_end < end) {
+		ret = H_PARAMETER;
+		goto out;
+	}
+	ret = migrate_vma(&kvmppc_hmm_fault_migrate_ops, vma, addr, end,
+			  &src_pfn, &dst_pfn, NULL);
+	if (ret < 0)
+		ret = H_PARAMETER;
+out:
+	up_read(&mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * TODO: Number of secure pages and the page size order would probably come
+ * via DT or via some uvcall. Return 8G for now.
+ */
+static unsigned long kvmppc_get_secmem_size(void)
+{
+	return (1UL << 33);
+}
+
+static int kvmppc_hmm_pages_init(void)
+{
+	unsigned long nr_pfns = kvmppc_hmm->devmem->pfn_last -
+				kvmppc_hmm->devmem->pfn_first;
+
+	kvmppc_hmm->pfn_bitmap = kcalloc(BITS_TO_LONGS(nr_pfns),
+					 sizeof(unsigned long), GFP_KERNEL);
+	if (!kvmppc_hmm->pfn_bitmap)
+		return -ENOMEM;
+
+	spin_lock_init(&kvmppc_hmm_lock);
+
+	return 0;
+}
+
+int kvmppc_hmm_init(void)
+{
+	int ret = 0;
+	unsigned long size = kvmppc_get_secmem_size();
+
+	kvmppc_hmm = kzalloc(sizeof(*kvmppc_hmm), GFP_KERNEL);
+	if (!kvmppc_hmm) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	kvmppc_hmm->device = hmm_device_new(NULL);
+	if (IS_ERR(kvmppc_hmm->device)) {
+		ret = PTR_ERR(kvmppc_hmm->device);
+		goto out_free;
+	}
+
+	kvmppc_hmm->devmem = hmm_devmem_add(&kvmppc_hmm_devmem_ops,
+					    &kvmppc_hmm->device->device, size);
+	if (IS_ERR(kvmppc_hmm->devmem)) {
+		ret = PTR_ERR(kvmppc_hmm->devmem);
+		goto out_device;
+	}
+	ret = kvmppc_hmm_pages_init();
+	if (ret < 0)
+		goto out_devmem;
+
+	return ret;
+
+out_devmem:
+	hmm_devmem_remove(kvmppc_hmm->devmem);
+out_device:
+	hmm_device_put(kvmppc_hmm->device);
+out_free:
+	kfree(kvmppc_hmm);
+	kvmppc_hmm = NULL;
+out:
+	return ret;
+}
+
+void kvmppc_hmm_free(void)
+{
+	kfree(kvmppc_hmm->pfn_bitmap);
+	hmm_devmem_remove(kvmppc_hmm->devmem);
+	hmm_device_put(kvmppc_hmm->device);
+	kfree(kvmppc_hmm);
+	kvmppc_hmm = NULL;
+}