diff mbox series

[v7,1/7] kvmppc: Driver to manage pages of secure guest

Message ID 20190822102620.21897-2-bharata@linux.ibm.com (mailing list archive)
State Superseded
Headers show
Series KVMPPC driver to manage secure guest pages | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch warning Failed to apply on branch next (0e4523c0b4f64eaf7abe59e143e6bdf8f972acff)
snowpatch_ozlabs/apply_patch fail Failed to apply to any branch

Commit Message

Bharata B Rao Aug. 22, 2019, 10:26 a.m. UTC
KVMPPC driver to manage page transitions of secure guest
via H_SVM_PAGE_IN and H_SVM_PAGE_OUT hcalls.

H_SVM_PAGE_IN: Move the content of a normal page to secure page
H_SVM_PAGE_OUT: Move the content of a secure page to normal page

Private ZONE_DEVICE memory equal to the amount of secure memory
available in the platform for running secure guests is created.
Whenever a page belonging to the guest becomes secure, a page from
this private device memory is used to represent and track that secure
page on the HV side. The movement of pages between normal and secure
memory is done via migrate_vma_pages() using UV_PAGE_IN and
UV_PAGE_OUT ucalls.

Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
---
 arch/powerpc/include/asm/hvcall.h          |   4 +
 arch/powerpc/include/asm/kvm_book3s_devm.h |  29 ++
 arch/powerpc/include/asm/kvm_host.h        |  23 ++
 arch/powerpc/include/asm/ultravisor-api.h  |   2 +
 arch/powerpc/include/asm/ultravisor.h      |  14 +
 arch/powerpc/kvm/Makefile                  |   3 +
 arch/powerpc/kvm/book3s_hv.c               |  19 +
 arch/powerpc/kvm/book3s_hv_devm.c          | 438 +++++++++++++++++++++
 8 files changed, 532 insertions(+)
 create mode 100644 arch/powerpc/include/asm/kvm_book3s_devm.h
 create mode 100644 arch/powerpc/kvm/book3s_hv_devm.c

Comments

Sukadev Bhattiprolu Aug. 29, 2019, 3:02 a.m. UTC | #1
Some minor comments/questions below. Overall, the patches look
fine to me.

> +#include <linux/pagemap.h>
> +#include <linux/migrate.h>
> +#include <linux/kvm_host.h>
> +#include <asm/ultravisor.h>
> +
> +static struct dev_pagemap kvmppc_devm_pgmap;
> +static unsigned long *kvmppc_devm_pfn_bitmap;
> +static DEFINE_SPINLOCK(kvmppc_devm_pfn_lock);

Is this lock protecting just the pfn_bitmap?

> +
> +struct kvmppc_devm_page_pvt {
> +	unsigned long *rmap;
> +	unsigned int lpid;
> +	unsigned long gpa;
> +};
> +
> +/*
> + * Get a free device PFN from the pool
> + *
> + * Called when a normal page is moved to secure memory (UV_PAGE_IN). Device
> + * PFN will be used to keep track of the secure page on HV side.
> + *
> + * @rmap here is the slot in the rmap array that corresponds to @gpa.
> + * Thus a non-zero rmap entry indicates that the corresponding guest
> + * page has become secure, and is not mapped on the HV side.
> + *
> + * NOTE: In this and subsequent functions, we pass around and access
> + * individual elements of kvm_memory_slot->arch.rmap[] without any
> + * protection. Should we use lock_rmap() here?
> + */
> +static struct page *kvmppc_devm_get_page(unsigned long *rmap, unsigned long gpa,
> +					 unsigned int lpid)
> +{
> +	struct page *dpage = NULL;
> +	unsigned long bit, devm_pfn;
> +	unsigned long flags;
> +	struct kvmppc_devm_page_pvt *pvt;
> +	unsigned long pfn_last, pfn_first;
> +
> +	if (kvmppc_rmap_is_devm_pfn(*rmap))
> +		return NULL;
> +
> +	pfn_first = kvmppc_devm_pgmap.res.start >> PAGE_SHIFT;
> +	pfn_last = pfn_first +
> +		   (resource_size(&kvmppc_devm_pgmap.res) >> PAGE_SHIFT);
> +	spin_lock_irqsave(&kvmppc_devm_pfn_lock, flags);

Blank lines around spin_lock() would help.

> +	bit = find_first_zero_bit(kvmppc_devm_pfn_bitmap, pfn_last - pfn_first);
> +	if (bit >= (pfn_last - pfn_first))
> +		goto out;
> +
> +	bitmap_set(kvmppc_devm_pfn_bitmap, bit, 1);
> +	devm_pfn = bit + pfn_first;

Can we drop the &kvmppc_devm_pfn_lock here or after the trylock_page()?
Or does it protect the ->zone_device_data assignment below as well?
If so, maybe drop the 'pfn_' from the name of the lock?

Besides, we don't seem to hold this lock when accessing ->zone_device_data
in kvmppc_share_page(). Maybe &kvmppc_devm_pfn_lock just protects the bitmap?


> +	dpage = pfn_to_page(devm_pfn);

Does this code and hence CONFIG_PPC_UV depend on a specific model like
CONFIG_SPARSEMEM_VMEMMAP?
> +
> +	if (!trylock_page(dpage))
> +		goto out_clear;
> +
> +	*rmap = devm_pfn | KVMPPC_RMAP_DEVM_PFN;
> +	pvt = kzalloc(sizeof(*pvt), GFP_ATOMIC);
> +	if (!pvt)
> +		goto out_unlock;
> +	pvt->rmap = rmap;
> +	pvt->gpa = gpa;
> +	pvt->lpid = lpid;
> +	dpage->zone_device_data = pvt;

->zone_device_data is set after locking the dpage here, but in
kvmppc_share_page() and kvmppc_devm_fault_migrate_alloc_and_copy()
it is accessed without locking the page?

> +	spin_unlock_irqrestore(&kvmppc_devm_pfn_lock, flags);
> +
> +	get_page(dpage);
> +	return dpage;
> +
> +out_unlock:
> +	unlock_page(dpage);
> +out_clear:
> +	bitmap_clear(kvmppc_devm_pfn_bitmap, devm_pfn - pfn_first, 1);
> +out:
> +	spin_unlock_irqrestore(&kvmppc_devm_pfn_lock, flags);
> +	return NULL;
> +}
> +
> +/*
> + * Alloc a PFN from private device memory pool and copy page from normal
> + * memory to secure memory.
> + */
> +static int
> +kvmppc_devm_migrate_alloc_and_copy(struct migrate_vma *mig,
> +				   unsigned long *rmap, unsigned long gpa,
> +				   unsigned int lpid, unsigned long page_shift)
> +{
> +	struct page *spage = migrate_pfn_to_page(*mig->src);
> +	unsigned long pfn = *mig->src >> MIGRATE_PFN_SHIFT;
> +	struct page *dpage;
> +
> +	*mig->dst = 0;
> +	if (!spage || !(*mig->src & MIGRATE_PFN_MIGRATE))
> +		return 0;
> +
> +	dpage = kvmppc_devm_get_page(rmap, gpa, lpid);
> +	if (!dpage)
> +		return -EINVAL;
> +
> +	if (spage)
> +		uv_page_in(lpid, pfn << page_shift, gpa, 0, page_shift);
> +
> +	*mig->dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> +	return 0;
> +}
> +
> +/*
> + * Move page from normal memory to secure memory.
> + */
> +unsigned long
> +kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
> +		     unsigned long flags, unsigned long page_shift)
> +{
> +	unsigned long addr, end;
> +	unsigned long src_pfn, dst_pfn;

These are the host frame numbers, correct? Trying to distinguish them
from 'gfn' and 'gpa' used in the function.

> +	struct migrate_vma mig;
> +	struct vm_area_struct *vma;
> +	int srcu_idx;
> +	unsigned long gfn = gpa >> page_shift;
> +	struct kvm_memory_slot *slot;
> +	unsigned long *rmap;
> +	int ret;
> +
> +	if (page_shift != PAGE_SHIFT)
> +		return H_P3;
> +
> +	if (flags)
> +		return H_P2;
> +
> +	ret = H_PARAMETER;
> +	down_read(&kvm->mm->mmap_sem);
> +	srcu_idx = srcu_read_lock(&kvm->srcu);
> +	slot = gfn_to_memslot(kvm, gfn);

Can slot be NULL? could be a bug in UV...

> +	rmap = &slot->arch.rmap[gfn - slot->base_gfn];
> +	addr = gfn_to_hva(kvm, gpa >> page_shift);

Use 'gfn' as the second parameter? 

Nit. for consistency with gpa and gfn, maybe rename 'addr' to
'hva' or to match 'end' maybe to 'start'.

Also, can we check 'kvmppc_rmap_is_devm_pfn(*rmap)' here and bail out
if it's already shared? We currently do it further down the call chain
in kvmppc_devm_get_page() after doing more work.


> +	if (kvm_is_error_hva(addr))
> +		goto out;
> +
> +	end = addr + (1UL << page_shift);
> +	vma = find_vma_intersection(kvm->mm, addr, end);
> +	if (!vma || vma->vm_start > addr || vma->vm_end < end)
> +		goto out;
> +
> +	memset(&mig, 0, sizeof(mig));
> +	mig.vma = vma;
> +	mig.start = addr;
> +	mig.end = end;
> +	mig.src = &src_pfn;
> +	mig.dst = &dst_pfn;
> +
> +	if (migrate_vma_setup(&mig))
> +		goto out;
> +
> +	if (kvmppc_devm_migrate_alloc_and_copy(&mig, rmap, gpa,
> +					       kvm->arch.lpid, page_shift))
> +		goto out_finalize;
> +
> +	migrate_vma_pages(&mig);
> +	ret = H_SUCCESS;
> +out_finalize:
> +	migrate_vma_finalize(&mig);
> +out:
> +	srcu_read_unlock(&kvm->srcu, srcu_idx);
> +	up_read(&kvm->mm->mmap_sem);
> +	return ret;
> +}
> +
> +/*
> + * Provision a new page on HV side and copy over the contents
> + * from secure memory.
> + */
> +static int
> +kvmppc_devm_fault_migrate_alloc_and_copy(struct migrate_vma *mig,
> +					 unsigned long page_shift)
> +{
> +	struct page *dpage, *spage;
> +	struct kvmppc_devm_page_pvt *pvt;
> +	unsigned long pfn;
> +	int ret;
> +
> +	spage = migrate_pfn_to_page(*mig->src);
> +	if (!spage || !(*mig->src & MIGRATE_PFN_MIGRATE))
> +		return 0;
> +	if (!is_zone_device_page(spage))
> +		return 0;

What does it mean if it's not a zone_device page at this point? Caller
would then proceed to migrate_vma_pages() if we return 0, right?

> +
> +	dpage = alloc_page_vma(GFP_HIGHUSER, mig->vma, mig->start);
> +	if (!dpage)
> +		return -EINVAL;
> +	lock_page(dpage);
> +	pvt = spage->zone_device_data;
> +
> +	pfn = page_to_pfn(dpage);
> +	ret = uv_page_out(pvt->lpid, pfn << page_shift, pvt->gpa, 0,
> +			  page_shift);
> +	if (ret == U_SUCCESS)
> +		*mig->dst = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
> +	else {
> +		unlock_page(dpage);
> +		__free_page(dpage);
> +	}
> +	return ret;
> +}
> +
> +/*
> + * Fault handler callback when HV touches any page that has been
> + * moved to secure memory, we ask UV to give back the page by
> + * issuing a UV_PAGE_OUT uvcall.
> + *
> + * This eventually results in dropping of device PFN and the newly
> + * provisioned page/PFN gets populated in QEMU page tables.
> + */
> +static vm_fault_t kvmppc_devm_migrate_to_ram(struct vm_fault *vmf)
> +{
> +	unsigned long src_pfn, dst_pfn = 0;
> +	struct migrate_vma mig;
> +	int ret = 0;
> +
> +	memset(&mig, 0, sizeof(mig));
> +	mig.vma = vmf->vma;
> +	mig.start = vmf->address;
> +	mig.end = vmf->address + PAGE_SIZE;
> +	mig.src = &src_pfn;
> +	mig.dst = &dst_pfn;
> +
> +	if (migrate_vma_setup(&mig)) {
> +		ret = VM_FAULT_SIGBUS;
> +		goto out;
> +	}
> +
> +	if (kvmppc_devm_fault_migrate_alloc_and_copy(&mig, PAGE_SHIFT)) {
> +		ret = VM_FAULT_SIGBUS;
> +		goto out_finalize;
> +	}
> +
> +	migrate_vma_pages(&mig);
> +out_finalize:
> +	migrate_vma_finalize(&mig);
> +out:
> +	return ret;
> +}
> +
> +/*
> + * Release the device PFN back to the pool
> + *
> + * Gets called when secure page becomes a normal page during UV_PAGE_OUT.

Nit: Should that be H_SVM_PAGE_OUT?

> + */
> +static void kvmppc_devm_page_free(struct page *page)
> +{
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned long flags;
> +	struct kvmppc_devm_page_pvt *pvt;
> +
> +	spin_lock_irqsave(&kvmppc_devm_pfn_lock, flags);
> +	pvt = page->zone_device_data;
> +	page->zone_device_data = NULL;

If the pfn_lock only protects the bitmap, would it be better to move
it here?

> +
> +	bitmap_clear(kvmppc_devm_pfn_bitmap,
> +		     pfn - (kvmppc_devm_pgmap.res.start >> PAGE_SHIFT), 1);
> +	*pvt->rmap = 0;
> +	spin_unlock_irqrestore(&kvmppc_devm_pfn_lock, flags);
> +	kfree(pvt);
> +}
> +
> +static const struct dev_pagemap_ops kvmppc_devm_ops = {
> +	.page_free = kvmppc_devm_page_free,
> +	.migrate_to_ram	= kvmppc_devm_migrate_to_ram,
> +};
> +
> +/*
> + * Move page from secure memory to normal memory.
> + */
> +unsigned long
> +kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
> +		      unsigned long flags, unsigned long page_shift)
> +{
> +	struct migrate_vma mig;
> +	unsigned long addr, end;
> +	struct vm_area_struct *vma;
> +	unsigned long src_pfn, dst_pfn = 0;
> +	int srcu_idx;
> +	int ret;

Nit: Not sure it's a coding style requirement, but many functions seem
to "sort" these local variables in descending order of line length for
appearance :-)  (eg: migrate_vma* functions).

> +
> +	if (page_shift != PAGE_SHIFT)
> +		return H_P3;
> +
> +	if (flags)
> +		return H_P2;
> +
> +	ret = H_PARAMETER;
> +	down_read(&kvm->mm->mmap_sem);
> +	srcu_idx = srcu_read_lock(&kvm->srcu);
> +	addr = gfn_to_hva(kvm, gpa >> page_shift);
> +	if (kvm_is_error_hva(addr))
> +		goto out;
> +
> +	end = addr + (1UL << page_shift);
> +	vma = find_vma_intersection(kvm->mm, addr, end);
> +	if (!vma || vma->vm_start > addr || vma->vm_end < end)
> +		goto out;
> +
> +	memset(&mig, 0, sizeof(mig));
> +	mig.vma = vma;
> +	mig.start = addr;
> +	mig.end = end;
> +	mig.src = &src_pfn;
> +	mig.dst = &dst_pfn;
> +	if (migrate_vma_setup(&mig))
> +		goto out;
> +
> +	ret = kvmppc_devm_fault_migrate_alloc_and_copy(&mig, page_shift);
> +	if (ret)
> +		goto out_finalize;
> +
> +	migrate_vma_pages(&mig);
> +	ret = H_SUCCESS;

Nit: Blank line here?

> +out_finalize:
> +	migrate_vma_finalize(&mig);
> +out:
> +	srcu_read_unlock(&kvm->srcu, srcu_idx);
> +	up_read(&kvm->mm->mmap_sem);
> +	return ret;
> +}
> +
> +static u64 kvmppc_get_secmem_size(void)
> +{
> +	struct device_node *np;
> +	int i, len;
> +	const __be32 *prop;
> +	u64 size = 0;
> +
> +	np = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware");
> +	if (!np)
> +		goto out;
> +
> +	prop = of_get_property(np, "secure-memory-ranges", &len);
> +	if (!prop)
> +		goto out_put;
> +
> +	for (i = 0; i < len / (sizeof(*prop) * 4); i++)
> +		size += of_read_number(prop + (i * 4) + 2, 2);
> +
> +out_put:
> +	of_node_put(np);
> +out:
> +	return size;
> +}
> +
> +int kvmppc_devm_init(void)
> +{
> +	int ret = 0;
> +	unsigned long size;
> +	struct resource *res;
> +	void *addr;
> +	unsigned long pfn_last, pfn_first;
> +
> +	size = kvmppc_get_secmem_size();
> +	if (!size) {
> +		ret = -ENODEV;
> +		goto out;
> +	}
> +
> +	res = request_free_mem_region(&iomem_resource, size, "kvmppc_devm");
> +	if (IS_ERR(res)) {
> +		ret = PTR_ERR(res);
> +		goto out;
> +	}
> +
> +	kvmppc_devm_pgmap.type = MEMORY_DEVICE_PRIVATE;
> +	kvmppc_devm_pgmap.res = *res;
> +	kvmppc_devm_pgmap.ops = &kvmppc_devm_ops;
> +	addr = memremap_pages(&kvmppc_devm_pgmap, -1);
> +	if (IS_ERR(addr)) {
> +		ret = PTR_ERR(addr);
> +		goto out_free_region;
> +	}
> +
> +	pfn_first = res->start >> PAGE_SHIFT;
> +	pfn_last = pfn_first + (resource_size(res) >> PAGE_SHIFT);
> +	kvmppc_devm_pfn_bitmap = kcalloc(BITS_TO_LONGS(pfn_last - pfn_first),
> +					 sizeof(unsigned long), GFP_KERNEL);
> +	if (!kvmppc_devm_pfn_bitmap) {
> +		ret = -ENOMEM;
> +		goto out_unmap;
> +	}
> +
> +	pr_info("KVMPPC-DEVM: Secure Memory size 0x%lx\n", size);
> +	return ret;

Nit: Blank line here?

> +out_unmap:
> +	memunmap_pages(&kvmppc_devm_pgmap);
> +out_free_region:
> +	release_mem_region(res->start, size);
> +out:
> +	return ret;
> +}
> +
> +void kvmppc_devm_free(void)
> +{
> +	memunmap_pages(&kvmppc_devm_pgmap);
> +	release_mem_region(kvmppc_devm_pgmap.res.start,
> +			   resource_size(&kvmppc_devm_pgmap.res));
> +	kfree(kvmppc_devm_pfn_bitmap);
> +}
> -- 
> 2.21.0
Bharata B Rao Aug. 29, 2019, 6:56 a.m. UTC | #2
On Wed, Aug 28, 2019 at 08:02:19PM -0700, Sukadev Bhattiprolu wrote:
> Some minor comments/questions below. Overall, the patches look
> fine to me.
> 
> > +#include <linux/pagemap.h>
> > +#include <linux/migrate.h>
> > +#include <linux/kvm_host.h>
> > +#include <asm/ultravisor.h>
> > +
> > +static struct dev_pagemap kvmppc_devm_pgmap;
> > +static unsigned long *kvmppc_devm_pfn_bitmap;
> > +static DEFINE_SPINLOCK(kvmppc_devm_pfn_lock);
> 
> Is this lock protecting just the pfn_bitmap?

Yes.

> 
> > +
> > +struct kvmppc_devm_page_pvt {
> > +	unsigned long *rmap;
> > +	unsigned int lpid;
> > +	unsigned long gpa;
> > +};
> > +
> > +/*
> > + * Get a free device PFN from the pool
> > + *
> > + * Called when a normal page is moved to secure memory (UV_PAGE_IN). Device
> > + * PFN will be used to keep track of the secure page on HV side.
> > + *
> > + * @rmap here is the slot in the rmap array that corresponds to @gpa.
> > + * Thus a non-zero rmap entry indicates that the corresponding guest
> > + * page has become secure, and is not mapped on the HV side.
> > + *
> > + * NOTE: In this and subsequent functions, we pass around and access
> > + * individual elements of kvm_memory_slot->arch.rmap[] without any
> > + * protection. Should we use lock_rmap() here?
> > + */
> > +static struct page *kvmppc_devm_get_page(unsigned long *rmap, unsigned long gpa,
> > +					 unsigned int lpid)
> > +{
> > +	struct page *dpage = NULL;
> > +	unsigned long bit, devm_pfn;
> > +	unsigned long flags;
> > +	struct kvmppc_devm_page_pvt *pvt;
> > +	unsigned long pfn_last, pfn_first;
> > +
> > +	if (kvmppc_rmap_is_devm_pfn(*rmap))
> > +		return NULL;
> > +
> > +	pfn_first = kvmppc_devm_pgmap.res.start >> PAGE_SHIFT;
> > +	pfn_last = pfn_first +
> > +		   (resource_size(&kvmppc_devm_pgmap.res) >> PAGE_SHIFT);
> > +	spin_lock_irqsave(&kvmppc_devm_pfn_lock, flags);
> 
> Blank lines around spin_lock() would help.

You mean blank line before lock and after unlock to clearly see
where the lock starts and ends?

> 
> > +	bit = find_first_zero_bit(kvmppc_devm_pfn_bitmap, pfn_last - pfn_first);
> > +	if (bit >= (pfn_last - pfn_first))
> > +		goto out;
> > +
> > +	bitmap_set(kvmppc_devm_pfn_bitmap, bit, 1);
> > +	devm_pfn = bit + pfn_first;
> 
> Can we drop the &kvmppc_devm_pfn_lock here or after the trylock_page()?
> Or does it also protect the ->zone_device_data' assignment below as well?
> If so, maybe drop the 'pfn_' from the name of the lock?
> 
> Besides, we don't seem to hold this lock when accessing ->zone_device_data
> in kvmppc_share_page(). Maybe &kvmppc_devm_pfn_lock just protects the bitmap?

Will move the unlock appropriately.

> 
> 
> > +	dpage = pfn_to_page(devm_pfn);
> 
> Does this code and hence CONFIG_PPC_UV depend on a specific model like
> CONFIG_SPARSEMEM_VMEMMAP?

I don't think so. Irrespective of that pfn_to_page() should just work
for us.

> > +
> > +	if (!trylock_page(dpage))
> > +		goto out_clear;
> > +
> > +	*rmap = devm_pfn | KVMPPC_RMAP_DEVM_PFN;
> > +	pvt = kzalloc(sizeof(*pvt), GFP_ATOMIC);
> > +	if (!pvt)
> > +		goto out_unlock;
> > +	pvt->rmap = rmap;
> > +	pvt->gpa = gpa;
> > +	pvt->lpid = lpid;
> > +	dpage->zone_device_data = pvt;
> 
> ->zone_device_data is set after locking the dpage here, but in
> kvmppc_share_page() and kvmppc_devm_fault_migrate_alloc_and_copy()
> it is accessed without locking the page?
> 
> > +	spin_unlock_irqrestore(&kvmppc_devm_pfn_lock, flags);
> > +
> > +	get_page(dpage);
> > +	return dpage;
> > +
> > +out_unlock:
> > +	unlock_page(dpage);
> > +out_clear:
> > +	bitmap_clear(kvmppc_devm_pfn_bitmap, devm_pfn - pfn_first, 1);
> > +out:
> > +	spin_unlock_irqrestore(&kvmppc_devm_pfn_lock, flags);
> > +	return NULL;
> > +}
> > +
> > +/*
> > + * Alloc a PFN from private device memory pool and copy page from normal
> > + * memory to secure memory.
> > + */
> > +static int
> > +kvmppc_devm_migrate_alloc_and_copy(struct migrate_vma *mig,
> > +				   unsigned long *rmap, unsigned long gpa,
> > +				   unsigned int lpid, unsigned long page_shift)
> > +{
> > +	struct page *spage = migrate_pfn_to_page(*mig->src);
> > +	unsigned long pfn = *mig->src >> MIGRATE_PFN_SHIFT;
> > +	struct page *dpage;
> > +
> > +	*mig->dst = 0;
> > +	if (!spage || !(*mig->src & MIGRATE_PFN_MIGRATE))
> > +		return 0;
> > +
> > +	dpage = kvmppc_devm_get_page(rmap, gpa, lpid);
> > +	if (!dpage)
> > +		return -EINVAL;
> > +
> > +	if (spage)
> > +		uv_page_in(lpid, pfn << page_shift, gpa, 0, page_shift);
> > +
> > +	*mig->dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> > +	return 0;
> > +}
> > +
> > +/*
> > + * Move page from normal memory to secure memory.
> > + */
> > +unsigned long
> > +kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
> > +		     unsigned long flags, unsigned long page_shift)
> > +{
> > +	unsigned long addr, end;
> > +	unsigned long src_pfn, dst_pfn;
> 
> These are the host frame numbers correct? Trying to distinguish them
> from 'gfn' and 'gpa' used in the function.

Yes host pfns.

> 
> > +	struct migrate_vma mig;
> > +	struct vm_area_struct *vma;
> > +	int srcu_idx;
> > +	unsigned long gfn = gpa >> page_shift;
> > +	struct kvm_memory_slot *slot;
> > +	unsigned long *rmap;
> > +	int ret;
> > +
> > +	if (page_shift != PAGE_SHIFT)
> > +		return H_P3;
> > +
> > +	if (flags)
> > +		return H_P2;
> > +
> > +	ret = H_PARAMETER;
> > +	down_read(&kvm->mm->mmap_sem);
> > +	srcu_idx = srcu_read_lock(&kvm->srcu);
> > +	slot = gfn_to_memslot(kvm, gfn);
> 
> Can slot be NULL? could be a bug in UV...

Will add a check to test this failure.

> 
> > +	rmap = &slot->arch.rmap[gfn - slot->base_gfn];
> > +	addr = gfn_to_hva(kvm, gpa >> page_shift);
> 
> Use 'gfn' as the second parameter? 

Yes.

> 
> Nit. for consistency with gpa and gfn, maybe rename 'addr' to
> 'hva' or to match 'end' maybe to 'start'.

Guess using hva improves readability, sure.

> 
> Also, can we check 'kvmppc_rmap_is_devm_pfn(*rmap)' here and bail out
> if its already shared? We currently do it further down the call chain
> in kvmppc_devm_get_page() after doing more work.

If the page is already shared, we just give the same back to UV if
UV indeed asks for it to be re-shared.

That said, I think we can have kvmppc_rmap_is_devm_pfn early in
regular page-in (non-shared case) path so that we don't even setup
anything required for migrate_vma_pages.

> 
> 
> > +	if (kvm_is_error_hva(addr))
> > +		goto out;
> > +
> > +	end = addr + (1UL << page_shift);
> > +	vma = find_vma_intersection(kvm->mm, addr, end);
> > +	if (!vma || vma->vm_start > addr || vma->vm_end < end)
> > +		goto out;
> > +
> > +	memset(&mig, 0, sizeof(mig));
> > +	mig.vma = vma;
> > +	mig.start = addr;
> > +	mig.end = end;
> > +	mig.src = &src_pfn;
> > +	mig.dst = &dst_pfn;
> > +
> > +	if (migrate_vma_setup(&mig))
> > +		goto out;
> > +
> > +	if (kvmppc_devm_migrate_alloc_and_copy(&mig, rmap, gpa,
> > +					       kvm->arch.lpid, page_shift))
> > +		goto out_finalize;
> > +
> > +	migrate_vma_pages(&mig);
> > +	ret = H_SUCCESS;
> > +out_finalize:
> > +	migrate_vma_finalize(&mig);
> > +out:
> > +	srcu_read_unlock(&kvm->srcu, srcu_idx);
> > +	up_read(&kvm->mm->mmap_sem);
> > +	return ret;
> > +}
> > +
> > +/*
> > + * Provision a new page on HV side and copy over the contents
> > + * from secure memory.
> > + */
> > +static int
> > +kvmppc_devm_fault_migrate_alloc_and_copy(struct migrate_vma *mig,
> > +					 unsigned long page_shift)
> > +{
> > +	struct page *dpage, *spage;
> > +	struct kvmppc_devm_page_pvt *pvt;
> > +	unsigned long pfn;
> > +	int ret;
> > +
> > +	spage = migrate_pfn_to_page(*mig->src);
> > +	if (!spage || !(*mig->src & MIGRATE_PFN_MIGRATE))
> > +		return 0;
> > +	if (!is_zone_device_page(spage))
> > +		return 0;
> 
> What does it mean if its not a zone_device page at this point? Caller
> would then proceed to migrage_vma_pages() if we return 0 right?

kvmppc_devm_fault_migrate_alloc_and_copy() can be called from two paths:

1. Fault path when HV touches the secure page. In this case the page
has to be a device page.

2. When page-out is issued for a page that is already paged-in. In this
case also it has to be a device page.

For both the above cases, that check is redundant.

There is a 3rd case which is possible. If UV ever issues a page-out
for a shared page, this check will result in page-out hcall silently
succeeding w/o doing any migration (as we don't populate the dst_pfn)

> 
> > +
> > +	dpage = alloc_page_vma(GFP_HIGHUSER, mig->vma, mig->start);
> > +	if (!dpage)
> > +		return -EINVAL;
> > +	lock_page(dpage);
> > +	pvt = spage->zone_device_data;
> > +
> > +	pfn = page_to_pfn(dpage);
> > +	ret = uv_page_out(pvt->lpid, pfn << page_shift, pvt->gpa, 0,
> > +			  page_shift);
> > +	if (ret == U_SUCCESS)
> > +		*mig->dst = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
> > +	else {
> > +		unlock_page(dpage);
> > +		__free_page(dpage);
> > +	}
> > +	return ret;
> > +}
> > +
> > +/*
> > + * Fault handler callback when HV touches any page that has been
> > + * moved to secure memory, we ask UV to give back the page by
> > + * issuing a UV_PAGE_OUT uvcall.
> > + *
> > + * This eventually results in dropping of device PFN and the newly
> > + * provisioned page/PFN gets populated in QEMU page tables.
> > + */
> > +static vm_fault_t kvmppc_devm_migrate_to_ram(struct vm_fault *vmf)
> > +{
> > +	unsigned long src_pfn, dst_pfn = 0;
> > +	struct migrate_vma mig;
> > +	int ret = 0;
> > +
> > +	memset(&mig, 0, sizeof(mig));
> > +	mig.vma = vmf->vma;
> > +	mig.start = vmf->address;
> > +	mig.end = vmf->address + PAGE_SIZE;
> > +	mig.src = &src_pfn;
> > +	mig.dst = &dst_pfn;
> > +
> > +	if (migrate_vma_setup(&mig)) {
> > +		ret = VM_FAULT_SIGBUS;
> > +		goto out;
> > +	}
> > +
> > +	if (kvmppc_devm_fault_migrate_alloc_and_copy(&mig, PAGE_SHIFT)) {
> > +		ret = VM_FAULT_SIGBUS;
> > +		goto out_finalize;
> > +	}
> > +
> > +	migrate_vma_pages(&mig);
> > +out_finalize:
> > +	migrate_vma_finalize(&mig);
> > +out:
> > +	return ret;
> > +}
> > +
> > +/*
> > + * Release the device PFN back to the pool
> > + *
> > + * Gets called when secure page becomes a normal page during UV_PAGE_OUT.
> 
> Nit: Should that be H_SVM_PAGE_OUT?

Yes, will reword.

> 
> > + */
> > +static void kvmppc_devm_page_free(struct page *page)
> > +{
> > +	unsigned long pfn = page_to_pfn(page);
> > +	unsigned long flags;
> > +	struct kvmppc_devm_page_pvt *pvt;
> > +
> > +	spin_lock_irqsave(&kvmppc_devm_pfn_lock, flags);
> > +	pvt = page->zone_device_data;
> > +	page->zone_device_data = NULL;
> 
> If the pfn_lock only protects the bitmap, would be better to move
> it here?

Yes.

> 
> > +
> > +	bitmap_clear(kvmppc_devm_pfn_bitmap,
> > +		     pfn - (kvmppc_devm_pgmap.res.start >> PAGE_SHIFT), 1);
> > +	*pvt->rmap = 0;
> > +	spin_unlock_irqrestore(&kvmppc_devm_pfn_lock, flags);
> > +	kfree(pvt);
> > +}
> > +
> > +static const struct dev_pagemap_ops kvmppc_devm_ops = {
> > +	.page_free = kvmppc_devm_page_free,
> > +	.migrate_to_ram	= kvmppc_devm_migrate_to_ram,
> > +};
> > +
> > +/*
> > + * Move page from secure memory to normal memory.
> > + */
> > +unsigned long
> > +kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
> > +		      unsigned long flags, unsigned long page_shift)
> > +{
> > +	struct migrate_vma mig;
> > +	unsigned long addr, end;
> > +	struct vm_area_struct *vma;
> > +	unsigned long src_pfn, dst_pfn = 0;
> > +	int srcu_idx;
> > +	int ret;
> 
> Nit: Not sure its a coding style requirement, but many functions seem
> to "sort" these local variables in descending order of line length for
> appearance :-)  (eg: migrate_vma* functions).

It has ended up like this over multiple versions when variables got added,
moved and re-added.

> 
> > +
> > +	if (page_shift != PAGE_SHIFT)
> > +		return H_P3;
> > +
> > +	if (flags)
> > +		return H_P2;
> > +
> > +	ret = H_PARAMETER;
> > +	down_read(&kvm->mm->mmap_sem);
> > +	srcu_idx = srcu_read_lock(&kvm->srcu);
> > +	addr = gfn_to_hva(kvm, gpa >> page_shift);
> > +	if (kvm_is_error_hva(addr))
> > +		goto out;
> > +
> > +	end = addr + (1UL << page_shift);
> > +	vma = find_vma_intersection(kvm->mm, addr, end);
> > +	if (!vma || vma->vm_start > addr || vma->vm_end < end)
> > +		goto out;
> > +
> > +	memset(&mig, 0, sizeof(mig));
> > +	mig.vma = vma;
> > +	mig.start = addr;
> > +	mig.end = end;
> > +	mig.src = &src_pfn;
> > +	mig.dst = &dst_pfn;
> > +	if (migrate_vma_setup(&mig))
> > +		goto out;
> > +
> > +	ret = kvmppc_devm_fault_migrate_alloc_and_copy(&mig, page_shift);
> > +	if (ret)
> > +		goto out_finalize;
> > +
> > +	migrate_vma_pages(&mig);
> > +	ret = H_SUCCESS;
> 
> Nit: Blank line here?

With a blank line above the label line (which is blank for the most part),
it looks like a bit too much blank space to me :)

However I do have blank line at a few other places. I have been removing
them whenever I touch the surrounding lines.

Thanks for your review.

Christoph - You did review this patch in the last iteration. Do you have
any additional comments?

Regards,
Bharata.
Christoph Hellwig Aug. 29, 2019, 8:38 a.m. UTC | #3
On Thu, Aug 22, 2019 at 03:56:14PM +0530, Bharata B Rao wrote:
> +/*
> + * Bits 60:56 in the rmap entry will be used to identify the
> + * different uses/functions of rmap.
> + */
> +#define KVMPPC_RMAP_DEVM_PFN	(0x2ULL << 56)

How did you come up with this specific value?

> +
> +static inline bool kvmppc_rmap_is_devm_pfn(unsigned long pfn)
> +{
> +	return !!(pfn & KVMPPC_RMAP_DEVM_PFN);
> +}

No need for !! when returning a bool.  Also the helper seems a little
pointless, just opencoding it would make the code more readable in my
opinion.

> +#ifdef CONFIG_PPC_UV
> +extern int kvmppc_devm_init(void);
> +extern void kvmppc_devm_free(void);

There is no need for extern in a function declaration.

> +static int
> +kvmppc_devm_migrate_alloc_and_copy(struct migrate_vma *mig,
> +				   unsigned long *rmap, unsigned long gpa,
> +				   unsigned int lpid, unsigned long page_shift)
> +{
> +	struct page *spage = migrate_pfn_to_page(*mig->src);
> +	unsigned long pfn = *mig->src >> MIGRATE_PFN_SHIFT;
> +	struct page *dpage;
> +
> +	*mig->dst = 0;
> +	if (!spage || !(*mig->src & MIGRATE_PFN_MIGRATE))
> +		return 0;
> +
> +	dpage = kvmppc_devm_get_page(rmap, gpa, lpid);
> +	if (!dpage)
> +		return -EINVAL;
> +
> +	if (spage)
> +		uv_page_in(lpid, pfn << page_shift, gpa, 0, page_shift);
> +
> +	*mig->dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> +	return 0;
> +}

I think you can just merge this trivial helper into the only caller.

> +static int
> +kvmppc_devm_fault_migrate_alloc_and_copy(struct migrate_vma *mig,
> +					 unsigned long page_shift)
> +{
> +	struct page *dpage, *spage;
> +	struct kvmppc_devm_page_pvt *pvt;
> +	unsigned long pfn;
> +	int ret;
> +
> +	spage = migrate_pfn_to_page(*mig->src);
> +	if (!spage || !(*mig->src & MIGRATE_PFN_MIGRATE))
> +		return 0;
> +	if (!is_zone_device_page(spage))
> +		return 0;
> +
> +	dpage = alloc_page_vma(GFP_HIGHUSER, mig->vma, mig->start);
> +	if (!dpage)
> +		return -EINVAL;
> +	lock_page(dpage);
> +	pvt = spage->zone_device_data;
> +
> +	pfn = page_to_pfn(dpage);
> +	ret = uv_page_out(pvt->lpid, pfn << page_shift, pvt->gpa, 0,
> +			  page_shift);
> +	if (ret == U_SUCCESS)
> +		*mig->dst = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
> +	else {
> +		unlock_page(dpage);
> +		__free_page(dpage);
> +	}
> +	return ret;
> +}

Here we actually have two callers, but they have a fair amount of
duplicate code in them.  I think you want to move that common
code (including setting up the migrate_vma structure) into this
function and maybe also give it a more descriptive name.

> +static void kvmppc_devm_page_free(struct page *page)
> +{
> +	unsigned long pfn = page_to_pfn(page);
> +	unsigned long flags;
> +	struct kvmppc_devm_page_pvt *pvt;
> +
> +	spin_lock_irqsave(&kvmppc_devm_pfn_lock, flags);
> +	pvt = page->zone_device_data;
> +	page->zone_device_data = NULL;
> +
> +	bitmap_clear(kvmppc_devm_pfn_bitmap,
> +		     pfn - (kvmppc_devm_pgmap.res.start >> PAGE_SHIFT), 1);

Nit: I'd just initialize pfn to the value you want from the start.
That makes the code a little easier to read, and keeps a tiny bit more
code outside the spinlock.

	unsigned long pfn = page_to_pfn(page) -
			(kvmppc_devm_pgmap.res.start >> PAGE_SHIFT);

	..

	 bitmap_clear(kvmppc_devm_pfn_bitmap, pfn, 1);


> +	kvmppc_devm_pgmap.type = MEMORY_DEVICE_PRIVATE;
> +	kvmppc_devm_pgmap.res = *res;
> +	kvmppc_devm_pgmap.ops = &kvmppc_devm_ops;
> +	addr = memremap_pages(&kvmppc_devm_pgmap, -1);

This -1 should be NUMA_NO_NODE for clarity.
Sukadev Bhattiprolu Aug. 29, 2019, 7:39 p.m. UTC | #4
Bharata B Rao [bharata@linux.ibm.com] wrote:
> On Wed, Aug 28, 2019 at 08:02:19PM -0700, Sukadev Bhattiprolu wrote:
> > Some minor comments/questions below. Overall, the patches look
> > fine to me.
> > 
> > > +#include <linux/pagemap.h>
> > > +#include <linux/migrate.h>
> > > +#include <linux/kvm_host.h>
> > > +#include <asm/ultravisor.h>
> > > +
> > > +static struct dev_pagemap kvmppc_devm_pgmap;
> > > +static unsigned long *kvmppc_devm_pfn_bitmap;
> > > +static DEFINE_SPINLOCK(kvmppc_devm_pfn_lock);
> > 
> > Is this lock protecting just the pfn_bitmap?
> 
> Yes.
> 
> > 
> > > +
> > > +struct kvmppc_devm_page_pvt {
> > > +	unsigned long *rmap;
> > > +	unsigned int lpid;
> > > +	unsigned long gpa;
> > > +};
> > > +
> > > +/*
> > > + * Get a free device PFN from the pool
> > > + *
> > > + * Called when a normal page is moved to secure memory (UV_PAGE_IN). Device
> > > + * PFN will be used to keep track of the secure page on HV side.
> > > + *
> > > + * @rmap here is the slot in the rmap array that corresponds to @gpa.
> > > + * Thus a non-zero rmap entry indicates that the corresponding guest
> > > + * page has become secure, and is not mapped on the HV side.
> > > + *
> > > + * NOTE: In this and subsequent functions, we pass around and access
> > > + * individual elements of kvm_memory_slot->arch.rmap[] without any
> > > + * protection. Should we use lock_rmap() here?

Where do we serialize two threads attempting to H_SVM_PAGE_IN the same gfn
at the same time? Or one thread issuing a H_SVM_PAGE_IN and another a
H_SVM_PAGE_OUT for the same page?

> > > + */
> > > +static struct page *kvmppc_devm_get_page(unsigned long *rmap, unsigned long gpa,
> > > +					 unsigned int lpid)
> > > +{
> > > +	struct page *dpage = NULL;
> > > +	unsigned long bit, devm_pfn;
> > > +	unsigned long flags;
> > > +	struct kvmppc_devm_page_pvt *pvt;
> > > +	unsigned long pfn_last, pfn_first;
> > > +
> > > +	if (kvmppc_rmap_is_devm_pfn(*rmap))
> > > +		return NULL;
> > > +
> > > +	pfn_first = kvmppc_devm_pgmap.res.start >> PAGE_SHIFT;
> > > +	pfn_last = pfn_first +
> > > +		   (resource_size(&kvmppc_devm_pgmap.res) >> PAGE_SHIFT);
> > > +	spin_lock_irqsave(&kvmppc_devm_pfn_lock, flags);
> > 
> > Blank lines around spin_lock() would help.
> 
> You mean blank line before lock and after unlock to clearly see
> where the lock starts and ends?
> 
> > 
> > > +	bit = find_first_zero_bit(kvmppc_devm_pfn_bitmap, pfn_last - pfn_first);
> > > +	if (bit >= (pfn_last - pfn_first))
> > > +		goto out;
> > > +
> > > +	bitmap_set(kvmppc_devm_pfn_bitmap, bit, 1);
> > > +	devm_pfn = bit + pfn_first;
> > 
> > Can we drop the &kvmppc_devm_pfn_lock here or after the trylock_page()?
> > Or does it also protect the ->zone_device_data' assignment below as well?
> > If so, maybe drop the 'pfn_' from the name of the lock?
> > 
> > Besides, we don't seem to hold this lock when accessing ->zone_device_data
> > in kvmppc_share_page(). Maybe &kvmppc_devm_pfn_lock just protects the bitmap?
> 
> Will move the unlock appropriately.
> 
> > 
> > 
> > > +	dpage = pfn_to_page(devm_pfn);
> > 
> > Does this code and hence CONFIG_PPC_UV depend on a specific model like
> > CONFIG_SPARSEMEM_VMEMMAP?
> 
> I don't think so. Irrespective of that pfn_to_page() should just work
> for us.
> 
> > > +
> > > +	if (!trylock_page(dpage))
> > > +		goto out_clear;
> > > +
> > > +	*rmap = devm_pfn | KVMPPC_RMAP_DEVM_PFN;
> > > +	pvt = kzalloc(sizeof(*pvt), GFP_ATOMIC);
> > > +	if (!pvt)
> > > +		goto out_unlock;

If we fail to alloc, we don't clear the KVMPPC_RMAP_DEVM_PFN?

Also, when/where do we clear this flag on an uv-page-out?
kvmppc_devm_drop_pages() drops the flag on a local variable but not
in the rmap? If we don't clear the flag on page-out, would the
subsequent H_SVM_PAGE_IN of this page fail?

> > > +	pvt->rmap = rmap;
> > > +	pvt->gpa = gpa;
> > > +	pvt->lpid = lpid;
> > > +	dpage->zone_device_data = pvt;
> > 
> > ->zone_device_data is set after locking the dpage here, but in
> > kvmppc_share_page() and kvmppc_devm_fault_migrate_alloc_and_copy()
> > it is accessed without locking the page?
> > 
> > > +	spin_unlock_irqrestore(&kvmppc_devm_pfn_lock, flags);
> > > +
> > > +	get_page(dpage);
> > > +	return dpage;
> > > +
> > > +out_unlock:
> > > +	unlock_page(dpage);
> > > +out_clear:
> > > +	bitmap_clear(kvmppc_devm_pfn_bitmap, devm_pfn - pfn_first, 1);
> > > +out:
> > > +	spin_unlock_irqrestore(&kvmppc_devm_pfn_lock, flags);
> > > +	return NULL;
> > > +}
> > > +
> > > +/*
> > > + * Alloc a PFN from private device memory pool and copy page from normal
> > > + * memory to secure memory.
> > > + */
> > > +static int
> > > +kvmppc_devm_migrate_alloc_and_copy(struct migrate_vma *mig,
> > > +				   unsigned long *rmap, unsigned long gpa,
> > > +				   unsigned int lpid, unsigned long page_shift)
> > > +{
> > > +	struct page *spage = migrate_pfn_to_page(*mig->src);
> > > +	unsigned long pfn = *mig->src >> MIGRATE_PFN_SHIFT;
> > > +	struct page *dpage;
> > > +
> > > +	*mig->dst = 0;
> > > +	if (!spage || !(*mig->src & MIGRATE_PFN_MIGRATE))
> > > +		return 0;
> > > +
> > > +	dpage = kvmppc_devm_get_page(rmap, gpa, lpid);
> > > +	if (!dpage)
> > > +		return -EINVAL;
> > > +
> > > +	if (spage)
> > > +		uv_page_in(lpid, pfn << page_shift, gpa, 0, page_shift);
> > > +
> > > +	*mig->dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> > > +	return 0;
> > > +}
> > > +
> > > +/*
> > > + * Move page from normal memory to secure memory.
> > > + */
> > > +unsigned long
> > > +kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
> > > +		     unsigned long flags, unsigned long page_shift)
> > > +{
> > > +	unsigned long addr, end;
> > > +	unsigned long src_pfn, dst_pfn;
> > 
> > These are the host frame numbers correct? Trying to distinguish them
> > from 'gfn' and 'gpa' used in the function.
> 
> Yes host pfns.
> 
> > 
> > > +	struct migrate_vma mig;
> > > +	struct vm_area_struct *vma;
> > > +	int srcu_idx;
> > > +	unsigned long gfn = gpa >> page_shift;
> > > +	struct kvm_memory_slot *slot;
> > > +	unsigned long *rmap;
> > > +	int ret;
> > > +
> > > +	if (page_shift != PAGE_SHIFT)
> > > +		return H_P3;
> > > +
> > > +	if (flags)
> > > +		return H_P2;
> > > +
> > > +	ret = H_PARAMETER;
> > > +	down_read(&kvm->mm->mmap_sem);
> > > +	srcu_idx = srcu_read_lock(&kvm->srcu);
> > > +	slot = gfn_to_memslot(kvm, gfn);
> > 
> > Can slot be NULL? could be a bug in UV...
> 
> Will add a check to test this failure.
> 
> > 
> > > +	rmap = &slot->arch.rmap[gfn - slot->base_gfn];
> > > +	addr = gfn_to_hva(kvm, gpa >> page_shift);
> > 
> > Use 'gfn' as the second parameter? 
> 
> Yes.
> 
> > 
> > Nit. for consistency with gpa and gfn, maybe rename 'addr' to
> > 'hva' or to match 'end' maybe to 'start'.
> 
> Guess using hva improves readability, sure.
> 
> > 
> > Also, can we check 'kvmppc_rmap_is_devm_pfn(*rmap)' here and bail out
> > if its already shared? We currently do it further down the call chain
> > in kvmppc_devm_get_page() after doing more work.
> 
> If the page is already shared, we just give the same back to UV if
> UV indeed asks for it to be re-shared.
> 
> That said, I think we can have kvmppc_rmap_is_devm_pfn early in
> regular page-in (non-shared case) path so that we don't even setup
> anything required for migrate_vma_pages.
> 
> > 
> > 
> > > +	if (kvm_is_error_hva(addr))
> > > +		goto out;
> > > +
> > > +	end = addr + (1UL << page_shift);
> > > +	vma = find_vma_intersection(kvm->mm, addr, end);
> > > +	if (!vma || vma->vm_start > addr || vma->vm_end < end)
> > > +		goto out;
> > > +
> > > +	memset(&mig, 0, sizeof(mig));
> > > +	mig.vma = vma;
> > > +	mig.start = addr;
> > > +	mig.end = end;
> > > +	mig.src = &src_pfn;
> > > +	mig.dst = &dst_pfn;
> > > +
> > > +	if (migrate_vma_setup(&mig))
> > > +		goto out;
> > > +
> > > +	if (kvmppc_devm_migrate_alloc_and_copy(&mig, rmap, gpa,
> > > +					       kvm->arch.lpid, page_shift))
> > > +		goto out_finalize;
> > > +
> > > +	migrate_vma_pages(&mig);
> > > +	ret = H_SUCCESS;
> > > +out_finalize:
> > > +	migrate_vma_finalize(&mig);
> > > +out:
> > > +	srcu_read_unlock(&kvm->srcu, srcu_idx);
> > > +	up_read(&kvm->mm->mmap_sem);
> > > +	return ret;
> > > +}
> > > +
> > > +/*
> > > + * Provision a new page on HV side and copy over the contents
> > > + * from secure memory.
> > > + */
> > > +static int
> > > +kvmppc_devm_fault_migrate_alloc_and_copy(struct migrate_vma *mig,
> > > +					 unsigned long page_shift)
> > > +{
> > > +	struct page *dpage, *spage;
> > > +	struct kvmppc_devm_page_pvt *pvt;
> > > +	unsigned long pfn;
> > > +	int ret;
> > > +
> > > +	spage = migrate_pfn_to_page(*mig->src);
> > > +	if (!spage || !(*mig->src & MIGRATE_PFN_MIGRATE))
> > > +		return 0;
> > > +	if (!is_zone_device_page(spage))
> > > +		return 0;
> > 
> > What does it mean if its not a zone_device page at this point? Caller
> > > would then proceed to migrate_vma_pages() if we return 0 right?
> 
> kvmppc_devm_fault_migrate_alloc_and_copy() can be called from two paths:
> 
> 1. Fault path when HV touches the secure page. In this case the page
> has to be a device page.
> 
> 2. When page-out is issued for a page that is already paged-in. In this
> case also it has to be a device page.
> 
> For both the above cases, that check is redundant.
> 
> There is a 3rd case which is possible. If UV ever issues a page-out
> for a shared page, this check will result in page-out hcall silently
> succeeding w/o doing any migration (as we don't populate the dst_pfn)

Ok. Nit. thought we can drop the "_fault" in the function name but would
collide the other "alloc_and_copy" function used during H_SVM_PAGE_IN.
If the two alloc_and_copy functions are symmetric, maybe they could
have "page_in" and "page_out" in the (already long) names.

> 
> > 
> > > +
> > > +	dpage = alloc_page_vma(GFP_HIGHUSER, mig->vma, mig->start);
> > > +	if (!dpage)
> > > +		return -EINVAL;
> > > +	lock_page(dpage);
> > > +	pvt = spage->zone_device_data;
> > > +
> > > +	pfn = page_to_pfn(dpage);
> > > +	ret = uv_page_out(pvt->lpid, pfn << page_shift, pvt->gpa, 0,
> > > +			  page_shift);
> > > +	if (ret == U_SUCCESS)
> > > +		*mig->dst = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
> > > +	else {
> > > +		unlock_page(dpage);
> > > +		__free_page(dpage);
> > > +	}
> > > +	return ret;
> > > +}
> > > +
> > > +/*
> > > + * Fault handler callback when HV touches any page that has been
> > > + * moved to secure memory, we ask UV to give back the page by
> > > + * issuing a UV_PAGE_OUT uvcall.
> > > + *
> > > + * This eventually results in dropping of device PFN and the newly
> > > + * provisioned page/PFN gets populated in QEMU page tables.
> > > + */
> > > +static vm_fault_t kvmppc_devm_migrate_to_ram(struct vm_fault *vmf)
> > > +{
> > > +	unsigned long src_pfn, dst_pfn = 0;
> > > +	struct migrate_vma mig;
> > > +	int ret = 0;
> > > +
> > > +	memset(&mig, 0, sizeof(mig));
> > > +	mig.vma = vmf->vma;
> > > +	mig.start = vmf->address;
> > > +	mig.end = vmf->address + PAGE_SIZE;
> > > +	mig.src = &src_pfn;
> > > +	mig.dst = &dst_pfn;
> > > +
> > > +	if (migrate_vma_setup(&mig)) {
> > > +		ret = VM_FAULT_SIGBUS;
> > > +		goto out;
> > > +	}
> > > +
> > > +	if (kvmppc_devm_fault_migrate_alloc_and_copy(&mig, PAGE_SHIFT)) {
> > > +		ret = VM_FAULT_SIGBUS;
> > > +		goto out_finalize;
> > > +	}
> > > +
> > > +	migrate_vma_pages(&mig);
> > > +out_finalize:
> > > +	migrate_vma_finalize(&mig);
> > > +out:
> > > +	return ret;
> > > +}
> > > +
> > > +/*
> > > + * Release the device PFN back to the pool
> > > + *
> > > + * Gets called when secure page becomes a normal page during UV_PAGE_OUT.
> > 
> > Nit: Should that be H_SVM_PAGE_OUT?
> 
> Yes, will reword.
> 
> > 
> > > + */
> > > +static void kvmppc_devm_page_free(struct page *page)
> > > +{
> > > +	unsigned long pfn = page_to_pfn(page);
> > > +	unsigned long flags;
> > > +	struct kvmppc_devm_page_pvt *pvt;
> > > +
> > > +	spin_lock_irqsave(&kvmppc_devm_pfn_lock, flags);
> > > +	pvt = page->zone_device_data;
> > > +	page->zone_device_data = NULL;
> > 
> > If the pfn_lock only protects the bitmap, would be better to move
> > it here?
> 
> Yes.
> 
> > 
> > > +
> > > +	bitmap_clear(kvmppc_devm_pfn_bitmap,
> > > +		     pfn - (kvmppc_devm_pgmap.res.start >> PAGE_SHIFT), 1);
> > > +	*pvt->rmap = 0;
> > > +	spin_unlock_irqrestore(&kvmppc_devm_pfn_lock, flags);
> > > +	kfree(pvt);
> > > +}
> > > +
> > > +static const struct dev_pagemap_ops kvmppc_devm_ops = {
> > > +	.page_free = kvmppc_devm_page_free,
> > > +	.migrate_to_ram	= kvmppc_devm_migrate_to_ram,
> > > +};
> > > +
> > > +/*
> > > + * Move page from secure memory to normal memory.
> > > + */
> > > +unsigned long
> > > +kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
> > > +		      unsigned long flags, unsigned long page_shift)
> > > +{
> > > +	struct migrate_vma mig;
> > > +	unsigned long addr, end;
> > > +	struct vm_area_struct *vma;
> > > +	unsigned long src_pfn, dst_pfn = 0;
> > > +	int srcu_idx;
> > > +	int ret;
> > 
> > Nit: Not sure its a coding style requirement, but many functions seem
> > to "sort" these local variables in descending order of line length for
> > appearance :-)  (eg: migrate_vma* functions).
> 
> It has ended up like this over multiple versions when variables got added,
> moved and re-added.
> 
> > 
> > > +
> > > +	if (page_shift != PAGE_SHIFT)
> > > +		return H_P3;
> > > +
> > > +	if (flags)
> > > +		return H_P2;
> > > +
> > > +	ret = H_PARAMETER;
> > > +	down_read(&kvm->mm->mmap_sem);
> > > +	srcu_idx = srcu_read_lock(&kvm->srcu);
> > > +	addr = gfn_to_hva(kvm, gpa >> page_shift);
> > > +	if (kvm_is_error_hva(addr))
> > > +		goto out;
> > > +
> > > +	end = addr + (1UL << page_shift);
> > > +	vma = find_vma_intersection(kvm->mm, addr, end);
> > > +	if (!vma || vma->vm_start > addr || vma->vm_end < end)
> > > +		goto out;
> > > +
> > > +	memset(&mig, 0, sizeof(mig));
> > > +	mig.vma = vma;
> > > +	mig.start = addr;
> > > +	mig.end = end;
> > > +	mig.src = &src_pfn;
> > > +	mig.dst = &dst_pfn;
> > > +	if (migrate_vma_setup(&mig))
> > > +		goto out;
> > > +
> > > +	ret = kvmppc_devm_fault_migrate_alloc_and_copy(&mig, page_shift);
> > > +	if (ret)
> > > +		goto out_finalize;
> > > +
> > > +	migrate_vma_pages(&mig);
> > > +	ret = H_SUCCESS;
> > 
> > Nit: Blank line here?
> 
> With a blank line above the label line (which is blank for the most part),
> it looks a bit too much of blank to me :)
> 
> However I do have blank line at a few other places. I have been removing
> them whenever I touch the surrounding lines.
> 
> Thanks for your review.
> 
> Christoph - You did review this patch in the last iteration. Do you have
> any additional comments?
> 
> Regards,
> Bharata.
Bharata B Rao Aug. 30, 2019, 3:42 a.m. UTC | #5
On Thu, Aug 29, 2019 at 10:38:10AM +0200, Christoph Hellwig wrote:
> On Thu, Aug 22, 2019 at 03:56:14PM +0530, Bharata B Rao wrote:
> > +/*
> > + * Bits 60:56 in the rmap entry will be used to identify the
> > + * different uses/functions of rmap.
> > + */
> > +#define KVMPPC_RMAP_DEVM_PFN	(0x2ULL << 56)
> 
> How did you come up with this specific value?

Different usage types of RMAP array are being defined.
https://patchwork.ozlabs.org/patch/1149791/

The above value is reserved for device pfn usage.

> 
> > +
> > +static inline bool kvmppc_rmap_is_devm_pfn(unsigned long pfn)
> > +{
> > +	return !!(pfn & KVMPPC_RMAP_DEVM_PFN);
> > +}
> 
> No need for !! when returning a bool.  Also the helper seems a little
> pointless, just opencoding it would make the code more readable in my
> opinion.

I expect similar routines for other usages of RMAP to come up.

> 
> > +#ifdef CONFIG_PPC_UV
> > +extern int kvmppc_devm_init(void);
> > +extern void kvmppc_devm_free(void);
> 
> There is no need for extern in a function declaration.
> 
> > +static int
> > +kvmppc_devm_migrate_alloc_and_copy(struct migrate_vma *mig,
> > +				   unsigned long *rmap, unsigned long gpa,
> > +				   unsigned int lpid, unsigned long page_shift)
> > +{
> > +	struct page *spage = migrate_pfn_to_page(*mig->src);
> > +	unsigned long pfn = *mig->src >> MIGRATE_PFN_SHIFT;
> > +	struct page *dpage;
> > +
> > +	*mig->dst = 0;
> > +	if (!spage || !(*mig->src & MIGRATE_PFN_MIGRATE))
> > +		return 0;
> > +
> > +	dpage = kvmppc_devm_get_page(rmap, gpa, lpid);
> > +	if (!dpage)
> > +		return -EINVAL;
> > +
> > +	if (spage)
> > +		uv_page_in(lpid, pfn << page_shift, gpa, 0, page_shift);
> > +
> > +	*mig->dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> > +	return 0;
> > +}
> 
> I think you can just merge this trivial helper into the only caller.

Yes I can, but felt it is nicely abstracted out to a function right now.

> 
> > +static int
> > +kvmppc_devm_fault_migrate_alloc_and_copy(struct migrate_vma *mig,
> > +					 unsigned long page_shift)
> > +{
> > +	struct page *dpage, *spage;
> > +	struct kvmppc_devm_page_pvt *pvt;
> > +	unsigned long pfn;
> > +	int ret;
> > +
> > +	spage = migrate_pfn_to_page(*mig->src);
> > +	if (!spage || !(*mig->src & MIGRATE_PFN_MIGRATE))
> > +		return 0;
> > +	if (!is_zone_device_page(spage))
> > +		return 0;
> > +
> > +	dpage = alloc_page_vma(GFP_HIGHUSER, mig->vma, mig->start);
> > +	if (!dpage)
> > +		return -EINVAL;
> > +	lock_page(dpage);
> > +	pvt = spage->zone_device_data;
> > +
> > +	pfn = page_to_pfn(dpage);
> > +	ret = uv_page_out(pvt->lpid, pfn << page_shift, pvt->gpa, 0,
> > +			  page_shift);
> > +	if (ret == U_SUCCESS)
> > +		*mig->dst = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED;
> > +	else {
> > +		unlock_page(dpage);
> > +		__free_page(dpage);
> > +	}
> > +	return ret;
> > +}
> 
> Here we actually have two callers, but they have a fair amount of
> duplicate code in them.  I think you want to move that common
> code (including setting up the migrate_vma structure) into this
> function and maybe also give it a more descriptive name.

Sure, I will give this a try. The name is already very descriptive, will
come up with an appropriate name.

BTW this file and the function prefixes in this file started out with
kvmppc_hmm, switched to kvmppc_devm when HMM routines weren't used anymore.
Now with the use of only non-dev versions, planning to switch to
kvmppc_uvmem_

> 
> > +static void kvmppc_devm_page_free(struct page *page)
> > +{
> > +	unsigned long pfn = page_to_pfn(page);
> > +	unsigned long flags;
> > +	struct kvmppc_devm_page_pvt *pvt;
> > +
> > +	spin_lock_irqsave(&kvmppc_devm_pfn_lock, flags);
> > +	pvt = page->zone_device_data;
> > +	page->zone_device_data = NULL;
> > +
> > +	bitmap_clear(kvmppc_devm_pfn_bitmap,
> > +		     pfn - (kvmppc_devm_pgmap.res.start >> PAGE_SHIFT), 1);
> 
> Nit: I'd just initialize pfn to the value you want from the start.
> That makes the code a little easier to read, and keeps a tiny bit more
> code outside the spinlock.
> 
> 	unsigned long pfn = page_to_pfn(page) -
> 			(kvmppc_devm_pgmap.res.start >> PAGE_SHIFT);
> 
> 	..
> 
> 	 bitmap_clear(kvmppc_devm_pfn_bitmap, pfn, 1);

Sure.

> 
> 
> > +	kvmppc_devm_pgmap.type = MEMORY_DEVICE_PRIVATE;
> > +	kvmppc_devm_pgmap.res = *res;
> > +	kvmppc_devm_pgmap.ops = &kvmppc_devm_ops;
> > +	addr = memremap_pages(&kvmppc_devm_pgmap, -1);
> 
> This -1 should be NUMA_NO_NODE for clarity.

Right.

Regards,
Bharata.
Bharata B Rao Aug. 30, 2019, 11:13 a.m. UTC | #6
On Thu, Aug 29, 2019 at 12:39:11PM -0700, Sukadev Bhattiprolu wrote:
> Bharata B Rao [bharata@linux.ibm.com] wrote:
> > On Wed, Aug 28, 2019 at 08:02:19PM -0700, Sukadev Bhattiprolu wrote:
> Where do we serialize two threads attempting to H_SVM_PAGE_IN the same gfn
> at the same time? Or one thread issuing a H_SVM_PAGE_IN and another a
> H_SVM_PAGE_OUT for the same page?

I am not serializing page-in/out calls on same gfn, I thought you take
care of that in UV, guess UV doesn't yet.

I can probably use rmap_lock() and serialize such calls in HV if UV can't
prevent such calls easily.

> > > > +
> > > > +	if (!trylock_page(dpage))
> > > > +		goto out_clear;
> > > > +
> > > > +	*rmap = devm_pfn | KVMPPC_RMAP_DEVM_PFN;
> > > > +	pvt = kzalloc(sizeof(*pvt), GFP_ATOMIC);
> > > > +	if (!pvt)
> > > > +		goto out_unlock;
> 
> If we fail to alloc, we don't clear the KVMPPC_RMAP_DEVM_PFN?

Right, I will move the assignment to *rmap to after kzalloc.

> 
> Also, when/where do we clear this flag on an uv-page-out?
> kvmppc_devm_drop_pages() drops the flag on a local variable but not
> in the rmap? If we don't clear the flag on page-out, would the
> subsequent H_SVM_PAGE_IN of this page fail?

It gets cleared in kvmppc_devm_page_free().

> 
> Ok. Nit. thought we can drop the "_fault" in the function name but would
> collide the other "alloc_and_copy" function used during H_SVM_PAGE_IN.
> If the two alloc_and_copy functions are symmetric, maybe they could
> have "page_in" and "page_out" in the (already long) names.

Christoph also suggested to reorganize these two calls. Will take care.

Regards,
Bharata.
Christoph Hellwig Sept. 2, 2019, 7:53 a.m. UTC | #7
On Fri, Aug 30, 2019 at 09:12:59AM +0530, Bharata B Rao wrote:
> On Thu, Aug 29, 2019 at 10:38:10AM +0200, Christoph Hellwig wrote:
> > On Thu, Aug 22, 2019 at 03:56:14PM +0530, Bharata B Rao wrote:
> > > +/*
> > > + * Bits 60:56 in the rmap entry will be used to identify the
> > > + * different uses/functions of rmap.
> > > + */
> > > +#define KVMPPC_RMAP_DEVM_PFN	(0x2ULL << 56)
> > 
> > How did you come up with this specific value?
> 
> Different usage types of RMAP array are being defined.
> https://patchwork.ozlabs.org/patch/1149791/
> 
> The above value is reserved for device pfn usage.

Shouldn't all these definitions go in together in a patch?  Also is bit
56+ a set of values, so is there 1 << 56 and 3 << 56 as well?  Seems
like even that other patch doesn't fully define these "pfn" values.

> > No need for !! when returning a bool.  Also the helper seems a little
> > pointless, just opencoding it would make the code more readable in my
> > opinion.
> 
> I expect similar routines for other usages of RMAP to come up.

Please drop them all.  Having to wade through a header to check for
a specific bit that also is set manually elsewhere in related code
just obfuscates it for the reader.

> > > +	*mig->dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
> > > +	return 0;
> > > +}
> > 
> > I think you can just merge this trivial helper into the only caller.
> 
> Yes I can, but felt it is nicely abstracted out to a function right now.

Not really.  It just fits the old calling conventions before I removed
the indirection.

> > Here we actually have two callers, but they have a fair amount of
> > duplicate code in them.  I think you want to move that common
> > code (including setting up the migrate_vma structure) into this
> > function and maybe also give it a more descriptive name.
> 
> Sure, I will give this a try. The name is already very descriptive, will
> come up with an appropriate name.

I don't think alloc_and_copy is very helpful.  It matches some of the
implementation, but not the intent.  Why not kvmppc_svm_page_in/out
similar to the hypervisor calls calling them?  Yes, for one case it
also gets called from the pagefault handler, but it still performs
these basic page in/out actions.

> BTW this file and the fuction prefixes in this file started out with
> kvmppc_hmm, switched to kvmppc_devm when HMM routines weren't used anymore.
> Now with the use of only non-dev versions, planning to swtich to
> kvmppc_uvmem_

That prefix sounds fine to me as well.
Bharata B Rao Sept. 6, 2019, 11:36 a.m. UTC | #8
On Mon, Sep 02, 2019 at 09:53:56AM +0200, Christoph Hellwig wrote:
> On Fri, Aug 30, 2019 at 09:12:59AM +0530, Bharata B Rao wrote:
> > On Thu, Aug 29, 2019 at 10:38:10AM +0200, Christoph Hellwig wrote:
> > > On Thu, Aug 22, 2019 at 03:56:14PM +0530, Bharata B Rao wrote:
> > > > +/*
> > > > + * Bits 60:56 in the rmap entry will be used to identify the
> > > > + * different uses/functions of rmap.
> > > > + */
> > > > +#define KVMPPC_RMAP_DEVM_PFN	(0x2ULL << 56)
> > > 
> > > How did you come up with this specific value?
> > 
> > Different usage types of RMAP array are being defined.
> > https://patchwork.ozlabs.org/patch/1149791/
> > 
> > The above value is reserved for device pfn usage.
> 
> Shouldn't all these defintions go in together in a patch?

Ideally yes, but the above patch is already in Paul's tree, I will sync
up with him about this.

> Also is bit 56+ a set of values, so is there 1 << 56 and 3 << 56 as well?  Seems
> like even that other patch doesn't fully define these "pfn" values.

I realized that the bit numbers have changed, it is no longer bits 60:56,
but instead top 8bits. 

#define KVMPPC_RMAP_UVMEM_PFN   0x0200000000000000
static inline bool kvmppc_rmap_is_uvmem_pfn(unsigned long *rmap)
{
        return ((*rmap & 0xff00000000000000) == KVMPPC_RMAP_UVMEM_PFN);
}

> 
> > > No need for !! when returning a bool.  Also the helper seems a little
> > > pointless, just opencoding it would make the code more readable in my
> > > opinion.
> > 
> > I expect similar routines for other usages of RMAP to come up.
> 
> Please drop them all.  Having to wade through a header to check for
> a specific bit that also is set manually elsewhere in related code
> just obsfucates it for the reader.

I am currently using the routine kvmppc_rmap_is_uvmem_pfn() (shown
above) instead open coding it at multiple places, but I can drop it if
you prefer.

Regards,
Bharata.
Christoph Hellwig Sept. 6, 2019, 4:32 p.m. UTC | #9
On Fri, Sep 06, 2019 at 05:06:39PM +0530, Bharata B Rao wrote:
> > Also is bit 56+ a set of values, so is there 1 << 56 and 3 << 56 as well?  Seems
> > like even that other patch doesn't fully define these "pfn" values.
> 
> I realized that the bit numbers have changed, it is no longer bits 60:56,
> but instead top 8bits. 
> 
> #define KVMPPC_RMAP_UVMEM_PFN   0x0200000000000000
> static inline bool kvmppc_rmap_is_uvmem_pfn(unsigned long *rmap)
> {
>         return ((*rmap & 0xff00000000000000) == KVMPPC_RMAP_UVMEM_PFN);
> }

In that overall scheme I'd actually much prefer something like (names
just made up, they should vaguely match the spec this is written to):

static inline unsigned long kvmppc_rmap_type(unsigned long *rmap)
{
	return (rmap & 0xff00000000000000);
}

And then where you check it you can use:

	if (kvmppc_rmap_type(*rmap) == KVMPPC_RMAP_UVMEM_PFN)

and where you set it you do:

	*rmap |= KVMPPC_RMAP_UVMEM_PFN;

as in the current patch to keep things symmetric.
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 463c63a9fcf1..2f6b952deb0f 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -337,6 +337,10 @@ 
 #define H_TLB_INVALIDATE	0xF808
 #define H_COPY_TOFROM_GUEST	0xF80C
 
+/* Platform-specific hcalls used by the Ultravisor */
+#define H_SVM_PAGE_IN		0xEF00
+#define H_SVM_PAGE_OUT		0xEF04
+
 /* Values for 2nd argument to H_SET_MODE */
 #define H_SET_MODE_RESOURCE_SET_CIABR		1
 #define H_SET_MODE_RESOURCE_SET_DAWR		2
diff --git a/arch/powerpc/include/asm/kvm_book3s_devm.h b/arch/powerpc/include/asm/kvm_book3s_devm.h
new file mode 100644
index 000000000000..9603c2b48d67
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_book3s_devm.h
@@ -0,0 +1,29 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __POWERPC_KVM_PPC_HMM_H__
+#define __POWERPC_KVM_PPC_HMM_H__
+
+#ifdef CONFIG_PPC_UV
+unsigned long kvmppc_h_svm_page_in(struct kvm *kvm,
+				   unsigned long gra,
+				   unsigned long flags,
+				   unsigned long page_shift);
+unsigned long kvmppc_h_svm_page_out(struct kvm *kvm,
+				    unsigned long gra,
+				    unsigned long flags,
+				    unsigned long page_shift);
+#else
+static inline unsigned long
+kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gra,
+		     unsigned long flags, unsigned long page_shift)
+{
+	return H_UNSUPPORTED;
+}
+
+static inline unsigned long
+kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gra,
+		      unsigned long flags, unsigned long page_shift)
+{
+	return H_UNSUPPORTED;
+}
+#endif /* CONFIG_PPC_UV */
+#endif /* __POWERPC_KVM_PPC_HMM_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 4bb552d639b8..855d82730f44 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -242,6 +242,17 @@  struct revmap_entry {
 #define KVMPPC_RMAP_PRESENT	0x100000000ul
 #define KVMPPC_RMAP_INDEX	0xfffffffful
 
+/*
+ * Bits 60:56 in the rmap entry will be used to identify the
+ * different uses/functions of rmap.
+ */
+#define KVMPPC_RMAP_DEVM_PFN	(0x2ULL << 56)
+
+static inline bool kvmppc_rmap_is_devm_pfn(unsigned long pfn)
+{
+	return !!(pfn & KVMPPC_RMAP_DEVM_PFN);
+}
+
 struct kvm_arch_memory_slot {
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	unsigned long *rmap;
@@ -849,4 +860,16 @@  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
 
+#ifdef CONFIG_PPC_UV
+extern int kvmppc_devm_init(void);
+extern void kvmppc_devm_free(void);
+#else
+static inline int kvmppc_devm_init(void)
+{
+	return 0;
+}
+
+static inline void kvmppc_devm_free(void) {}
+#endif /* CONFIG_PPC_UV */
+
 #endif /* __POWERPC_KVM_HOST_H__ */
diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h
index 6a0f9c74f959..1cd1f595fd81 100644
--- a/arch/powerpc/include/asm/ultravisor-api.h
+++ b/arch/powerpc/include/asm/ultravisor-api.h
@@ -25,5 +25,7 @@ 
 /* opcodes */
 #define UV_WRITE_PATE			0xF104
 #define UV_RETURN			0xF11C
+#define UV_PAGE_IN			0xF128
+#define UV_PAGE_OUT			0xF12C
 
 #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */
diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h
index d7aa97aa7834..0fc4a974b2e8 100644
--- a/arch/powerpc/include/asm/ultravisor.h
+++ b/arch/powerpc/include/asm/ultravisor.h
@@ -31,4 +31,18 @@  static inline int uv_register_pate(u64 lpid, u64 dw0, u64 dw1)
 	return ucall_norets(UV_WRITE_PATE, lpid, dw0, dw1);
 }
 
+static inline int uv_page_in(u64 lpid, u64 src_ra, u64 dst_gpa, u64 flags,
+			     u64 page_shift)
+{
+	return ucall_norets(UV_PAGE_IN, lpid, src_ra, dst_gpa, flags,
+			    page_shift);
+}
+
+static inline int uv_page_out(u64 lpid, u64 dst_ra, u64 src_gpa, u64 flags,
+			      u64 page_shift)
+{
+	return ucall_norets(UV_PAGE_OUT, lpid, dst_ra, src_gpa, flags,
+			    page_shift);
+}
+
 #endif	/* _ASM_POWERPC_ULTRAVISOR_H */
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 4c67cc79de7c..16b40590e67c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -71,6 +71,9 @@  kvm-hv-y += \
 	book3s_64_mmu_radix.o \
 	book3s_hv_nested.o
 
+kvm-hv-$(CONFIG_PPC_UV) += \
+	book3s_hv_devm.o
+
 kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
 	book3s_hv_tm.o
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index ec1804f822af..00b43ee8b693 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -72,6 +72,8 @@ 
 #include <asm/xics.h>
 #include <asm/xive.h>
 #include <asm/hw_breakpoint.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_book3s_devm.h>
 
 #include "book3s.h"
 
@@ -1075,6 +1077,18 @@  int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 					 kvmppc_get_gpr(vcpu, 5),
 					 kvmppc_get_gpr(vcpu, 6));
 		break;
+	case H_SVM_PAGE_IN:
+		ret = kvmppc_h_svm_page_in(vcpu->kvm,
+					   kvmppc_get_gpr(vcpu, 4),
+					   kvmppc_get_gpr(vcpu, 5),
+					   kvmppc_get_gpr(vcpu, 6));
+		break;
+	case H_SVM_PAGE_OUT:
+		ret = kvmppc_h_svm_page_out(vcpu->kvm,
+					    kvmppc_get_gpr(vcpu, 4),
+					    kvmppc_get_gpr(vcpu, 5),
+					    kvmppc_get_gpr(vcpu, 6));
+		break;
 	default:
 		return RESUME_HOST;
 	}
@@ -5510,11 +5524,16 @@  static int kvmppc_book3s_init_hv(void)
 			no_mixing_hpt_and_radix = true;
 	}
 
+	r = kvmppc_devm_init();
+	if (r < 0)
+		pr_err("KVM-HV: kvmppc_devm_init failed %d\n", r);
+
 	return r;
 }
 
 static void kvmppc_book3s_exit_hv(void)
 {
+	kvmppc_devm_free();
 	kvmppc_free_host_rm_ops();
 	if (kvmppc_radix_possible())
 		kvmppc_radix_exit();
diff --git a/arch/powerpc/kvm/book3s_hv_devm.c b/arch/powerpc/kvm/book3s_hv_devm.c
new file mode 100644
index 000000000000..13722f27fa7d
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_devm.c
@@ -0,0 +1,438 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver to manage page migration between normal and secure
+ * memory.
+ *
+ * Copyright 2018 Bharata B Rao, IBM Corp. <bharata@linux.ibm.com>
+ */
+
+/*
+ * A pseries guest can be run as a secure guest on Ultravisor-enabled
+ * POWER platforms. On such platforms, this driver will be used to manage
+ * the movement of guest pages between the normal memory managed by
+ * hypervisor (HV) and secure memory managed by Ultravisor (UV).
+ *
+ * The page-in or page-out requests from UV will come to HV as hcalls and
+ * HV will call back into UV via ultracalls to satisfy these page requests.
+ *
+ * Private ZONE_DEVICE memory equal to the amount of secure memory
+ * available in the platform for running secure guests is hotplugged.
+ * Whenever a page belonging to the guest becomes secure, a page from this
+ * private device memory is used to represent and track that secure page
+ * on the HV side.
+ *
+ * For each page that gets moved into secure memory, a device PFN is used
+ * on the HV side and migration PTE corresponding to that PFN would be
+ * populated in the QEMU page tables. Device PFNs are stored in the rmap
+ * array. Whenever a guest page becomes secure, device PFN allocated for
+ * the same will be populated in the corresponding slot in the rmap
+ * array. The overloading of rmap array's usage which otherwise is
+ * used primarily by HPT guests means that this feature (secure
+ * guest on PEF platforms) is available only for Radix MMU guests.
+ * Also since the same rmap array is used differently by nested
+ * guests, a secure guest can't have further nested guests.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/migrate.h>
+#include <linux/kvm_host.h>
+#include <asm/ultravisor.h>
+
+static struct dev_pagemap kvmppc_devm_pgmap;	/* filled in by kvmppc_devm_init() */
+static unsigned long *kvmppc_devm_pfn_bitmap;	/* one bit per device PFN, set while the PFN is handed out */
+static DEFINE_SPINLOCK(kvmppc_devm_pfn_lock);	/* held around updates of the PFN bitmap */
+
+struct kvmppc_devm_page_pvt {
+	unsigned long *rmap;	/* rmap slot of the guest page; zeroed when the device page is freed */
+	unsigned int lpid;	/* lpid handed to uv_page_out() for this page */
+	unsigned long gpa;	/* guest physical address handed to uv_page_out() */
+};
+
+/*
+ * Get a free device PFN from the pool
+ *
+ * Called when a normal page is moved to secure memory (UV_PAGE_IN). Device
+ * PFN will be used to keep track of the secure page on HV side.
+ *
+ * @rmap: slot in the rmap array that corresponds to @gpa. A devm-PFN
+ *        entry there indicates that the guest page has already become
+ *        secure and is not mapped on the HV side; no new device page is
+ *        handed out in that case.
+ * @gpa:  guest physical address, recorded in the page's private data.
+ * @lpid: lpid of the guest, recorded in the page's private data.
+ *
+ * Returns the device page, locked and with an extra reference taken, or
+ * NULL if @rmap already holds a device PFN, the pool is exhausted, the
+ * page could not be locked or the private data could not be allocated.
+ *
+ * The rmap slot is written only once nothing can fail any more, so that
+ * an error never leaves a stale device PFN behind in the rmap array.
+ *
+ * NOTE: In this and subsequent functions, we pass around and access
+ * individual elements of kvm_memory_slot->arch.rmap[] without any
+ * protection. Should we use lock_rmap() here?
+ */
+static struct page *kvmppc_devm_get_page(unsigned long *rmap, unsigned long gpa,
+					 unsigned int lpid)
+{
+	struct page *dpage = NULL;
+	unsigned long bit, devm_pfn;
+	unsigned long flags;
+	struct kvmppc_devm_page_pvt *pvt;
+	unsigned long pfn_last, pfn_first;
+
+	if (kvmppc_rmap_is_devm_pfn(*rmap))
+		return NULL;
+
+	pfn_first = kvmppc_devm_pgmap.res.start >> PAGE_SHIFT;
+	pfn_last = pfn_first +
+		   (resource_size(&kvmppc_devm_pgmap.res) >> PAGE_SHIFT);
+	spin_lock_irqsave(&kvmppc_devm_pfn_lock, flags);
+	bit = find_first_zero_bit(kvmppc_devm_pfn_bitmap, pfn_last - pfn_first);
+	if (bit >= (pfn_last - pfn_first))
+		goto out;
+
+	bitmap_set(kvmppc_devm_pfn_bitmap, bit, 1);
+	devm_pfn = bit + pfn_first;
+	dpage = pfn_to_page(devm_pfn);
+
+	if (!trylock_page(dpage))
+		goto out_clear;
+
+	/* GFP_ATOMIC: we are under a spinlock with interrupts disabled */
+	pvt = kzalloc(sizeof(*pvt), GFP_ATOMIC);
+	if (!pvt)
+		goto out_unlock;
+
+	/* Point of no return: only now mark the guest page as secure */
+	*rmap = devm_pfn | KVMPPC_RMAP_DEVM_PFN;
+	pvt->rmap = rmap;
+	pvt->gpa = gpa;
+	pvt->lpid = lpid;
+	dpage->zone_device_data = pvt;
+	spin_unlock_irqrestore(&kvmppc_devm_pfn_lock, flags);
+
+	get_page(dpage);
+	return dpage;
+
+out_unlock:
+	unlock_page(dpage);
+out_clear:
+	bitmap_clear(kvmppc_devm_pfn_bitmap, bit, 1);
+out:
+	spin_unlock_irqrestore(&kvmppc_devm_pfn_lock, flags);
+	return NULL;
+}
+
+/*
+ * Alloc a PFN from private device memory pool and copy page from normal
+ * memory to secure memory.
+ *
+ * @mig:        single-page migration set up by the caller; *mig->dst is
+ *              filled in here
+ * @rmap:       rmap slot that corresponds to @gpa
+ * @gpa:        guest physical address of the page
+ * @lpid:       lpid of the guest
+ * @page_shift: page shift handed on to UV_PAGE_IN
+ *
+ * Returns 0 with *mig->dst == 0 when the source entry carries no page or
+ * is not marked for migration, 0 with *mig->dst pointing at the locked
+ * device page on success, and -EINVAL when no device page is available
+ * or UV refuses the page-in.
+ */
+static int
+kvmppc_devm_migrate_alloc_and_copy(struct migrate_vma *mig,
+				   unsigned long *rmap, unsigned long gpa,
+				   unsigned int lpid, unsigned long page_shift)
+{
+	struct page *spage = migrate_pfn_to_page(*mig->src);
+	unsigned long pfn = *mig->src >> MIGRATE_PFN_SHIFT;
+	struct page *dpage;
+
+	*mig->dst = 0;
+	if (!spage || !(*mig->src & MIGRATE_PFN_MIGRATE))
+		return 0;
+
+	dpage = kvmppc_devm_get_page(rmap, gpa, lpid);
+	if (!dpage)
+		return -EINVAL;
+
+	if (uv_page_in(lpid, pfn << page_shift, gpa, 0, page_shift)) {
+		/*
+		 * UV did not take the page, so it must not be tracked as
+		 * secure. Undo kvmppc_devm_get_page(): dropping the extra
+		 * reference hands the device page to the page_free
+		 * callback.
+		 */
+		unlock_page(dpage);
+		put_page(dpage);
+		return -EINVAL;
+	}
+
+	*mig->dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
+	return 0;
+}
+
+/*
+ * H_SVM_PAGE_IN: move the page at @gpa from normal memory to secure memory.
+ *
+ * @kvm:        the guest the hcall was issued for
+ * @gpa:        guest physical address of the page to move
+ * @flags:      must be zero
+ * @page_shift: must equal PAGE_SHIFT
+ *
+ * Returns H_P3 for an unsupported @page_shift, H_P2 for non-zero @flags,
+ * H_PARAMETER if @gpa is not backed by a memslot or by a VMA covering the
+ * whole page, if the migration cannot be set up or if the alloc-and-copy
+ * step fails, and H_SUCCESS otherwise.
+ */
+unsigned long
+kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
+		     unsigned long flags, unsigned long page_shift)
+{
+	unsigned long addr, end;
+	unsigned long src_pfn, dst_pfn;
+	struct migrate_vma mig;
+	struct vm_area_struct *vma;
+	int srcu_idx;
+	unsigned long gfn = gpa >> page_shift;
+	struct kvm_memory_slot *slot;
+	unsigned long *rmap;
+	int ret;
+
+	if (page_shift != PAGE_SHIFT)
+		return H_P3;
+
+	if (flags)
+		return H_P2;
+
+	ret = H_PARAMETER;
+	down_read(&kvm->mm->mmap_sem);
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+
+	/* @gpa comes from the guest: it may not belong to any memslot */
+	slot = gfn_to_memslot(kvm, gfn);
+	if (!slot)
+		goto out;
+
+	rmap = &slot->arch.rmap[gfn - slot->base_gfn];
+	addr = gfn_to_hva(kvm, gfn);
+	if (kvm_is_error_hva(addr))
+		goto out;
+
+	/* The VMA has to cover the whole page */
+	end = addr + (1UL << page_shift);
+	vma = find_vma_intersection(kvm->mm, addr, end);
+	if (!vma || vma->vm_start > addr || vma->vm_end < end)
+		goto out;
+
+	memset(&mig, 0, sizeof(mig));
+	mig.vma = vma;
+	mig.start = addr;
+	mig.end = end;
+	mig.src = &src_pfn;
+	mig.dst = &dst_pfn;
+
+	if (migrate_vma_setup(&mig))
+		goto out;
+
+	if (kvmppc_devm_migrate_alloc_and_copy(&mig, rmap, gpa,
+					       kvm->arch.lpid, page_shift))
+		goto out_finalize;
+
+	migrate_vma_pages(&mig);
+	ret = H_SUCCESS;
+out_finalize:
+	migrate_vma_finalize(&mig);
+out:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	up_read(&kvm->mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * Back a secure page with a freshly provisioned page on the HV side:
+ * allocate and lock a normal page, then issue UV_PAGE_OUT so that the
+ * contents are copied over from secure memory.
+ *
+ * Nothing is done, and 0 is returned, when the source entry carries no
+ * page, is not marked for migration or is not a ZONE_DEVICE page.
+ * Returns -EINVAL if no page could be allocated, otherwise the status of
+ * uv_page_out(). On a status other than U_SUCCESS the new page is
+ * released again and *mig->dst is left untouched.
+ */
+static int
+kvmppc_devm_fault_migrate_alloc_and_copy(struct migrate_vma *mig,
+					 unsigned long page_shift)
+{
+	struct page *secure_page = migrate_pfn_to_page(*mig->src);
+	struct kvmppc_devm_page_pvt *priv;
+	struct page *new_page;
+	unsigned long new_pfn;
+	int rc;
+
+	if (!secure_page)
+		return 0;
+	if (!(*mig->src & MIGRATE_PFN_MIGRATE))
+		return 0;
+	if (!is_zone_device_page(secure_page))
+		return 0;
+
+	new_page = alloc_page_vma(GFP_HIGHUSER, mig->vma, mig->start);
+	if (!new_page)
+		return -EINVAL;
+	lock_page(new_page);
+
+	/* lpid and gpa were stashed when the page became secure */
+	priv = secure_page->zone_device_data;
+	new_pfn = page_to_pfn(new_page);
+	rc = uv_page_out(priv->lpid, new_pfn << page_shift, priv->gpa, 0,
+			 page_shift);
+	if (rc != U_SUCCESS) {
+		unlock_page(new_page);
+		__free_page(new_page);
+		return rc;
+	}
+
+	*mig->dst = migrate_pfn(new_pfn) | MIGRATE_PFN_LOCKED;
+	return rc;
+}
+
+/*
+ * migrate_to_ram callback: runs when HV faults on a page that has been
+ * moved to secure memory. UV is asked to give the page back by means of
+ * a UV_PAGE_OUT ucall.
+ *
+ * This eventually results in dropping of device PFN and the newly
+ * provisioned page/PFN gets populated in QEMU page tables.
+ *
+ * Returns 0 on success and VM_FAULT_SIGBUS when the migration cannot be
+ * set up or the page-out step fails.
+ */
+static vm_fault_t kvmppc_devm_migrate_to_ram(struct vm_fault *vmf)
+{
+	unsigned long src_pfn, dst_pfn = 0;
+	struct migrate_vma mig;
+	int fault = 0;
+
+	/* Describe a migration of exactly the faulting page */
+	memset(&mig, 0, sizeof(mig));
+	mig.src = &src_pfn;
+	mig.dst = &dst_pfn;
+	mig.vma = vmf->vma;
+	mig.start = vmf->address;
+	mig.end = vmf->address + PAGE_SIZE;
+
+	if (migrate_vma_setup(&mig))
+		return VM_FAULT_SIGBUS;
+
+	if (kvmppc_devm_fault_migrate_alloc_and_copy(&mig, PAGE_SHIFT))
+		fault = VM_FAULT_SIGBUS;
+	else
+		migrate_vma_pages(&mig);
+
+	/* Finalize in either case once setup has succeeded */
+	migrate_vma_finalize(&mig);
+	return fault;
+}
+
+/*
+ * page_free callback: hand the device PFN back to the pool.
+ *
+ * Gets called when secure page becomes a normal page during UV_PAGE_OUT.
+ * Detaches the private data from the page, clears the PFN's bit in the
+ * pool bitmap, zeroes the rmap slot and frees the private data.
+ */
+static void kvmppc_devm_page_free(struct page *page)
+{
+	unsigned long first_pfn = kvmppc_devm_pgmap.res.start >> PAGE_SHIFT;
+	unsigned long idx = page_to_pfn(page) - first_pfn;
+	struct kvmppc_devm_page_pvt *priv;
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&kvmppc_devm_pfn_lock, irqflags);
+	priv = page->zone_device_data;
+	page->zone_device_data = NULL;
+	bitmap_clear(kvmppc_devm_pfn_bitmap, idx, 1);
+	*priv->rmap = 0;
+	spin_unlock_irqrestore(&kvmppc_devm_pfn_lock, irqflags);
+
+	kfree(priv);
+}
+
+static const struct dev_pagemap_ops kvmppc_devm_ops = {
+	.page_free = kvmppc_devm_page_free,	/* returns the device PFN to the pool */
+	.migrate_to_ram	= kvmppc_devm_migrate_to_ram,	/* fault path: asks UV to page the data out */
+};
+
+/*
+ * H_SVM_PAGE_OUT: move the page at @gpa from secure memory to normal
+ * memory.
+ *
+ * @kvm:        the guest the hcall was issued for
+ * @gpa:        guest physical address of the page to move
+ * @flags:      must be zero
+ * @page_shift: must equal PAGE_SHIFT
+ *
+ * Returns H_P3 for an unsupported @page_shift, H_P2 for non-zero @flags,
+ * H_PARAMETER if @gpa has no valid host address or no VMA covering the
+ * whole page, if the migration cannot be set up or if the page-out step
+ * fails, and H_SUCCESS otherwise. Only H_* codes are returned; the
+ * helper's own status never reaches the guest.
+ */
+unsigned long
+kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa,
+		      unsigned long flags, unsigned long page_shift)
+{
+	struct migrate_vma mig;
+	unsigned long addr, end;
+	struct vm_area_struct *vma;
+	unsigned long src_pfn, dst_pfn = 0;
+	int srcu_idx;
+	int ret;
+
+	if (page_shift != PAGE_SHIFT)
+		return H_P3;
+
+	if (flags)
+		return H_P2;
+
+	ret = H_PARAMETER;
+	down_read(&kvm->mm->mmap_sem);
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	addr = gfn_to_hva(kvm, gpa >> page_shift);
+	if (kvm_is_error_hva(addr))
+		goto out;
+
+	/* The VMA has to cover the whole page */
+	end = addr + (1UL << page_shift);
+	vma = find_vma_intersection(kvm->mm, addr, end);
+	if (!vma || vma->vm_start > addr || vma->vm_end < end)
+		goto out;
+
+	memset(&mig, 0, sizeof(mig));
+	mig.vma = vma;
+	mig.start = addr;
+	mig.end = end;
+	mig.src = &src_pfn;
+	mig.dst = &dst_pfn;
+	if (migrate_vma_setup(&mig))
+		goto out;
+
+	/* Keep H_PARAMETER on failure, as kvmppc_h_svm_page_in() does */
+	if (kvmppc_devm_fault_migrate_alloc_and_copy(&mig, page_shift))
+		goto out_finalize;
+
+	migrate_vma_pages(&mig);
+	ret = H_SUCCESS;
+out_finalize:
+	migrate_vma_finalize(&mig);
+out:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	up_read(&kvm->mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * Add up the sizes listed in the "secure-memory-ranges" property of the
+ * "ibm,uv-firmware" device tree node. The property is walked in groups
+ * of four 32-bit cells, of which the last two are read as one size.
+ *
+ * Returns 0 when the node or the property is missing.
+ */
+static u64 kvmppc_get_secmem_size(void)
+{
+	struct device_node *uv_node;
+	const __be32 *cells;
+	u64 total = 0;
+	int nbytes, idx;
+
+	uv_node = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware");
+	if (!uv_node)
+		return 0;
+
+	cells = of_get_property(uv_node, "secure-memory-ranges", &nbytes);
+	if (cells) {
+		for (idx = 0; idx < nbytes / (sizeof(*cells) * 4); idx++)
+			total += of_read_number(cells + (idx * 4) + 2, 2);
+	}
+
+	of_node_put(uv_node);
+	return total;
+}
+
+/*
+ * Set up the private device memory that stands in for secure memory on
+ * the HV side: reserve a free memory region as large as the secure
+ * memory reported by kvmppc_get_secmem_size(), remap it as
+ * MEMORY_DEVICE_PRIVATE pages and allocate the bitmap that tracks which
+ * device PFNs are in use.
+ *
+ * A platform that reports no secure memory is not an error: 0 is
+ * returned without setting anything up, so that the caller's
+ * initialisation does not fail on such a platform.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int kvmppc_devm_init(void)
+{
+	int ret = 0;
+	unsigned long size;
+	struct resource *res;
+	void *addr;
+	unsigned long pfn_last, pfn_first;
+
+	size = kvmppc_get_secmem_size();
+	if (!size) {
+		pr_info("KVMPPC-DEVM: No support for secure guests\n");
+		goto out;
+	}
+
+	res = request_free_mem_region(&iomem_resource, size, "kvmppc_devm");
+	if (IS_ERR(res)) {
+		ret = PTR_ERR(res);
+		goto out;
+	}
+
+	kvmppc_devm_pgmap.type = MEMORY_DEVICE_PRIVATE;
+	kvmppc_devm_pgmap.res = *res;
+	kvmppc_devm_pgmap.ops = &kvmppc_devm_ops;
+	addr = memremap_pages(&kvmppc_devm_pgmap, -1);
+	if (IS_ERR(addr)) {
+		ret = PTR_ERR(addr);
+		goto out_free_region;
+	}
+
+	pfn_first = res->start >> PAGE_SHIFT;
+	pfn_last = pfn_first + (resource_size(res) >> PAGE_SHIFT);
+	kvmppc_devm_pfn_bitmap = kcalloc(BITS_TO_LONGS(pfn_last - pfn_first),
+					 sizeof(unsigned long), GFP_KERNEL);
+	if (!kvmppc_devm_pfn_bitmap) {
+		ret = -ENOMEM;
+		goto out_unmap;
+	}
+
+	pr_info("KVMPPC-DEVM: Secure Memory size 0x%lx\n", size);
+	return ret;
+out_unmap:
+	memunmap_pages(&kvmppc_devm_pgmap);
+out_free_region:
+	release_mem_region(res->start, size);
+out:
+	return ret;
+}
+
+/*
+ * Undo kvmppc_devm_init(). The bitmap is the last thing init allocates,
+ * so a NULL bitmap means that nothing was set up (no secure memory on
+ * this platform) and there is nothing to tear down.
+ */
+void kvmppc_devm_free(void)
+{
+	if (!kvmppc_devm_pfn_bitmap)
+		return;
+
+	memunmap_pages(&kvmppc_devm_pgmap);
+	release_mem_region(kvmppc_devm_pgmap.res.start,
+			   resource_size(&kvmppc_devm_pgmap.res));
+	kfree(kvmppc_devm_pfn_bitmap);
+}