[3/5] KVM: PPC: Book3S HV: Handle memory slot deletion and modification correctly

Message ID 20120806100603.GD8980@bloggs.ozlabs.ibm.com

Commit Message

Paul Mackerras Aug. 6, 2012, 10:06 a.m. UTC
From 44067a8ee15021583636bea4cc1d47e5370b8397 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 30 Jul 2012 16:40:54 +1000
Subject: 

At present the HV style of KVM doesn't handle deletion or modification
of memory slots correctly.  Deletion occurs when userspace does the
KVM_SET_USER_MEMORY_REGION with arguments specifying a new length of
zero for a slot that contains memory.  Modification occurs when the
arguments specify a new guest_phys_addr or flags.

To allow the HV code to know which operation (creation, deletion or
modification) is being requested, it needs to see the old and new
contents of the memslot.  kvm_arch_prepare_memory_region has this
information, so we modify it to pass it down to
kvmppc_core_prepare_memory_region.  We then modify the HV version
of that to check which operation is being performed.  If it is a
deletion, we call a new function kvmppc_unmap_memslot to remove any
HPT (hashed page table) entries referencing the memory being removed.
Similarly, if we are modifying the guest physical address we also
remove any HPT entries.  If the KVM_MEM_LOG_DIRTY_PAGES flag is now
set and wasn't before, we call kvmppc_hv_get_dirty_log to clear any
dirty bits so we can detect all modifications from now on.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/kvm_ppc.h  |    4 +++
 arch/powerpc/kvm/book3s_64_mmu_hv.c |   27 ++++++++++++++--
 arch/powerpc/kvm/book3s_hv.c        |   61 +++++++++++++++++++++++------------
 arch/powerpc/kvm/book3s_hv_rm_mmu.c |    2 +-
 arch/powerpc/kvm/book3s_pr.c        |    2 ++
 arch/powerpc/kvm/booke.c            |    2 ++
 arch/powerpc/kvm/powerpc.c          |    2 +-
 7 files changed, 76 insertions(+), 24 deletions(-)
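
For orientation, the generic entry point presumably just threads the two
extra arguments through to the PPC backend.  A sketch (based on the
diffstat and the new prototype below, not the actual powerpc.c hunk; the
generic signature of this era, with the old slot passed by value and a
user_alloc flag, is an assumption):

	int kvm_arch_prepare_memory_region(struct kvm *kvm,
					   struct kvm_memory_slot *memslot,
					   struct kvm_memory_slot old,
					   struct kvm_userspace_memory_region *mem,
					   int user_alloc)
	{
		/* hand both the new and old slot contents down so the
		 * backend can tell creation/deletion/modification apart */
		return kvmppc_core_prepare_memory_region(kvm, memslot, &old, mem);
	}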

Comments

Marcelo Tosatti Aug. 9, 2012, 6:16 p.m. UTC | #1
On Mon, Aug 06, 2012 at 08:06:03PM +1000, Paul Mackerras wrote:
> From 44067a8ee15021583636bea4cc1d47e5370b8397 Mon Sep 17 00:00:00 2001
> From: Paul Mackerras <paulus@samba.org>
> Date: Mon, 30 Jul 2012 16:40:54 +1000
> Subject: 
> 
> At present the HV style of KVM doesn't handle deletion or modification
> of memory slots correctly.  Deletion occurs when userspace does the
> KVM_SET_USER_MEMORY_REGION with arguments specifying a new length of
> zero for a slot that contains memory.  Modification occurs when the
> arguments specify a new guest_phys_addr or flags.
> 
> To allow the HV code to know which operation (creation, deletion or
> modification) is being requested, it needs to see the old and new
> contents of the memslot.  kvm_arch_prepare_memory_region has this
> information, so we modify it to pass it down to
> kvmppc_core_prepare_memory_region.  We then modify the HV version
> of that to check which operation is being performed.  If it is a
> deletion, we call a new function kvmppc_unmap_memslot to remove any
> HPT (hashed page table) entries referencing the memory being removed.
> Similarly, if we are modifying the guest physical address we also
> remove any HPT entries.  If the KVM_MEM_LOG_DIRTY_PAGES flag is now
> set and wasn't before, we call kvmppc_hv_get_dirty_log to clear any
> dirty bits so we can detect all modifications from now on.
> 
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> ---
>  arch/powerpc/include/asm/kvm_ppc.h  |    4 +++
>  arch/powerpc/kvm/book3s_64_mmu_hv.c |   27 ++++++++++++++--
>  arch/powerpc/kvm/book3s_hv.c        |   61 +++++++++++++++++++++++------------
>  arch/powerpc/kvm/book3s_hv_rm_mmu.c |    2 +-
>  arch/powerpc/kvm/book3s_pr.c        |    2 ++
>  arch/powerpc/kvm/booke.c            |    2 ++
>  arch/powerpc/kvm/powerpc.c          |    2 +-
>  7 files changed, 76 insertions(+), 24 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index 0124937..044c921 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -140,11 +140,15 @@ extern void kvm_release_hpt(struct kvmppc_linear_info *li);
>  extern int kvmppc_core_init_vm(struct kvm *kvm);
>  extern void kvmppc_core_destroy_vm(struct kvm *kvm);
>  extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> +                                struct kvm_memory_slot *memslot,
> +                                struct kvm_memory_slot *old,
>  				struct kvm_userspace_memory_region *mem);
>  extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
>  				struct kvm_userspace_memory_region *mem);
>  extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
>  				      struct kvm_ppc_smmu_info *info);
> +extern void kvmppc_unmap_memslot(struct kvm *kvm,
> +				 struct kvm_memory_slot *memslot);
>  
>  extern int kvmppc_bookehv_init(void);
>  extern void kvmppc_bookehv_exit(void);
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 3c635c0..87735a7 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -850,7 +850,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
>  		psize = hpte_page_size(hptep[0], ptel);
>  		if ((hptep[0] & HPTE_V_VALID) &&
>  		    hpte_rpn(ptel, psize) == gfn) {
> -			hptep[0] |= HPTE_V_ABSENT;
> +			if (kvm->arch.using_mmu_notifiers)
> +				hptep[0] |= HPTE_V_ABSENT;
>  			kvmppc_invalidate_hpte(kvm, hptep, i);
>  			/* Harvest R and C */
>  			rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
> @@ -877,6 +878,28 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
>  	return 0;
>  }
>  
> +void kvmppc_unmap_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
> +{
> +	unsigned long *rmapp;
> +	unsigned long gfn;
> +	unsigned long n;
> +
> +	rmapp = memslot->rmap;
> +	gfn = memslot->base_gfn;
> +	for (n = memslot->npages; n; --n) {
> +		/*
> +		 * Testing the present bit without locking is OK because
> +		 * the memslot has been marked invalid already, and hence
> +		 * no new HPTEs referencing this page can be created,
> +		 * thus the present bit can't go from 0 to 1.
> +		 */
> +		if (*rmapp & KVMPPC_RMAP_PRESENT)
> +			kvm_unmap_rmapp(kvm, rmapp, gfn);
> +		++rmapp;
> +		++gfn;
> +	}
> +}
> +
>  static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
>  			 unsigned long gfn)
>  {
> @@ -1039,7 +1062,7 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
>  	rmapp = memslot->rmap;
>  	map = memslot->dirty_bitmap;
>  	for (i = 0; i < memslot->npages; ++i) {
> -		if (kvm_test_clear_dirty(kvm, rmapp))
> +		if (kvm_test_clear_dirty(kvm, rmapp) && map)
>  			__set_bit_le(i, map);
>  		++rmapp;
>  	}
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 83e929e..aad20ca0 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1299,26 +1299,6 @@ static unsigned long slb_pgsize_encoding(unsigned long psize)
>  	return senc;
>  }
>  
> -int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> -				struct kvm_userspace_memory_region *mem)
> -{
> -	unsigned long npages;
> -	unsigned long *phys;
> -
> -	/* Allocate a slot_phys array */
> -	phys = kvm->arch.slot_phys[mem->slot];
> -	if (!kvm->arch.using_mmu_notifiers && !phys) {
> -		npages = mem->memory_size >> PAGE_SHIFT;
> -		phys = vzalloc(npages * sizeof(unsigned long));
> -		if (!phys)
> -			return -ENOMEM;
> -		kvm->arch.slot_phys[mem->slot] = phys;
> -		kvm->arch.slot_npages[mem->slot] = npages;
> -	}
> -
> -	return 0;
> -}
> -
>  static void unpin_slot(struct kvm *kvm, int slot_id)
>  {
>  	unsigned long *physp;
> @@ -1343,6 +1323,47 @@ static void unpin_slot(struct kvm *kvm, int slot_id)
>  	}
>  }
>  
> +int kvmppc_core_prepare_memory_region(struct kvm *kvm,
> +				      struct kvm_memory_slot *memslot,
> +				      struct kvm_memory_slot *old,
> +				      struct kvm_userspace_memory_region *mem)
> +{
> +	unsigned long npages;
> +	unsigned long *phys;
> +
> +	/* Are we creating, deleting, or modifying the slot? */
> +	if (!memslot->npages) {
> +		/* deleting */
> +		kvmppc_unmap_memslot(kvm, old);
> +		if (!kvm->arch.using_mmu_notifiers)
> +			unpin_slot(kvm, mem->slot);
> +		return 0;
> +	}

The !memslot->npages case is handled in __kvm_set_memory_region
(please read that part, before kvm_arch_prepare_memory_region() call).

kvm_arch_flush_shadow should be implemented.
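
For context, the generic deletion path being referred to looked roughly
like this in kernels of this era (paraphrased and condensed from
virt/kvm/kvm_main.c, not verbatim):

	/* __kvm_set_memory_region(), deletion path (paraphrased) */
	if (!npages) {
		struct kvm_memory_slot *slot;

		slot = id_to_memslot(slots, mem->slot);
		slot->flags |= KVM_MEMSLOT_INVALID;

		old_memslots = kvm->memslots;
		rcu_assign_pointer(kvm->memslots, slots);
		synchronize_srcu_expedited(&kvm->srcu);

		/* no new references to the dying slot can be created now;
		 * drop whatever translations still point into it */
		kvm_arch_flush_shadow(kvm);
		kfree(old_memslots);
	}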

> +	if (old->npages) {
> +		/* modifying guest_phys or flags */
> +		if (old->base_gfn != memslot->base_gfn)
> +			kvmppc_unmap_memslot(kvm, old);

This case is also handled generically by the last kvm_arch_flush_shadow
call in __kvm_set_memory_region.

> +		if (memslot->dirty_bitmap &&
> +		    old->dirty_bitmap != memslot->dirty_bitmap)
> +			kvmppc_hv_get_dirty_log(kvm, old);
> +		return 0;
> +	}

Better clear dirty log unconditionally on kvm_arch_commit_memory_region,
similarly to x86 (just so it's consistent).
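
A minimal sketch of that suggestion on the HV side (hypothetical code
following Marcelo's proposal, not the posted patch; kvmppc_hv_get_dirty_log
is the function from the patch above):

	void kvmppc_core_commit_memory_region(struct kvm *kvm,
				struct kvm_userspace_memory_region *mem)
	{
		struct kvm_memory_slot *memslot;

		memslot = gfn_to_memslot(kvm, mem->guest_phys_addr >> PAGE_SHIFT);
		/* unconditionally reset the dirty state when logging is on,
		 * so tracking always starts from a clean baseline */
		if (memslot && (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES))
			kvmppc_hv_get_dirty_log(kvm, memslot);
	}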

> --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> @@ -81,7 +81,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
>  	ptel = rev->guest_rpte |= rcbits;
>  	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
>  	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
> -	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
> +	if (!memslot)
>  		return;

Why remove this check? (i don't know why it was there in the first
place, just checking).

Paul Mackerras Aug. 10, 2012, 12:34 a.m. UTC | #2
On Thu, Aug 09, 2012 at 03:16:12PM -0300, Marcelo Tosatti wrote:

> The !memslot->npages case is handled in __kvm_set_memory_region
> (please read that part, before kvm_arch_prepare_memory_region() call).
> 
> kvm_arch_flush_shadow should be implemented.

Book3S HV doesn't have shadow page tables per se, rather the hardware
page table is under the control of the hypervisor (i.e. KVM), and
entries are added and removed by the guest using hypercalls.  On
recent machines (POWER7) the hypervisor can choose whether or not to
have the hardware PTE point to a real page of memory; if it doesn't,
access by the guest will trap to the hypervisor.  On older machines
(PPC970) we don't have that flexibility, and we have to provide a real
page of memory (i.e. RAM or I/O) behind every hardware PTE.  (This is
because PPC970 provides no way for page faults in the guest to go to
the hypervisor.)

I could implement kvm_arch_flush_shadow to remove the backing pages
behind every hardware PTE, but that would be very slow and inefficient
on POWER7, and would break the guest on PPC970, particularly in the
case where userspace is removing a small memory slot containing some
I/O device and leaving the memory slot for system RAM untouched.

So the reason for unmapping the hardware PTEs in
kvm_arch_prepare_memory_region rather than kvm_arch_flush_shadow is
that that way we know which memslot is going away.

What exactly are the semantics of kvm_arch_flush_shadow?  I presume
that on x86 with NPT/EPT it basically does nothing - is that right?

> > +	if (old->npages) {
> > +		/* modifying guest_phys or flags */
> > +		if (old->base_gfn != memslot->base_gfn)
> > +			kvmppc_unmap_memslot(kvm, old);
> 
> This case is also handled generically by the last kvm_arch_flush_shadow
> call in __kvm_set_memory_region.

Again, to use this we would need to know which memslot we're
flushing.  If we could change __kvm_set_memory_region to pass the
memslot for these kvm_arch_flush_shadow calls, then I could do as you
suggest.  (Though I would need to think carefully about what would
happen with guest invalidations of hardware PTEs in the interval
between the rcu_assign_pointer(kvm->memslots, slots) and the
kvm_arch_flush_shadow, and whether the invalidation would find the
correct location in the rmap array, given that we have updated the
base_gfn in the memslot without first getting rid of any references to
those pages in the hardware page table.)

> > +		if (memslot->dirty_bitmap &&
> > +		    old->dirty_bitmap != memslot->dirty_bitmap)
> > +			kvmppc_hv_get_dirty_log(kvm, old);
> > +		return 0;
> > +	}
> 
> Better clear dirty log unconditionally on kvm_arch_commit_memory_region,
> similarly to x86 (just so it's consistent).

OK.

> > --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> > +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> > @@ -81,7 +81,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
> >  	ptel = rev->guest_rpte |= rcbits;
> >  	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
> >  	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
> > -	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
> > +	if (!memslot)
> >  		return;
> 
> Why remove this check? (i don't know why it was there in the first
> place, just checking).

This is where we are removing the page backing a hardware PTE and thus
removing the hardware PTE from the reverse-mapping list for the page.
We want to be able to do that properly even if the memslot is in the
process of going away.  I had the flags check in there originally
because other places that used a memslot had that check, but when I
read __kvm_set_memory_region more carefully I realized that the
KVM_MEMSLOT_INVALID flag indicates that we should not create any more
references to pages in the memslot, but we do still need to be able to
handle references going away, i.e. pages in the memslot getting
unmapped.

Paul.
Marcelo Tosatti Aug. 10, 2012, 1:25 a.m. UTC | #3
On Fri, Aug 10, 2012 at 10:34:39AM +1000, Paul Mackerras wrote:
> On Thu, Aug 09, 2012 at 03:16:12PM -0300, Marcelo Tosatti wrote:
> 
> > The !memslot->npages case is handled in __kvm_set_memory_region
> > (please read that part, before kvm_arch_prepare_memory_region() call).
> > 
> > kvm_arch_flush_shadow should be implemented.
> 
> Book3S HV doesn't have shadow page tables per se, rather the hardware
> page table is under the control of the hypervisor (i.e. KVM), and
> entries are added and removed by the guest using hypercalls.  On
> recent machines (POWER7) the hypervisor can choose whether or not to
> have the hardware PTE point to a real page of memory; if it doesn't,
> access by the guest will trap to the hypervisor.  On older machines
> (PPC970) we don't have that flexibility, and we have to provide a real
> page of memory (i.e. RAM or I/O) behind every hardware PTE.  (This is
> because PPC970 provides no way for page faults in the guest to go to
> the hypervisor.)
> 
> I could implement kvm_arch_flush_shadow to remove the backing pages
> behind every hardware PTE, but that would be very slow and inefficient
> on POWER7, and would break the guest on PPC970, particularly in the
> case where userspace is removing a small memory slot containing some
> I/O device and leaving the memory slot for system RAM untouched.
> 
> So the reason for unmapping the hardware PTEs in
> kvm_arch_prepare_memory_region rather than kvm_arch_flush_shadow is
> that that way we know which memslot is going away.
> 
> What exactly are the semantics of kvm_arch_flush_shadow?  

It removes all translations mapped via memslots. It's used in cases where
the translations become stale, or during shutdown.

> I presume that on x86 with NPT/EPT it basically does nothing - is that right?

It does do something: it removes all NPT/EPT ptes (named "sptes" in arch/x86/kvm/).
The translations are rebuilt on demand (when accesses by the guest fault
into the HV).

> > > +	if (old->npages) {
> > > +		/* modifying guest_phys or flags */
> > > +		if (old->base_gfn != memslot->base_gfn)
> > > +			kvmppc_unmap_memslot(kvm, old);
> > 
> > This case is also handled generically by the last kvm_arch_flush_shadow
> > call in __kvm_set_memory_region.
> 
> Again, to use this we would need to know which memslot we're
> flushing.  If we could change __kvm_set_memory_region to pass the
> memslot for these kvm_arch_flush_shadow calls, then I could do as you
> suggest.  (Though I would need to think carefully about what would
> happen with guest invalidations of hardware PTEs in the interval
> between the rcu_assign_pointer(kvm->memslots, slots) and the
> kvm_arch_flush_shadow, and whether the invalidation would find the
> correct location in the rmap array, given that we have updated the
> base_gfn in the memslot without first getting rid of any references to
> those pages in the hardware page table.)

That can be done.

I'll send a patch to flush per memslot in the next days, you can work
out the PPC details in the meantime.

To be clear: this is necessary to have consistent behaviour across
arches in the kvm_set_memory codepath which is tricky (not nitpicking).

Alternatively, kvm_arch_flush_shadow can be split into two methods (but
that's not necessary if memslot information is sufficient for PPC).
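
A split along those lines would give two entry points something like the
following (hypothetical names and signatures):

	/* flush everything: mmu notifier release, VM destruction */
	void kvm_arch_flush_shadow_all(struct kvm *kvm);

	/* flush only translations backed by one slot; Book3S HV could
	 * implement this by walking that slot's rmap array, much as
	 * kvmppc_unmap_memslot() does in the patch above */
	void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
					   struct kvm_memory_slot *slot);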

> > > +		if (memslot->dirty_bitmap &&
> > > +		    old->dirty_bitmap != memslot->dirty_bitmap)
> > > +			kvmppc_hv_get_dirty_log(kvm, old);
> > > +		return 0;
> > > +	}
> > 
> > Better clear dirty log unconditionally on kvm_arch_commit_memory_region,
> > similarly to x86 (just so it's consistent).
> 
> OK.
> 
> > > --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> > > +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> > > @@ -81,7 +81,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
> > >  	ptel = rev->guest_rpte |= rcbits;
> > >  	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
> > >  	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
> > > -	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
> > > +	if (!memslot)
> > >  		return;
> > 
> > Why remove this check? (i don't know why it was there in the first
> > place, just checking).
> 
> This is where we are removing the page backing a hardware PTE and thus
> removing the hardware PTE from the reverse-mapping list for the page.
> We want to be able to do that properly even if the memslot is in the
> process of going away.  I had the flags check in there originally
> because other places that used a memslot had that check, but when I
> read __kvm_set_memory_region more carefully I realized that the
> KVM_MEMSLOT_INVALID flag indicates that we should not create any more
> references to pages in the memslot, but we do still need to be able to
> handle references going away, i.e. pages in the memslot getting
> unmapped.
> 
> Paul.

Yes, that's it. kvm_arch_flush_shadow requires functional memslot lookup,
for example.

Marcelo Tosatti Aug. 10, 2012, 1:33 a.m. UTC | #4
On Thu, Aug 09, 2012 at 10:25:32PM -0300, Marcelo Tosatti wrote:
> On Fri, Aug 10, 2012 at 10:34:39AM +1000, Paul Mackerras wrote:
> > On Thu, Aug 09, 2012 at 03:16:12PM -0300, Marcelo Tosatti wrote:
> > 
> > > The !memslot->npages case is handled in __kvm_set_memory_region
> > > (please read that part, before kvm_arch_prepare_memory_region() call).
> > > 
> > > kvm_arch_flush_shadow should be implemented.
> > 
> > Book3S HV doesn't have shadow page tables per se, rather the hardware
> > page table is under the control of the hypervisor (i.e. KVM), and
> > entries are added and removed by the guest using hypercalls.  On
> > recent machines (POWER7) the hypervisor can choose whether or not to
> > have the hardware PTE point to a real page of memory; if it doesn't,
> > access by the guest will trap to the hypervisor.  On older machines
> > (PPC970) we don't have that flexibility, and we have to provide a real
> > page of memory (i.e. RAM or I/O) behind every hardware PTE.  (This is
> > because PPC970 provides no way for page faults in the guest to go to
> > the hypervisor.)
> > 
> > I could implement kvm_arch_flush_shadow to remove the backing pages
> > behind every hardware PTE, but that would be very slow and inefficient
> > on POWER7, and would break the guest on PPC970, particularly in the
> > case where userspace is removing a small memory slot containing some
> > I/O device and leaving the memory slot for system RAM untouched.
> > 
> > So the reason for unmapping the hardware PTEs in
> > kvm_arch_prepare_memory_region rather than kvm_arch_flush_shadow is
> > that that way we know which memslot is going away.
> > 
> > What exactly are the semantics of kvm_arch_flush_shadow?  
> 
> It removes all translations mapped via memslots. It's used in cases where
> the translations become stale, or during shutdown.
> 
> > I presume that on x86 with NPT/EPT it basically does nothing - is that right?
> 
> It does do something: it removes all NPT/EPT ptes (named "sptes" in arch/x86/kvm/).
> The translations are rebuilt on demand (when accesses by the guest fault
> into the HV).
> 
> > > > +	if (old->npages) {
> > > > +		/* modifying guest_phys or flags */
> > > > +		if (old->base_gfn != memslot->base_gfn)
> > > > +			kvmppc_unmap_memslot(kvm, old);
> > > 
> > > This case is also handled generically by the last kvm_arch_flush_shadow
> > > call in __kvm_set_memory_region.
> > 
> > Again, to use this we would need to know which memslot we're
> > flushing.  If we could change __kvm_set_memory_region to pass the
> > memslot for these kvm_arch_flush_shadow calls, then I could do as you
> > suggest.  (Though I would need to think carefully about what would
> > happen with guest invalidations of hardware PTEs in the interval
> > between the rcu_assign_pointer(kvm->memslots, slots) and the
> > kvm_arch_flush_shadow, and whether the invalidation would find the
> > correct location in the rmap array, given that we have updated the
> > base_gfn in the memslot without first getting rid of any references to
> > those pages in the hardware page table.)
> 
> That can be done.
> 
> I'll send a patch to flush per memslot in the next days, you can work
> out the PPC details in the meantime.
> 
> To be clear: this is necessary to have consistent behaviour across
> arches in the kvm_set_memory codepath which is tricky (not nitpicking).
> 
> Alternatively, kvm_arch_flush_shadow can be split into two methods (but
> that's not necessary if memslot information is sufficient for PPC).

What kvm_set_memory requires is that there are no stale translations. On
x86, it is cheaper to nuke all translation entries and let them be rebuilt on
pagefaults.

But, if necessary we can introduce a new callback
"kvm_arch_sync_shadow", which can be used by PPC to verify translations 
are valid, if it makes sense.

Takuya Yoshikawa Aug. 10, 2012, 2:09 a.m. UTC | #5
On Thu, 9 Aug 2012 22:25:32 -0300
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> I'll send a patch to flush per memslot in the next days, you can work
> out the PPC details in the meantime.

Are you going to implement that using slot_bitmap?

Since I'm now converting kvm_mmu_slot_remove_write_access() to
rmap based protection, I'd like to hear your plan.

If you can use the stale memslot to restrict the flush, that
should live with rmap based protection.

Thanks,
	Takuya
Marcelo Tosatti Aug. 10, 2012, 6:35 p.m. UTC | #6
On Fri, Aug 10, 2012 at 11:09:09AM +0900, Takuya Yoshikawa wrote:
> On Thu, 9 Aug 2012 22:25:32 -0300
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> 
> > I'll send a patch to flush per memslot in the next days, you can work
> > out the PPC details in the meantime.

Not anymore.

> Are you going to implement that using slot_bitmap?
> 
> Since I'm now converting kvm_mmu_slot_remove_write_access() to
> rmap based protection, I'd like to hear your plan.
> 
> If you can use the stale memslot to restrict the flush, that
> should live with rmap based protection.

There's no plan. I just wanted to confirm this before converting 
to per-memslot flush.

1) __kvm_set_memory_region line 807:

                 *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
                 *      - kvm_is_visible_gfn (mmu_check_roots)
                 */
                kvm_arch_flush_shadow(kvm);
                kfree(old_memslots);
        }

This can be converted to per-memslot flush.

2) __kvm_set_memory_region line 846:

        /*
         * If the new memory slot is created, we need to clear all
         * mmio sptes.
         */
        if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
                kvm_arch_flush_shadow(kvm);

This must flush all translations in the new and old GPA ranges:
	a) to remove any mmio sptes that existed in the new GPA range
	   (see ce88decffd17bf9f373cc233c for a description).
	b) to remove any translations in the old range (this is
           necessary because change of GPA base is allowed).

Alex/Paul, slot creation should be rare (and modification of GPA base
should not be used, even though it is supported). But if you really need
a new callback, the two points above are what the second call needs (x86
will flush everything).

The other two kvm_arch_flush_shadow in kvm_mmu_notifier_release and
kvm_destroy_vm must remove all sptes obviously.
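
With a per-memslot callback, call site (1) could then be narrowed to the
slot being dropped, roughly as follows (a sketch using the hypothetical
kvm_arch_flush_shadow_memslot signature from earlier in the thread):

	/* sketch: after publishing the slot marked KVM_MEMSLOT_INVALID */
	rcu_assign_pointer(kvm->memslots, slots);
	synchronize_srcu_expedited(&kvm->srcu);
	/* only the translations of the dying slot need to go */
	kvm_arch_flush_shadow_memslot(kvm, &old);
	kfree(old_memslots);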

Paul Mackerras Aug. 11, 2012, 12:37 a.m. UTC | #7
On Fri, Aug 10, 2012 at 03:35:53PM -0300, Marcelo Tosatti wrote:
> There's no plan. I just wanted to confirm this before converting 
> to per-memslot flush.
> 
> 1) __kvm_set_memory_region line 807:
> 
>                  *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
>                  *      - kvm_is_visible_gfn (mmu_check_roots)
>                  */
>                 kvm_arch_flush_shadow(kvm);
>                 kfree(old_memslots);
>         }
> 
> This can be converted to per-memslot flush.

Yes, that would be good.

> 2) __kvm_set_memory_region line 846:
> 
>         /*
>          * If the new memory slot is created, we need to clear all
>          * mmio sptes.
>          */
>         if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
>                 kvm_arch_flush_shadow(kvm);
> 
> This must flush all translations in the new and old GPA ranges:
> 	a) to remove any mmio sptes that existed in the new GPA range
> 	   (see ce88decffd17bf9f373cc233c for a description).

I assume that mmio sptes are ones that are created for accesses to
guest physical addresses where there is no memslot.  On Book3S HV we
trap those accesses (on POWER7 at least) but we don't create any
hardware PTEs for them.  So I don't have anything to do here.

> 	b) to remove any translations in the old range (this is
>            necessary because change of GPA base is allowed).

In fact I need to remove any translations in the old range *before*
the new memslot gets committed, whereas this happens after that.
This is why I was doing the flush in kvm_arch_prepare_memory_region.

If the new memslot got committed while there were still some
translations left in the hardware page table, it's possible that the
guest could ask to remove one of those hardware PTEs.  As part of that
process, I get the GPA that the HPTE referred to, look up which
memslot it belongs to, and use the offset from the base_gfn of the
memslot to index into the memslot's rmap array.  If the base_gfn has
been changed then I will hit the wrong rmap entry, or possibly not
find the memslot at all, and bad things will happen.

Basically, the rmap array has to be empty before we commit the new
memslot.
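
Concretely, the lookup being described is the one visible in
remove_revmap_chain() in the patch above; condensed (sketch):

	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
	if (!memslot)
		return;
	/* offset into this slot's rmap array; if base_gfn changed while
	 * old HPTEs still existed, this indexes the wrong entry (or the
	 * slot lookup above fails entirely) */
	rmap = &memslot->rmap[gfn - memslot->base_gfn];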

> Alex/Paul, slot creation should be rare (and modification of GPA base
> should not be used, even though it is supported). But if you really need
> a new callback, the two points above are what the second call needs (x86
> will flush everything).
> 
> The other two kvm_arch_flush_shadow in kvm_mmu_notifier_release and
> kvm_destroy_vm must remove all sptes obviously.

Will these be the only remaining call sites for kvm_arch_flush_shadow?
I can see an optimization here if no vcpus are running (which should
be the case) -- I can set a flag to say that the hardware page table
is completely invalid, and if any vcpu ever does try to run again,
clear out the hardware page table and flush all TLBs at that point.
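
That optimization might look something like this (hypothetical flag and
helper names, purely illustrative):

	/* kvm_arch_flush_shadow() with no vcpus running: just mark it */
	kvm->arch.hpt_invalid = true;		/* hypothetical flag */

	/* ...and on the next vcpu entry: */
	if (unlikely(kvm->arch.hpt_invalid)) {
		kvmppc_clear_hpt(kvm);		/* hypothetical helper */
		kvmppc_flush_guest_tlbs(kvm);	/* hypothetical helper */
		kvm->arch.hpt_invalid = false;
	}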

Paul.
Marcelo Tosatti Aug. 13, 2012, 4:34 p.m. UTC | #8
On Sat, Aug 11, 2012 at 10:37:54AM +1000, Paul Mackerras wrote:
> On Fri, Aug 10, 2012 at 03:35:53PM -0300, Marcelo Tosatti wrote:
> > There's no plan. I just wanted to confirm this before converting 
> > to per-memslot flush.
> > 
> > 1) __kvm_set_memory_region line 807:
> > 
> >                  *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
> >                  *      - kvm_is_visible_gfn (mmu_check_roots)
> >                  */
> >                 kvm_arch_flush_shadow(kvm);
> >                 kfree(old_memslots);
> >         }
> > 
> > This can be converted to per-memslot flush.
> 
> Yes, that would be good.
> 
> > 2) __kvm_set_memory_region line 846:
> > 
> >         /*
> >          * If the new memory slot is created, we need to clear all
> >          * mmio sptes.
> >          */
> >         if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
> >                 kvm_arch_flush_shadow(kvm);
> > 
> > This must flush all translations in the new and old GPA ranges:
> > 	a) to remove any mmio sptes that existed in the new GPA range
> > 	   (see ce88decffd17bf9f373cc233c for a description).
> 
> I assume that mmio sptes are ones that are created for accesses to
> guest physical addresses where there is no memslot.  On Book3S HV we
> trap those accesses (on POWER7 at least) but we don't create any
> hardware PTEs for them.  So I don't have anything to do here.
> 
> > 	b) to remove any translations in the old range (this is
> >            necessary because change of GPA base is allowed).
> 
> In fact I need to remove any translations in the old range *before*
> the new memslot gets committed, whereas this happens after that.
> This is why I was doing the flush in kvm_arch_prepare_memory_region.

I see. The problem with that is, given that the fault path (reader 
of memslots) is protected only by SRCU, the following window is open:

A) kvm_arch_prepare_memory_region
B) rcu_assign_pointer(kvm->memslots, slots)
C) kvm_arch_commit_memory_region

In the window between A and B, a guest pagefault can instantiate a new
translation using the old memslot information (while you have already
removed any translations in the old range).
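
The reader side that keeps this window open is, schematically (a generic
sketch of an SRCU-protected fault path, not any one arch's code):

	int idx = srcu_read_lock(&kvm->srcu);
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

	if (slot) {
		/* between A and B this can still be the old slot, so a
		 * fresh translation is instantiated from information the
		 * prepare hook has already flushed */
	}
	srcu_read_unlock(&kvm->srcu, idx);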


Avi, Gleb, Alex, do you know why it is necessary to support change of
GPA base again? 

Without taking into consideration backwards compatibility, userspace 
can first delete the slot and later create a new one.

> If the new memslot got committed while there were still some
> translations left in the hardware page table, it's possible that the
> guest could ask to remove one of those hardware PTEs.  As part of that
> process, I get the GPA that the HPTE referred to, look up which
> memslot it belongs to, and use the offset from the base_gfn of the
> memslot to index into the memslot's rmap array.  If the base_gfn has
> been changed then I will hit the wrong rmap entry, or possibly not
> find the memslot at all, and bad things will happen.

So, that notification before-commit-of-new-memslot is only necessary for
base change, correct? That is, you don't need that pre notification for
entirely new memslots in a previously unpopulated range.

> Basically, the rmap array has to be empty before we commit the new
> memslot.
> 
> > Alex/Paul, slot creation should be rare (and modification of GPA base
> > should not be used, even though it is supported). But if you really need
> > a new callback, the two points above are what the second call needs (x86
> > will flush everything).
> > 
> > The other two kvm_arch_flush_shadow in kvm_mmu_notifier_release and
> > kvm_destroy_vm must remove all sptes obviously.
> 
> Will these be the only remaining call sites for kvm_arch_flush_shadow?

Yes, should be. OK, let's first get the callbacks in
kvm_set_memory_region right and later you can restrict
kvm_arch_flush_shadow as necessary.

> I can see an optimization here if no vcpus are running (which should
> be the case) -- I can set a flag to say that the hardware page table
> is completely invalid, and if any vcpu ever does try to run again,
> clear out the hardware page table and flush all TLBs at that point.
> 
> Paul.
Marcelo Tosatti Aug. 13, 2012, 10:04 p.m. UTC | #9
On Mon, Aug 13, 2012 at 01:34:11PM -0300, Marcelo Tosatti wrote:
> On Sat, Aug 11, 2012 at 10:37:54AM +1000, Paul Mackerras wrote:
> > On Fri, Aug 10, 2012 at 03:35:53PM -0300, Marcelo Tosatti wrote:
> > > There's no plan. I just wanted to confirm this before converting 
> > > to per-memslot flush.
> > > 
> > > 1) __kvm_set_memory_region line 807:
> > > 
> > >                  *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
> > >                  *      - kvm_is_visible_gfn (mmu_check_roots)
> > >                  */
> > >                 kvm_arch_flush_shadow(kvm);
> > >                 kfree(old_memslots);
> > >         }
> > > 
> > > This can be converted to per-memslot flush.
> > 
> > Yes, that would be good.
> > 
> > > 2) __kvm_set_memory_region line 846:
> > > 
> > >         /*
> > >          * If the new memory slot is created, we need to clear all
> > >          * mmio sptes.
> > >          */
> > >         if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
> > >                 kvm_arch_flush_shadow(kvm);
> > > 
> > > This must flush all translations in the new and old GPA ranges:
> > > 	a) to remove any mmio sptes that existed in the new GPA range
> > > 	   (see ce88decffd17bf9f373cc233c for a description).
> > 
> > I assume that mmio sptes are ones that are created for accesses to
> > guest physical addresses where there is no memslot.  On Book3S HV we
> > trap those accesses (on POWER7 at least) but we don't create any
> > hardware PTEs for them.  So I don't have anything to do here.
> > 
> > > 	b) to remove any translations in the old range (this is
> > >            necessary because change of GPA base is allowed).
> > 
> > In fact I need to remove any translations in the old range *before*
> > the new memslot gets committed, whereas this happens after that.
> > This is why I was doing the flush in kvm_arch_prepare_memory_region.
> 
> I see. The problem with that is, given that the fault path (reader 
> of memslots) is protected only by SRCU, the following window is open:
> 
> A) kvm_arch_prepare_memory_region
> B) rcu_assign_pointer(kvm->memslots, slots)
> C) kvm_arch_commit_memory_region
> 
> In the window between A and B, a guest pagefault can instantiate a new
> translation using the old memslot information (while you have already
> removed any translations in the old range).
> 
> 
> Avi, Gleb, Alex, do you know why it is necessary to support change of
> GPA base again?
> 
> Without taking into consideration backwards compatibility, userspace 
> can first delete the slot and later create a new one.

Current userspace does not, and I can't see why older userspace would. If
we break it and someone complains, it can be fixed.

So, please:

1) Disallow change of base GPA (can do that in the "Check for overlaps" loop
in __kvm_set_memory_region), returning EINVAL.
2) Introduce kvm_arch_invalidate_memslot (replacing first
kvm_arch_flush_shadow).
3) Introduce kvm_arch_postcommit_memslot (replacing the 
second kvm_arch_flush_shadow).
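
Point 1 amounts to a small check in __kvm_set_memory_region, roughly
(sketch; the exact placement and error-return style are approximate):

	/* moving a populated slot to a new guest physical base is no
	 * longer supported; delete and re-create the slot instead */
	r = -EINVAL;
	if (npages && old.npages && base_gfn != old.base_gfn)
		goto out;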

> > If the new memslot got committed while there were still some
> > translations left in the hardware page table, it's possible that the
> > guest could ask to remove one of those hardware PTEs.  As part of that
> > process, I get the GPA that the HPTE referred to, look up which
> > memslot it belongs to, and use the offset from the base_gfn of the
> > memslot to index into the memslot's rmap array.  If the base_gfn has
> > been changed then I will hit the wrong rmap entry, or possibly not
> > find the memslot at all, and bad things will happen.
> 
> So, that notification before-commit-of-new-memslot is only necessary for
> base change, correct? That is, you don't need that pre notification for
> entirely new memslots in a previously unpopulated range.
> 
> > Basically, the rmap array has to be empty before we commit the new
> > memslot.
> > 
> > > Alex/Paul, slot creation should be rare (and modification of GPA base
> > > should not be used, even though it is supported). But if you really need
> > > a new callback, the two points above are what the second call needs (x86
> > > will flush everything).
> > > 
> > > The other two kvm_arch_flush_shadow in kvm_mmu_notifier_release and
> > > kvm_destroy_vm must remove all sptes obviously.
> > 
> > Will these be the only remaining call sites for kvm_arch_flush_shadow?
> 
> Yes, should be. OK, let's first get the callbacks in
> kvm_set_memory_region right and later you can restrict
> kvm_arch_flush_shadow as necessary.
> 
> > I can see an optimization here if no vcpus are running (which should
> > be the case) -- I can set a flag to say that the hardware page table
> > is completely invalid, and if any vcpu ever does try to run again,
> > clear out the hardware page table and flush all TLBs at that point.
> > 
> > Paul.
Paul Mackerras Aug. 15, 2012, 6:06 a.m. UTC | #10
On Mon, Aug 13, 2012 at 01:34:11PM -0300, Marcelo Tosatti wrote:
> On Sat, Aug 11, 2012 at 10:37:54AM +1000, Paul Mackerras wrote:
> > In fact I need to remove any translations in the old range *before*
> > the new memslot gets committed, whereas this happens after that.
> > This is why I was doing the flush in kvm_arch_prepare_memory_region.
> 
> I see. The problem with that is, given that the fault path (reader 
> of memslots) is protected only by SRCU, the following window is open:
> 
> A) kvm_arch_prepare_memory_region
> B) rcu_assign_pointer(kvm->memslots, slots)
> C) kvm_arch_commit_memory_region
> 
> The window between A and B where a guest pagefault can instantiate a new
> translation using the old memslot information (while you already removed
> any translations in the old range).

Yes, good point.

> Avi, Gleb, Alex, do you know why it is necessary to support change of
> GPA base again? 
> 
> Without taking into consideration backwards compatibility, userspace 
> can first delete the slot and later create a new one.

I like this approach. :)

> > If the new memslot got committed while there were still some
> > translations left in the hardware page table, it's possible that the
> > guest could ask to remove one of those hardware PTEs.  As part of that
> > process, I get the GPA that the HPTE referred to, look up which
> > memslot it belongs to, and use the offset from the base_gfn of the
> > memslot to index into the memslot's rmap array.  If the base_gfn has
> > been changed then I will hit the wrong rmap entry, or possibly not
> > find the memslot at all, and bad things will happen.
> 
> So, that notification before-commit-of-new-memslot is only necessary for
> base change, correct? That is, you don't need that pre notification for
> entirely new memslots in a previously unpopulated range.

Correct.

Paul.
Avi Kivity Aug. 15, 2012, 9:23 a.m. UTC | #11
On 08/13/2012 07:34 PM, Marcelo Tosatti wrote:
> 
> Avi, Gleb, Alex, do you know why it is necessary to support change of
> GPA base again? 

BAR moving around.

> Without taking into consideration backwards compatibility, userspace 
> can first delete the slot and later create a new one.

Current qemu will in fact do that.  Not sure about older ones.

There is a difference in that the existing method is atomic.  I don't
see a real need for this atomicity though.
Avi Kivity Aug. 15, 2012, 9:26 a.m. UTC | #12
On 08/14/2012 01:04 AM, Marcelo Tosatti wrote:
> On Mon, Aug 13, 2012 at 01:34:11PM -0300, Marcelo Tosatti wrote:
>> On Sat, Aug 11, 2012 at 10:37:54AM +1000, Paul Mackerras wrote:
>> > On Fri, Aug 10, 2012 at 03:35:53PM -0300, Marcelo Tosatti wrote:
>> > > There's no plan. I just wanted to confirm this before converting 
>> > > to per-memslot flush.
>> > > 
>> > > 1) __kvm_set_memory_region line 807:
>> > > 
>> > >                  *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
>> > >                  *      - kvm_is_visible_gfn (mmu_check_roots)
>> > >                  */
>> > >                 kvm_arch_flush_shadow(kvm);
>> > >                 kfree(old_memslots);
>> > >         }
>> > > 
>> > > This can be converted to per-memslot flush.
>> > 
>> > Yes, that would be good.
>> > 
>> > > 2) __kvm_set_memory_region line 846:
>> > > 
>> > >         /*
>> > >          * If the new memory slot is created, we need to clear all
>> > >          * mmio sptes.
>> > >          */
>> > >         if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
>> > >                 kvm_arch_flush_shadow(kvm);
>> > > 
>> > > This must flush all translations in the new and old GPA ranges:
>> > > 	a) to remove any mmio sptes that existed in the new GPA range
>> > > 	   (see ce88decffd17bf9f373cc233c for a description).
>> > 
>> > I assume that mmio sptes are ones that are created for accesses to
>> > guest physical addresses where there is no memslot.  On Book3S HV we
>> > trap those accesses (on POWER7 at least) but we don't create any
>> > hardware PTEs for them.  So I don't have anything to do here.
>> > 
>> > > 	b) to remove any translations in the old range (this is
>> > >            necessary because change of GPA base is allowed).
>> > 
>> > In fact I need to remove any translations in the old range *before*
>> > the new memslot gets committed, whereas this happens after that.
>> > This is why I was doing the flush in kvm_arch_prepare_memory_region.
>> 
>> I see. The problem with that is, given that the fault path (reader 
>> of memslots) is protected only by SRCU, the following window is open:
>> 
>> A) kvm_arch_prepare_memory_region
>> B) rcu_assign_pointer(kvm->memslots, slots)
>> C) kvm_arch_commit_memory_region
>> 
>> In the window between A and B, a guest pagefault can instantiate a new
>> translation using the old memslot information (while you have already
>> removed any translations in the old range).
>> 
>> 
>> Avi, Gleb, Alex, do you know why it is necessary to support change of
>> GPA base again?
>> 
>> Without taking into consideration backwards compatibility, userspace 
>> can first delete the slot and later create a new one.
> 
> Current userspace does not, and I can't see why older userspace would. If
> we break it and someone complains, it can be fixed.
> 
> So, please:
> 
> 1) Disallow change of base GPA (can do that in the "Check for overlaps" loop
> in __kvm_set_memory_region), returning EINVAL.
> 2) Introduce kvm_arch_invalidate_memslot (replacing first
> kvm_arch_flush_shadow).
> 3) Introduce kvm_arch_postcommit_memslot (replacing the 
> second kvm_arch_flush_shadow).


Is there a reason why the kernel can't do the same?  First remove the
old slot, then add the new one?

We break atomicity, but it's less likely that guests rely on it than an
old qemu not relying on gpa-changing memory update.
Marcelo Tosatti Aug. 15, 2012, 5:59 p.m. UTC | #13
On Wed, Aug 15, 2012 at 12:26:05PM +0300, Avi Kivity wrote:
> On 08/14/2012 01:04 AM, Marcelo Tosatti wrote:
> > On Mon, Aug 13, 2012 at 01:34:11PM -0300, Marcelo Tosatti wrote:
> >> On Sat, Aug 11, 2012 at 10:37:54AM +1000, Paul Mackerras wrote:
> >> > On Fri, Aug 10, 2012 at 03:35:53PM -0300, Marcelo Tosatti wrote:
> >> > > There's no plan. I just wanted to confirm this before converting 
> >> > > to per-memslot flush.
> >> > > 
> >> > > 1) __kvm_set_memory_region line 807:
> >> > > 
> >> > >                  *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
> >> > >                  *      - kvm_is_visible_gfn (mmu_check_roots)
> >> > >                  */
> >> > >                 kvm_arch_flush_shadow(kvm);
> >> > >                 kfree(old_memslots);
> >> > >         }
> >> > > 
> >> > > This can be converted to per-memslot flush.
> >> > 
> >> > Yes, that would be good.
> >> > 
> >> > > 2) __kvm_set_memory_region line 846:
> >> > > 
> >> > >         /*
> >> > >          * If the new memory slot is created, we need to clear all
> >> > >          * mmio sptes.
> >> > >          */
> >> > >         if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
> >> > >                 kvm_arch_flush_shadow(kvm);
> >> > > 
> >> > > This must flush all translations in the new and old GPA ranges:
> >> > > 	a) to remove any mmio sptes that existed in the new GPA range
> >> > > 	   (see ce88decffd17bf9f373cc233c for a description).
> >> > 
> >> > I assume that mmio sptes are ones that are created for accesses to
> >> > guest physical addresses where there is no memslot.  On Book3S HV we
> >> > trap those accesses (on POWER7 at least) but we don't create any
> >> > hardware PTEs for them.  So I don't have anything to do here.
> >> > 
> >> > > 	b) to remove any translations in the old range (this is
> >> > >            necessary because change of GPA base is allowed).
> >> > 
> >> > In fact I need to remove any translations in the old range *before*
> >> > the new memslot gets committed, whereas this happens after that.
> >> > This is why I was doing the flush in kvm_arch_prepare_memory_region.
> >> 
> >> I see. The problem with that is, given that the fault path (reader 
> >> of memslots) is protected only by SRCU, the following window is open:
> >> 
> >> A) kvm_arch_prepare_memory_region
> >> B) rcu_assign_pointer(kvm->memslots, slots)
> >> C) kvm_arch_commit_memory_region
> >> 
> >> In the window between A and B, a guest pagefault can instantiate a new
> >> translation using the old memslot information (while you have already
> >> removed any translations in the old range).
> >> 
> >> 
> >> Avi, Gleb, Alex, do you know why it is necessary to support change of
> >> GPA base again?
> >> 
> >> Without taking into consideration backwards compatibility, userspace 
> >> can first delete the slot and later create a new one.
> > 
> > Current userspace does not, and I can't see why older userspace would. If
> > we break it and someone complains, it can be fixed.
> > 
> > So, please:
> > 
> > 1) Disallow change of base GPA (can do that in the "Check for overlaps" loop
> > in __kvm_set_memory_region), returning EINVAL.
> > 2) Introduce kvm_arch_invalidate_memslot (replacing first
> > kvm_arch_flush_shadow).
> > 3) Introduce kvm_arch_postcommit_memslot (replacing the 
> > second kvm_arch_flush_shadow).
> 
> 
> Is there a reason why the kernel can't do the same?  First remove the
> old slot, then add the new one?
> 
> We break atomicity, but it's less likely that guests rely on it than an
> old qemu not relying on gpa-changing memory update.

The guest should not expect memory accesses to an address
to behave sanely while changing a BAR anyway.

Yes, compatibility for change of GPA base can be done in the
kernel. I can look into it next week if nobody has done so at
that point.

Thanks Avi.

Benjamin Herrenschmidt Aug. 17, 2012, 7:06 a.m. UTC | #14
On Wed, 2012-08-15 at 14:59 -0300, Marcelo Tosatti wrote:
> 
> The guest should not expect memory accesses to an address
> to behave sanely while changing a BAR anyway.
> 
> Yes, compatibility for change of GPA base can be done in the
> kernel. I can look into it next week if nobody has done so at
> that point. 

There's one thing to be extra careful about here, which is if we start
doing that for normal memory (in case we start breaking it up
into slots, such as NUMA setups etc...).

The problem is that we must not allow normal memory accesses to be
handled via the "emulation" code (ie MMIO emulation or load/store
emulation, whatever we call it).

Part of the issue is that on architectures that don't use IPIs for
TLB invalidations but instead use some form of HW broadcast such as
PowerPC or ARM, there is an inherent race in that the emulation code can
keep a guest physical address (and perform the relevant access to the
corresponding memory region) way beyond the point where the guest
virtual->physical translation leading to that address has been
invalidated.

This doesn't happen on x86 because essentially the completion of the
invalidation IPI has to wait for all VCPUs to "respond" and thus to
finish whatever emulation they are doing. This is not the case on archs
with a HW invalidate broadcast.

This is a nasty race, and while we more/less decided that it was
survivable as long as we only go through emulation for devices (as we
don't play swapping games with them in the guest kernel), the minute we
allow normal guest memory access to "slip through", we have broken the
guest virtual memory model.

So if we are manipulating memory slots used for guest RAM we must -not-
break atomicity, since during the time the slot is gone, it will
fall back to emulation, introducing the above race (at least on PowerPC
and ARM).

Cheers,
Ben.


Marcelo Tosatti Aug. 17, 2012, 6:39 p.m. UTC | #15
On Fri, Aug 17, 2012 at 05:06:18PM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2012-08-15 at 14:59 -0300, Marcelo Tosatti wrote:
> > 
> > The guest should not expect memory accesses to an address
> > to behave sanely while changing a BAR anyway.
> > 
> > Yes, compatibility for change of GPA base can be done in the
> > kernel. I can look into it next week if nobody has done so at
> > that point. 
> 
> There's one thing to be extra careful about here, which is if we start
> doing that for normal memory (in case we start breaking it up
> into slots, such as NUMA setups etc...).
> 
> The problem is that we must not allow normal memory accesses to be
> handled via the "emulation" code (ie MMIO emulation or load/store
> emulation, whatever we call it).
> 
> Part of the issue is that on architectures that don't use IPIs for
> TLB invalidations but instead use some form of HW broadcast such as
> PowerPC or ARM, there is an inherent race in that the emulation code can
> keep a guest physical address (and perform the relevant access to the
> corresponding memory region) way beyond the point where the guest
> virtual->physical translation leading to that address has been
> invalidated.
> 
> This doesn't happen on x86 because essentially the completion of the
> invalidation IPI has to wait for all VCPUs to "respond" and thus to
> finish whatever emulation they are doing. This is not the case on archs
> with a HW invalidate broadcast.
> 
> This is a nasty race, and while we more/less decided that it was
> survivable as long as we only go through emulation for devices (as we
> don't play swapping games with them in the guest kernel), the minute we
> allow normal guest memory access to "slip through", we have broken the
> guest virtual memory model.

This emulation is in hardware, yes? It is the lack of a TLB entry (or
the lack of a valid pagetable to fill the TLB) that triggers it?

> So if we are manipulating memory slots used for guest RAM we must -not-
> break atomicity, since during the time the slot is gone, it will
> fall back to emulation, introducing the above race (at least on PowerPC
> and ARM).

You could, say, get the vcpus to a known state (which has a side effect of
making sure that emulation is stopped), no? (Just as a mental exercise.)

> Cheers,
> Ben.

Yes. Well, Avi mentioned earlier that there are users for change of GPA
base. But, if my understanding is correct, the code that emulates
change of BAR in QEMU is:

        /* now do the real mapping */
        if (r->addr != PCI_BAR_UNMAPPED) {
            memory_region_del_subregion(r->address_space, r->memory);
        }
        r->addr = new_addr;
        if (r->addr != PCI_BAR_UNMAPPED) {
            memory_region_add_subregion_overlap(r->address_space,
                                                r->addr, r->memory, 1);

These translate to two kvm_set_user_memory ioctls. 

"> Without taking into consideration backwards compatibility, userspace 
 > can first delete the slot and later create a new one.

 Current qemu will in fact do that.  Not sure about older ones.
"

Avi, where does it do that?


Benjamin Herrenschmidt Aug. 17, 2012, 8:32 p.m. UTC | #16
On Fri, 2012-08-17 at 15:39 -0300, Marcelo Tosatti wrote:
> On Fri, Aug 17, 2012 at 05:06:18PM +1000, Benjamin Herrenschmidt wrote:
> > On Wed, 2012-08-15 at 14:59 -0300, Marcelo Tosatti wrote:
> > > 
> > > The guest should not expect memory accesses to an address
> > > to behave sanely while changing a BAR anyway.
> > > 
> > > Yes, compatibility for change of GPA base can be done in the
> > > kernel. I can look into it next week if nobody has done so at
> > > that point. 
> > 
> > There's one thing to be extra careful about here, which is if we start
> > doing that for normal memory (in case we start breaking it up
> > into slots, such as NUMA setups etc...).
> > 
> > The problem is that we must not allow normal memory accesses to be
> > handled via the "emulation" code (ie MMIO emulation or load/store
> > emulation, whatever we call it).
> > 
> > Part of the issue is that on architectures that don't use IPIs for
> > TLB invalidations but instead use some form of HW broadcast such as
> > PowerPC or ARM, there is an inherent race in that the emulation code can
> > keep a guest physical address (and perform the relevant access to the
> > corresponding memory region) way beyond the point where the guest
> > virtual->physical translation leading to that address has been
> > invalidated.
> > 
> > This doesn't happen on x86 because essentially the completion of the
> > invalidation IPI has to wait for all VCPUs to "respond" and thus to
> > finish whatever emulation they are doing. This is not the case on archs
> > with a HW invalidate broadcast.
> > 
> > This is a nasty race, and while we more/less decided that it was
> > survivable as long as we only go through emulation for devices (as we
> > don't play swapping games with them in the guest kernel), the minute we
> > allow normal guest memory access to "slip through", we have broken the
> > guest virtual memory model.
> 
> This emulation is in hardware, yes? It is the lack of a TLB entry (or
> the lack of a valid pagetable to fill the TLB) that triggers it?

What do you mean? I'm talking about KVM emulating load and store
instructions (either in the kernel or passing them down to qemu).

This happens whenever an access triggers a host page fault which we
can't resolve because there is no memory slot. In that case, the access
is treated as "emulation".
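
In rough terms, the decision looks like the sketch below (only
gfn_to_memslot() is a real KVM helper; handle_guest_fault(),
emulate_mmio() and map_host_page() are illustrative names, not the
actual Book3S HV functions):

        static int handle_guest_fault(struct kvm_vcpu *vcpu, gpa_t gpa)
        {
                struct kvm_memory_slot *slot =
                        gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);

                /* no slot covers this gfn: punt to load/store emulation */
                if (!slot || (slot->flags & KVM_MEMSLOT_INVALID))
                        return emulate_mmio(vcpu, gpa);

                /* normal RAM: resolve the host page fault and map it */
                return map_host_page(vcpu, slot, gpa);
        }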

Thus removing a memory slot and later on adding it back is broken for
that reason on those architectures if that memory slot is used to cover
actual guest memory or anything for which the guest kernel can
potentially play mapping games (yes, potentially this can be an issue
with emulated graphics BARs if/when we start doing fancy stuff with them,
such as using the DRM with the TTM, which can swap objects in/out of
the emulated VRAM and play with mappings).

The memory slot update must either be atomic or, as you mention below,
whoever does the update must stop all vcpus before doing the update,
which sucks as well.

> > > So if we are manipulating memory slots used for guest RAM we must -not-
> > > break atomicity, since during the time the slot is gone, accesses will
> > > fall back to emulation, introducing the above race (at least on PowerPC
> > > and ARM).
> 
> You could, say, get the vcpus to a known state (which has the side effect
> of making sure that emulation is stopped), no? (Just as a mental exercise.)

Yes, you could do that.

> > Cheers,
> > Ben.
> 
> Yes. Well, Avi mentioned earlier that there are users for change of GPA
> base. But, if my understanding is correct, the code that emulates
> change of BAR in QEMU is:
> 
>         /* now do the real mapping */
>         if (r->addr != PCI_BAR_UNMAPPED) {
>             memory_region_del_subregion(r->address_space, r->memory);
>         }
>         r->addr = new_addr;
>         if (r->addr != PCI_BAR_UNMAPPED) {
>             memory_region_add_subregion_overlap(r->address_space,
>                                                 r->addr, r->memory, 1);
> 
> These translate into two KVM_SET_USER_MEMORY_REGION ioctls.
> 
> "> Without taking into consideration backwards compatibility, userspace 
>  > can first delete the slot and later create a new one.
> 
>  Current qemu will in fact do that.  Not sure about older ones.
> "
> 
> Avi, where does it do that?
> 
Cheers,
Ben.


Avi Kivity Aug. 19, 2012, 9:39 a.m. UTC | #17
On 08/17/2012 09:39 PM, Marcelo Tosatti wrote:
> 
> Yes. Well, Avi mentioned earlier that there are users for change of GPA
> base. But, if my understanding is correct, the code that emulates
> change of BAR in QEMU is:
> 
>         /* now do the real mapping */
>         if (r->addr != PCI_BAR_UNMAPPED) {
>             memory_region_del_subregion(r->address_space, r->memory);
>         }
>         r->addr = new_addr;
>         if (r->addr != PCI_BAR_UNMAPPED) {
>             memory_region_add_subregion_overlap(r->address_space,
>                                                 r->addr, r->memory, 1);
> 
> These translate into two KVM_SET_USER_MEMORY_REGION ioctls.

Not directly.  These functions change a qemu-internal memory map, which
is then transferred to kvm.  Those two calls might be in a transaction
(they aren't now), in which case the memory map update is atomic.

So indeed we issue two ioctls now, but that's a side effect of the
implementation, not related to those two calls being separate.
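
As a sketch, making the two calls part of one transaction would look
roughly like this (memory_region_transaction_begin()/commit() are
QEMU's existing primitives; note this only makes the qemu-internal map
update atomic, the kernel-visible update still needs kernel support):

        memory_region_transaction_begin();
        if (r->addr != PCI_BAR_UNMAPPED) {
            memory_region_del_subregion(r->address_space, r->memory);
        }
        r->addr = new_addr;
        if (r->addr != PCI_BAR_UNMAPPED) {
            memory_region_add_subregion_overlap(r->address_space,
                                                r->addr, r->memory, 1);
        }
        memory_region_transaction_commit();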

> 
> "> Without taking into consideration backwards compatibility, userspace 
>  > can first delete the slot and later create a new one.
> 
>  Current qemu will in fact do that.  Not sure about older ones.
> "
> 
> Avi, where does it do that?

By "that" I meant first deleting the first slot and then creating a new one.
Marcelo Tosatti Aug. 23, 2012, 1:55 p.m. UTC | #18
On Sat, Aug 18, 2012 at 06:32:25AM +1000, Benjamin Herrenschmidt wrote:
> On Fri, 2012-08-17 at 15:39 -0300, Marcelo Tosatti wrote:
> > On Fri, Aug 17, 2012 at 05:06:18PM +1000, Benjamin Herrenschmidt wrote:
> > > On Wed, 2012-08-15 at 14:59 -0300, Marcelo Tosatti wrote:
> > > > 
> > > > The guest should not expect memory accesses to an address
> > > > to behave sanely while changing a BAR anyway.
> > > > 
> > > > Yes, compatibility for change of GPA base can be done in the
> > > > kernel. I can look into it next week if nobody has done so at
> > > > that point. 
> > > 
> > > There's one thing to be extra careful about here: if we start
> > > doing that for normal memory (in case we start breaking it up
> > > into slots, for NUMA setups etc.).
> > > 
> > > The problem is that we must not allow normal memory accesses to be
> > > handled via the "emulation" code (i.e. MMIO emulation or load/store
> > > emulation, whatever we call it).
> > > 
> > > Part of the issue is that on architectures that don't use IPIs for
> > > TLB invalidations but instead use some form of HW broadcast, such as
> > > PowerPC or ARM, there is an inherent race: the emulation code can
> > > keep using a guest physical address (and perform the relevant access
> > > to the corresponding memory region) well beyond the point where the
> > > guest virtual->physical translation leading to that address has been
> > > invalidated.
> > > 
> > > This doesn't happen on x86 because essentially the completion of the
> > > invalidation IPI has to wait for all VCPUs to "respond" and thus to
> > > finish whatever emulation they are doing. This is not the case on archs
> > > with a HW invalidate broadcast.
> > > 
> > > This is a nasty race, and while we more or less decided that it was
> > > survivable as long as we only go through emulation for devices (as we
> > > don't play swapping games with them in the guest kernel), the minute we
> > > allow normal guest memory accesses to "slip through", we have broken the
> > > guest virtual memory model.
> > 
> > This emulation is in hardware, yes? It is the lack of a TLB entry (or
> > the lack of a valid pagetable to fill the TLB) that triggers it?
> 
> What do you mean? I'm talking about KVM emulating load and store
> instructions (either in the kernel or passing them down to qemu).
> 
> This happens whenever an access triggers a host page fault which we
> can't resolve because there is no memory slot. In that case, the access
> is treated as "emulation".
> 
> Thus removing a memory slot and later on adding it back is broken for
> that reason on those architectures if that memory slot is used to cover
> actual guest memory or anything for which the guest kernel can
> potentially play mapping games (yes, potentially this can be an issue
> with emulated graphics BARs if/when we start doing fancy stuff with them,
> such as using the DRM with the TTM, which can swap objects in/out of
> the emulated VRAM and play with mappings).
> 
> The memory slot update must either be atomic or, as you mention below,
> whoever does the update must stop all vcpus before doing the update,
> which sucks as well.

There are a number of options to consider:

1) Merge the current patchset from Paul, which has two downsides:
	1-1) It contains an unfixable race.
	1-2) It splits the RCU/invalidation steps between generic code
             and subarch code, setting a precedent for other
             arches to do the same.
You'd still have to implement kvm_arch_flush_shadow to support proper 
deletion of memslots (without races there), for example.

2) Disallow GPA base change, require userspace to stop vcpus if it 
needs atomicity while changing the GPA base of a slot (and then 
introduce the two new callbacks as discussed in this thread).

3) Introduce a mechanism in the kernel to serialize guest access (such as
a mutex) while memory slot updates are performed, thus retaining atomicity.

It appears to me that, given the relative rarity of GPA base changes
(as compared to vmentries/vmexits, say), 2) is preferable. But I might
be wrong.

What do you prefer?
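
For reference, option 2) seen from the QEMU side would be roughly the
sketch below (pause_all_vcpus()/resume_all_vcpus() are existing QEMU
helpers; update_slots_for_bar_move() is an illustrative placeholder
for the needed KVM_SET_USER_MEMORY_REGION calls):

        pause_all_vcpus();              /* no vcpu can be mid-emulation */
        update_slots_for_bar_move();    /* delete + re-create the slot */
        resume_all_vcpus();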

> > > So if we are manipulating memory slots used for guest RAM we must -not-
> > > break atomicity, since during the time the slot is gone, accesses will
> > > fall back to emulation, introducing the above race (at least on PowerPC
> > > and ARM).
> > 
> > You could, say, get the vcpus to a known state (which has the side effect
> > of making sure that emulation is stopped), no? (Just as a mental exercise.)
> 
> Yes, you could do that.
> 
> > > Cheers,
> > > Ben.
> > 
> > Yes. Well, Avi mentioned earlier that there are users for change of GPA
> > base. But, if my understanding is correct, the code that emulates
> > change of BAR in QEMU is:
> > 
> >         /* now do the real mapping */
> >         if (r->addr != PCI_BAR_UNMAPPED) {
> >             memory_region_del_subregion(r->address_space, r->memory);
> >         }
> >         r->addr = new_addr;
> >         if (r->addr != PCI_BAR_UNMAPPED) {
> >             memory_region_add_subregion_overlap(r->address_space,
> >                                                 r->addr, r->memory, 1);
> > 
> > These translate into two KVM_SET_USER_MEMORY_REGION ioctls.
> > 
> > "> Without taking into consideration backwards compatibility, userspace 
> >  > can first delete the slot and later create a new one.
> > 
> >  Current qemu will in fact do that.  Not sure about older ones.
> > "
> > 
> > Avi, where does it do that?
> > 
> Cheers,
> Ben.
> 
> 
Paul Mackerras Aug. 24, 2012, 9:29 a.m. UTC | #19
On Thu, Aug 23, 2012 at 10:55:37AM -0300, Marcelo Tosatti wrote:

> There are a number of options to consider:
> 
> 1) Merge the current patchset from Paul, which has two downsides:
> 	1-1) It contains an unfixable race.

The race being that new HPTEs using the old base address could get
inserted after we flush the old HPTEs and before we update the memslot
array?

I think we could solve that one by temporarily marking the memslot
invalid for changes of base_gfn as well as for memslot deletions.  Can
you see any problem with that?  It means that guest accesses to the
old memslot addresses would trap or fail, but if the guest is trying
to access a device while its BAR is being changed, then I think it
deserves that.

Paul.
Marcelo Tosatti Aug. 24, 2012, 6:58 p.m. UTC | #20
On Fri, Aug 24, 2012 at 07:29:53PM +1000, Paul Mackerras wrote:
> On Thu, Aug 23, 2012 at 10:55:37AM -0300, Marcelo Tosatti wrote:
> 
> > There are a number of options to consider:
> > 
> > 1) Merge the current patchset from Paul, which has two downsides:
> > 	1-1) It contains an unfixable race.
> 
> The race being that new HPTEs using the old base address could get
> inserted after we flush the old HPTEs and before we update the memslot
> array?

Yes. That race, for the slot deletion case, is fixed by the RCU
assignment of a memslot marked KVM_MEMSLOT_INVALID.

For the base change case, x86 flushes all translations via
kvm_arch_flush_shadow() at the end of __kvm_set_memory_region().

Which is not enough for PPC, since you need to flush _before_ the new
memslot is visible.
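
Spelled out, the ordering PPC needs for a base change is essentially
the existing deletion sequence applied before the new slot is
published. A heavily simplified sketch (the real code copies and
RCU-assigns a whole memslots array; names as in kvm_main.c of this
period, glue code illustrative):

        struct kvm_memory_slot invalid = old;

        invalid.flags |= KVM_MEMSLOT_INVALID;
        update_memslots(slots, &invalid);       /* 1: hide the old slot */
        synchronize_srcu_expedited(&kvm->srcu); /* 2: wait for readers */
        kvm_arch_flush_shadow(kvm);             /* 3: drop stale HPTEs */
        update_memslots(slots, &new);           /* 4: publish new base_gfn */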

> I think we could solve that one by temporarily marking the memslot
> invalid for changes of base_gfn as well as for memslot deletions.  Can
> you see any problem with that?  It means that guest accesses to the
> old memslot addresses would trap or fail, but if the guest is trying
> to access a device while its BAR is being changed, then I think it
> deserves that.

No, I don't see any problem with that. Sent a patchset.


Patch

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0124937..044c921 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -140,11 +140,15 @@  extern void kvm_release_hpt(struct kvmppc_linear_info *li);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+                                struct kvm_memory_slot *memslot,
+                                struct kvm_memory_slot *old,
 				struct kvm_userspace_memory_region *mem);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
 extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
 				      struct kvm_ppc_smmu_info *info);
+extern void kvmppc_unmap_memslot(struct kvm *kvm,
+				 struct kvm_memory_slot *memslot);
 
 extern int kvmppc_bookehv_init(void);
 extern void kvmppc_bookehv_exit(void);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 3c635c0..87735a7 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -850,7 +850,8 @@  static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		psize = hpte_page_size(hptep[0], ptel);
 		if ((hptep[0] & HPTE_V_VALID) &&
 		    hpte_rpn(ptel, psize) == gfn) {
-			hptep[0] |= HPTE_V_ABSENT;
+			if (kvm->arch.using_mmu_notifiers)
+				hptep[0] |= HPTE_V_ABSENT;
 			kvmppc_invalidate_hpte(kvm, hptep, i);
 			/* Harvest R and C */
 			rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
@@ -877,6 +878,28 @@  int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
 	return 0;
 }
 
+void kvmppc_unmap_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+	unsigned long *rmapp;
+	unsigned long gfn;
+	unsigned long n;
+
+	rmapp = memslot->rmap;
+	gfn = memslot->base_gfn;
+	for (n = memslot->npages; n; --n) {
+		/*
+		 * Testing the present bit without locking is OK because
+		 * the memslot has been marked invalid already, and hence
+		 * no new HPTEs referencing this page can be created,
+		 * thus the present bit can't go from 0 to 1.
+		 */
+		if (*rmapp & KVMPPC_RMAP_PRESENT)
+			kvm_unmap_rmapp(kvm, rmapp, gfn);
+		++rmapp;
+		++gfn;
+	}
+}
+
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			 unsigned long gfn)
 {
@@ -1039,7 +1062,7 @@  long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 	rmapp = memslot->rmap;
 	map = memslot->dirty_bitmap;
 	for (i = 0; i < memslot->npages; ++i) {
-		if (kvm_test_clear_dirty(kvm, rmapp))
+		if (kvm_test_clear_dirty(kvm, rmapp) && map)
 			__set_bit_le(i, map);
 		++rmapp;
 	}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 83e929e..aad20ca0 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1299,26 +1299,6 @@  static unsigned long slb_pgsize_encoding(unsigned long psize)
 	return senc;
 }
 
-int kvmppc_core_prepare_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem)
-{
-	unsigned long npages;
-	unsigned long *phys;
-
-	/* Allocate a slot_phys array */
-	phys = kvm->arch.slot_phys[mem->slot];
-	if (!kvm->arch.using_mmu_notifiers && !phys) {
-		npages = mem->memory_size >> PAGE_SHIFT;
-		phys = vzalloc(npages * sizeof(unsigned long));
-		if (!phys)
-			return -ENOMEM;
-		kvm->arch.slot_phys[mem->slot] = phys;
-		kvm->arch.slot_npages[mem->slot] = npages;
-	}
-
-	return 0;
-}
-
 static void unpin_slot(struct kvm *kvm, int slot_id)
 {
 	unsigned long *physp;
@@ -1343,6 +1323,47 @@  static void unpin_slot(struct kvm *kvm, int slot_id)
 	}
 }
 
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot,
+				      struct kvm_memory_slot *old,
+				      struct kvm_userspace_memory_region *mem)
+{
+	unsigned long npages;
+	unsigned long *phys;
+
+	/* Are we creating, deleting, or modifying the slot? */
+	if (!memslot->npages) {
+		/* deleting */
+		kvmppc_unmap_memslot(kvm, old);
+		if (!kvm->arch.using_mmu_notifiers)
+			unpin_slot(kvm, mem->slot);
+		return 0;
+	}
+
+	if (old->npages) {
+		/* modifying guest_phys or flags */
+		if (old->base_gfn != memslot->base_gfn)
+			kvmppc_unmap_memslot(kvm, old);
+		if (memslot->dirty_bitmap &&
+		    old->dirty_bitmap != memslot->dirty_bitmap)
+			kvmppc_hv_get_dirty_log(kvm, old);
+		return 0;
+	}
+
+	/* Creating - allocate a slot_phys array if needed */
+	phys = kvm->arch.slot_phys[mem->slot];
+	if (!kvm->arch.using_mmu_notifiers && !phys) {
+		npages = mem->memory_size >> PAGE_SHIFT;
+		phys = vzalloc(npages * sizeof(unsigned long));
+		if (!phys)
+			return -ENOMEM;
+		kvm->arch.slot_phys[mem->slot] = phys;
+		kvm->arch.slot_npages[mem->slot] = npages;
+	}
+
+	return 0;
+}
+
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 5c70d19..f48d7e6 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -81,7 +81,7 @@  static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 	ptel = rev->guest_rpte |= rcbits;
 	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
 	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
-	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+	if (!memslot)
 		return;
 
 	rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]);
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index a1baec3..c5e0062 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1184,6 +1184,8 @@  int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
 #endif /* CONFIG_PPC64 */
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot,
+				      struct kvm_memory_slot *old,
 				      struct kvm_userspace_memory_region *mem)
 {
 	return 0;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097..f037c64 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1254,6 +1254,8 @@  int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 }
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot,
+				      struct kvm_memory_slot *old,
 				      struct kvm_userspace_memory_region *mem)
 {
 	return 0;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 87f4dc8..e84ba15 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -315,7 +315,7 @@  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                    struct kvm_userspace_memory_region *mem,
                                    int user_alloc)
 {
-	return kvmppc_core_prepare_memory_region(kvm, mem);
+	return kvmppc_core_prepare_memory_region(kvm, memslot, &old, mem);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,