Patchwork [6/8] KVM: PPC: E500: Implement MMU notifiers

login
register
mail settings
Submitter Alexander Graf
Date Aug. 7, 2012, 10:57 a.m.
Message ID <1344337036-22244-7-git-send-email-agraf@suse.de>
Download mbox | patch
Permalink /patch/175589/
State New
Headers show

Comments

Alexander Graf - Aug. 7, 2012, 10:57 a.m.
The e500 target has lived without mmu notifiers ever since it got
introduced, but fails for the user space check on them with hugetlbfs.

So in order to get that one working, implement mmu notifiers in a
reasonably dumb fashion and be happy. On embedded hardware, we almost
never end up with mmu notifier calls, since most people don't overcommit.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_host.h |    3 +-
 arch/powerpc/include/asm/kvm_ppc.h  |    1 +
 arch/powerpc/kvm/Kconfig            |    2 +
 arch/powerpc/kvm/booke.c            |   23 +++++++++++++++
 arch/powerpc/kvm/e500_tlb.c         |   52 +++++++++++++++++++++++++++++++++++
 5 files changed, 80 insertions(+), 1 deletions(-)
Avi Kivity - Aug. 7, 2012, 1:30 p.m.
On 08/07/2012 01:57 PM, Alexander Graf wrote:
> The e500 target has lived without mmu notifiers ever since it got
> introduced, but fails for the user space check on them with hugetlbfs.
> 
> So in order to get that one working, implement mmu notifiers in a
> reasonably dumb fashion and be happy. On embedded hardware, we almost
> never end up with mmu notifier calls, since most people don't overcommit.
> 
>  
> +static void kvmppc_check_requests(struct kvm_vcpu *vcpu)
> +{
> +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
> +	if (vcpu->requests)
> +		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
> +			kvmppc_core_flush_tlb(vcpu);
> +#endif
> +}
> +
>  /*
>   * Common checks before entering the guest world.  Call with interrupts
>   * disabled.
> @@ -485,12 +494,24 @@ static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
>  			break;
>  		}
>  
> +		smp_mb();
> +		kvmppc_check_requests(vcpu);
> +

On x86 we do the requests processing while in normal preemptible
context, then do an additional check for requests != 0 during guest
entry.  This allows us to do sleepy things in request processing, and
reduces the amount of work we do with interrupts disabled.

>  		if (kvmppc_core_prepare_to_enter(vcpu)) {
>  			/* interrupts got enabled in between, so we
>  			   are back at square 1 */
>  			continue;
>  		}
>  
> +		if (vcpu->mode == EXITING_GUEST_MODE) {
> +			r = 1;
> +			break;
> +		}
> +
> +		/* Going into guest context! Yay! */
> +		vcpu->mode = IN_GUEST_MODE;
> +		smp_wmb();
> +
>  		break;
>  	}
>  
> @@ -560,6 +581,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
>  #endif
>  
>  	kvm_guest_exit();
> +	vcpu->mode = OUTSIDE_GUEST_MODE;
> +	smp_wmb();
>  
> +/************* MMU Notifiers *************/
> +
> +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
> +{
> +	/* Is this a guest page? */
> +	if (!hva_to_memslot(kvm, hva))
> +		return 0;
> +
> +	/*
> +	 * Flush all shadow tlb entries everywhere. This is slow, but
> +	 * we are 100% sure that we catch the to be unmapped page
> +	 */
> +	kvm_flush_remote_tlbs(kvm);

Wow.

> +
> +	return 0;
> +}
> +

Where do you drop the reference count when installing a page in a shadow
tlb entry?
Alexander Graf - Aug. 7, 2012, 1:52 p.m.
On 07.08.2012, at 15:30, Avi Kivity <avi@redhat.com> wrote:

> On 08/07/2012 01:57 PM, Alexander Graf wrote:
>> The e500 target has lived without mmu notifiers ever since it got
>> introduced, but fails for the user space check on them with hugetlbfs.
>> 
>> So in order to get that one working, implement mmu notifiers in a
>> reasonably dumb fashion and be happy. On embedded hardware, we almost
>> never end up with mmu notifier calls, since most people don't overcommit.
>> 
>> 
>> +static void kvmppc_check_requests(struct kvm_vcpu *vcpu)
>> +{
>> +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
>> +    if (vcpu->requests)
>> +        if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
>> +            kvmppc_core_flush_tlb(vcpu);
>> +#endif
>> +}
>> +
>> /*
>>  * Common checks before entering the guest world.  Call with interrupts
>>  * disabled.
>> @@ -485,12 +494,24 @@ static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
>>            break;
>>        }
>> 
>> +        smp_mb();
>> +        kvmppc_check_requests(vcpu);
>> +
> 
> On x86 we do the requests processing while in normal preemptible
> context, then do an additional check for requests != 0 during guest
> entry.  This allows us to do sleepy things in request processing, and
> reduces the amount of work we do with interrupts disabled.

Hrm. We could do the same I guess. Let me give it a try.

> 
>>        if (kvmppc_core_prepare_to_enter(vcpu)) {
>>            /* interrupts got enabled in between, so we
>>               are back at square 1 */
>>            continue;
>>        }
>> 
>> +        if (vcpu->mode == EXITING_GUEST_MODE) {
>> +            r = 1;
>> +            break;
>> +        }
>> +
>> +        /* Going into guest context! Yay! */
>> +        vcpu->mode = IN_GUEST_MODE;
>> +        smp_wmb();
>> +
>>        break;
>>    }
>> 
>> @@ -560,6 +581,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
>> #endif
>> 
>>    kvm_guest_exit();
>> +    vcpu->mode = OUTSIDE_GUEST_MODE;
>> +    smp_wmb();
>> 
>> +/************* MMU Notifiers *************/
>> +
>> +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
>> +{
>> +    /* Is this a guest page? */
>> +    if (!hva_to_memslot(kvm, hva))
>> +        return 0;
>> +
>> +    /*
>> +     * Flush all shadow tlb entries everywhere. This is slow, but
>> +     * we are 100% sure that we catch the to be unmapped page
>> +     */
>> +    kvm_flush_remote_tlbs(kvm);
> 
> Wow.

Yeah, cool, eh? It sounds worse than it is. Usually when we need to page out, we're under memory pressure. So we would get called multiple times to unmap different pages. If we just drop all shadow tlb entries, we also freed a lot of memory that can now be paged out without callbacks.

> 
>> +
>> +    return 0;
>> +}
>> +
> 
> Where do you drop the reference count when installing a page in a shadow
> tlb entry?

Which reference count? Essentially the remote tlb flush calls kvmppc_e500_prov_release() on all currently mapped shadow tlb entries. Are we missing out on something more?


Alex

> 
> 
> -- 
> error compiling committee.c: too many arguments to function
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Avi Kivity - Aug. 7, 2012, 2:14 p.m.
On 08/07/2012 04:52 PM, Alexander Graf wrote:
>>> 
>>> +/************* MMU Notifiers *************/
>>> +
>>> +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
>>> +{
>>> +    /* Is this a guest page? */
>>> +    if (!hva_to_memslot(kvm, hva))
>>> +        return 0;
>>> +
>>> +    /*
>>> +     * Flush all shadow tlb entries everywhere. This is slow, but
>>> +     * we are 100% sure that we catch the to be unmapped page
>>> +     */
>>> +    kvm_flush_remote_tlbs(kvm);
>> 
>> Wow.
> 
> Yeah, cool, eh? It sounds worse than it is. Usually when we need to page out, we're under memory pressure. So we would get called multiple times to unmap different pages. If we just drop all shadow tlb entries, we also freed a lot of memory that can now be paged out without callbacks.

And it's just a shadow tlb yes?  So there's a limited amount of stuff
there.  But it'd be hell on x86.

> 
>> 
>>> +
>>> +    return 0;
>>> +}
>>> +
>> 
>> Where do you drop the reference count when installing a page in a shadow
>> tlb entry?
> 
> Which reference count? Essentially the remote tlb flush calls kvmppc_e500_prov_release() on all currently mapped shadow tlb entries. Are we missing out on something more?
> 

With mmu notifiers mapped pages are kept without elevated reference
counts; the mmu notifier protects them, not the refcount.  This allows
core mm code that looks at refcounts to work.
Alexander Graf - Aug. 7, 2012, 2:24 p.m.
On 07.08.2012, at 16:14, Avi Kivity <avi@redhat.com> wrote:

> On 08/07/2012 04:52 PM, Alexander Graf wrote:
>>>> 
>>>> +/************* MMU Notifiers *************/
>>>> +
>>>> +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
>>>> +{
>>>> +    /* Is this a guest page? */
>>>> +    if (!hva_to_memslot(kvm, hva))
>>>> +        return 0;
>>>> +
>>>> +    /*
>>>> +     * Flush all shadow tlb entries everywhere. This is slow, but
>>>> +     * we are 100% sure that we catch the to be unmapped page
>>>> +     */
>>>> +    kvm_flush_remote_tlbs(kvm);
>>> 
>>> Wow.
>> 
>> Yeah, cool, eh? It sounds worse than it is. Usually when we need to page out, we're under memory pressure. So we would get called multiple times to unmap different pages. If we just drop all shadow tlb entries, we also freed a lot of memory that can now be paged out without callbacks.
> 
> And it's just a shadow tlb yes?  So there's a limited amount of stuff
> there.  But it'd be hell on x86.
> 
>> 
>>> 
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>> 
>>> Where do you drop the reference count when installing a page in a shadow
>>> tlb entry?
>> 
>> Which reference count? Essentially the remote tlb flush calls kvmppc_e500_prov_release() on all currently mapped shadow tlb entries. Are we missing out on something more?
>> 
> 
> With mmu notifiers mapped pages are kept without elevated reference
> counts; the mmu notifier protects them, not the refcount.  This allows
> core mm code that looks at refcounts to work.

Hrm. I wonder why it works then. We only drop the refcount after we get an mmu notifier callback. Maybe we get a callback on an unmapped page, but then happen to clear all the shadow entries as well, hence unpinning them along the way?

That explains why it works, but sure isn't exactly working as intended. Thanks for the hint!


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras - Aug. 8, 2012, 3:31 a.m.
On Tue, Aug 07, 2012 at 12:57:14PM +0200, Alexander Graf wrote:
> The e500 target has lived without mmu notifiers ever since it got
> introduced, but fails for the user space check on them with hugetlbfs.

Ironically that user space check isn't necessary any more since David
Gibson's fix for the hugetlbfs bug went in (90481622, "hugepages: fix
use after free bug in "quota" handling").  So on sufficiently recent
kernels you can just remove the userspace check.  Implementing
mmu-notifiers is a good thing for other reasons though.

Paul.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf - Aug. 8, 2012, 8:03 a.m.
On 08.08.2012, at 05:31, Paul Mackerras <paulus@samba.org> wrote:

> On Tue, Aug 07, 2012 at 12:57:14PM +0200, Alexander Graf wrote:
>> The e500 target has lived without mmu notifiers ever since it got
>> introduced, but fails for the user space check on them with hugetlbfs.
> 
> Ironically that user space check isn't necessary any more since David
> Gibson's fix for the hugetlbfs bug went in (90481622, "hugepages: fix
> use after free bug in "quota" handling").  So on sufficiently recent
> kernels you can just remove the userspace check.  Implementing
> mmu-notifiers is a good thing for other reasons though.

Yeah, it's probably best to just require mmu notifiers from every target. It would nevertheless be good to have a CAP (or change the semantics of SYNC_MMU) so that we can run HV KVM on 970 without changes to QEMU.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 572ad01..ed75bc9 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -45,7 +45,8 @@ 
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
 
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#if defined(CONFIG_KVM_BOOK3S_64_HV) || defined(CONFIG_KVM_E500V2) || \
+    defined(CONFIG_KVM_E500MC)
 #include <linux/mmu_notifier.h>
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0124937..c38e824 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -104,6 +104,7 @@  extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                        struct kvm_interrupt *irq);
 extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
                                          struct kvm_interrupt *irq);
+extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                   unsigned int op, int *advance);
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index f4dacb9..40cad8c 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -123,6 +123,7 @@  config KVM_E500V2
 	depends on EXPERIMENTAL && E500 && !PPC_E500MC
 	select KVM
 	select KVM_MMIO
+	select MMU_NOTIFIER
 	---help---
 	  Support running unmodified E500 guest kernels in virtual machines on
 	  E500v2 host processors.
@@ -138,6 +139,7 @@  config KVM_E500MC
 	select KVM
 	select KVM_MMIO
 	select KVM_BOOKE_HV
+	select MMU_NOTIFIER
 	---help---
 	  Support running unmodified E500MC/E5500 (32-bit) guest kernels in
 	  virtual machines on E500MC/E5500 host processors.
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 1d4ce9a..e794c3c 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -461,6 +461,15 @@  int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+static void kvmppc_check_requests(struct kvm_vcpu *vcpu)
+{
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
+	if (vcpu->requests)
+		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+			kvmppc_core_flush_tlb(vcpu);
+#endif
+}
+
 /*
  * Common checks before entering the guest world.  Call with interrupts
  * disabled.
@@ -485,12 +494,24 @@  static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
 			break;
 		}
 
+		smp_mb();
+		kvmppc_check_requests(vcpu);
+
 		if (kvmppc_core_prepare_to_enter(vcpu)) {
 			/* interrupts got enabled in between, so we
 			   are back at square 1 */
 			continue;
 		}
 
+		if (vcpu->mode == EXITING_GUEST_MODE) {
+			r = 1;
+			break;
+		}
+
+		/* Going into guest context! Yay! */
+		vcpu->mode = IN_GUEST_MODE;
+		smp_wmb();
+
 		break;
 	}
 
@@ -560,6 +581,8 @@  int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 #endif
 
 	kvm_guest_exit();
+	vcpu->mode = OUTSIDE_GUEST_MODE;
+	smp_wmb();
 
 out:
 	local_irq_enable();
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index d26e705..3f78756 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -357,6 +357,13 @@  static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
 	clear_tlb_privs(vcpu_e500);
 }
 
+void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	clear_tlb_refs(vcpu_e500);
+	clear_tlb1_bitmap(vcpu_e500);
+}
+
 static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 		unsigned int eaddr, int as)
 {
@@ -1062,6 +1069,51 @@  void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 	write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
 }
 
+/************* MMU Notifiers *************/
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	/* Is this a guest page? */
+	if (!hva_to_memslot(kvm, hva))
+		return 0;
+
+	/*
+	 * Flush all shadow tlb entries everywhere. This is slow, but
+	 * we are 100% sure that we catch the to be unmapped page
+	 */
+	kvm_flush_remote_tlbs(kvm);
+
+	return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	/* kvm_unmap_hva flushes everything anyways */
+	kvm_unmap_hva(kvm, start);
+
+	return 0;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	/* XXX could be more clever ;) */
+	return 0;
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	/* XXX could be more clever ;) */
+	return 0;
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	/* The page will get remapped properly on its next fault */
+	kvm_unmap_hva(kvm, hva);
+}
+
+/*****************************************/
+
 static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	int i;