Patchwork [11/27] Add book3s_64 Host MMU handling

login
register
mail settings
Submitter Alexander Graf
Date Oct. 30, 2009, 3:47 p.m.
Message ID <1256917647-6200-12-git-send-email-agraf@suse.de>
Download mbox | patch
Permalink /patch/37318/
State Accepted, archived
Commit 0d8dc681c84aa88ddc3d2fe5b6029f8eb3d11ecf
Delegated to: Benjamin Herrenschmidt
Headers show

Comments

Alexander Graf - Oct. 30, 2009, 3:47 p.m.
We designed the Book3S port of KVM as modular as possible. Most
of the code could be easily used on a Book3S_32 host as well.

The main difference between 32 and 64 bit cores is the MMU. To keep
things well separated, we treat the book3s_64 MMU as one possible compile
option.

This patch adds all the MMU helpers the rest of the code needs in
order to modify the host's MMU, like setting PTEs and segments.

Signed-off-by: Alexander Graf <agraf@suse.de>

---

v5 -> v6

  - don't take mmap_sem
  - dprintk instead if scattered #ifdef's
  - minor cleanups
  - // -> /* */ (book3s_64_mmu_host.c)
---
 arch/powerpc/kvm/book3s_64_mmu_host.c |  408 +++++++++++++++++++++++++++++++++
 1 files changed, 408 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_64_mmu_host.c
Michael Neuling - Nov. 1, 2009, 11:39 p.m.
<snip>
> +static void invalidate_pte(struct hpte_cache *pte)
> +{
> +	dprintk_mmu("KVM: Flushing SPT %d: 0x%llx (0x%llx) -> 0x%llx\n",
> +		    i, pte->pte.eaddr, pte->pte.vpage, pte->host_va);
> +
> +	ppc_md.hpte_invalidate(pte->slot, pte->host_va,
> +			       MMU_PAGE_4K, MMU_SEGSIZE_256M,
> +			       false);

Are we assuming 256M segments here (and elsewhere)?

<snip>
> +static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
> +{
> +	int i;
> +	int max_slb_size = 64;
> +	int found_inval = -1;
> +	int r;
> +
> +	if (!get_paca()->kvm_slb_max)
> +		get_paca()->kvm_slb_max = 1;
> +
> +	/* Are we overwriting? */
> +	for (i = 1; i < get_paca()->kvm_slb_max; i++) {
> +		if (!(get_paca()->kvm_slb[i].esid & SLB_ESID_V))
> +			found_inval = i;
> +		else if ((get_paca()->kvm_slb[i].esid & ESID_MASK) == esid)
> +			return i;
> +	}
> +
> +	/* Found a spare entry that was invalidated before */
> +	if (found_inval > 0)
> +		return found_inval;
> +
> +	/* No spare invalid entry, so create one */
> +
> +	if (mmu_slb_size < 64)
> +		max_slb_size = mmu_slb_size;

Can we just use the global mmu_slb_size eliminate max_slb_size?

<snip>

Mikey
Alexander Graf - Nov. 2, 2009, 9:26 a.m.
Am 02.11.2009 um 00:39 schrieb Michael Neuling <mikey@neuling.org>:

> <snip>
>> +static void invalidate_pte(struct hpte_cache *pte)
>> +{
>> +    dprintk_mmu("KVM: Flushing SPT %d: 0x%llx (0x%llx) -> 0x%llx\n",
>> +            i, pte->pte.eaddr, pte->pte.vpage, pte->host_va);
>> +
>> +    ppc_md.hpte_invalidate(pte->slot, pte->host_va,
>> +                   MMU_PAGE_4K, MMU_SEGSIZE_256M,
>> +                   false);
>
> Are we assuming 256M segments here (and elsewhere)?

Yes, on the host we only create 256MB segments. What the guest uses is  
a different question.

>
> <snip>
>> +static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong  
>> esid)
>> +{
>> +    int i;
>> +    int max_slb_size = 64;
>> +    int found_inval = -1;
>> +    int r;
>> +
>> +    if (!get_paca()->kvm_slb_max)
>> +        get_paca()->kvm_slb_max = 1;
>> +
>> +    /* Are we overwriting? */
>> +    for (i = 1; i < get_paca()->kvm_slb_max; i++) {
>> +        if (!(get_paca()->kvm_slb[i].esid & SLB_ESID_V))
>> +            found_inval = i;
>> +        else if ((get_paca()->kvm_slb[i].esid & ESID_MASK) == esid)
>> +            return i;
>> +    }
>> +
>> +    /* Found a spare entry that was invalidated before */
>> +    if (found_inval > 0)
>> +        return found_inval;
>> +
>> +    /* No spare invalid entry, so create one */
>> +
>> +    if (mmu_slb_size < 64)
>> +        max_slb_size = mmu_slb_size;
>
> Can we just use the global mmu_slb_size eliminate max_slb_size?

Hm, for a strange reason I wanted to have at most 64 slb entries.  
Maybe the struct can't hold more? I'll check again as soon as I'm on a  
notebook again.

Alex

>
> <snip>
>
> Mikey
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
new file mode 100644
index 0000000..f2899b2
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -0,0 +1,408 @@ 
+/*
+ * Copyright (C) 2009 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *     Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+
+#define PTE_SIZE 12
+#define VSID_ALL 0
+
+/* #define DEBUG_MMU */
+/* #define DEBUG_SLB */
+
+#ifdef DEBUG_MMU
+#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_mmu(a, ...) do { } while(0)
+#endif
+
+#ifdef DEBUG_SLB
+#define dprintk_slb(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_slb(a, ...) do { } while(0)
+#endif
+
+static void invalidate_pte(struct hpte_cache *pte)
+{
+	dprintk_mmu("KVM: Flushing SPT %d: 0x%llx (0x%llx) -> 0x%llx\n",
+		    i, pte->pte.eaddr, pte->pte.vpage, pte->host_va);
+
+	ppc_md.hpte_invalidate(pte->slot, pte->host_va,
+			       MMU_PAGE_4K, MMU_SEGSIZE_256M,
+			       false);
+	pte->host_va = 0;
+	kvm_release_pfn_dirty(pte->pfn);
+}
+
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, u64 guest_ea, u64 ea_mask)
+{
+	int i;
+
+	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%llx & 0x%llx\n",
+		    vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
+	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
+
+	guest_ea &= ea_mask;
+	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
+		struct hpte_cache *pte;
+
+		pte = &vcpu->arch.hpte_cache[i];
+		if (!pte->host_va)
+			continue;
+
+		if ((pte->pte.eaddr & ea_mask) == guest_ea) {
+			invalidate_pte(pte);
+		}
+	}
+
+	/* Doing a complete flush -> start from scratch */
+	if (!ea_mask)
+		vcpu->arch.hpte_cache_offset = 0;
+}
+
+void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
+{
+	int i;
+
+	dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
+		    vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
+	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
+
+	guest_vp &= vp_mask;
+	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
+		struct hpte_cache *pte;
+
+		pte = &vcpu->arch.hpte_cache[i];
+		if (!pte->host_va)
+			continue;
+
+		if ((pte->pte.vpage & vp_mask) == guest_vp) {
+			invalidate_pte(pte);
+		}
+	}
+}
+
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, u64 pa_start, u64 pa_end)
+{
+	int i;
+
+	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%llx & 0x%llx\n",
+		    vcpu->arch.hpte_cache_offset, guest_pa, pa_mask);
+	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
+
+	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
+		struct hpte_cache *pte;
+
+		pte = &vcpu->arch.hpte_cache[i];
+		if (!pte->host_va)
+			continue;
+
+		if ((pte->pte.raddr >= pa_start) &&
+		    (pte->pte.raddr < pa_end)) {
+			invalidate_pte(pte);
+		}
+	}
+}
+
+struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
+{
+	int i;
+	u64 guest_vp;
+
+	guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
+	for (i=0; i<vcpu->arch.hpte_cache_offset; i++) {
+		struct hpte_cache *pte;
+
+		pte = &vcpu->arch.hpte_cache[i];
+		if (!pte->host_va)
+			continue;
+
+		if (pte->pte.vpage == guest_vp)
+			return &pte->pte;
+	}
+
+	return NULL;
+}
+
+static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
+		kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+	return vcpu->arch.hpte_cache_offset++;
+}
+
+/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
+ * a hash, so we don't waste cycles on looping */
+static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+}
+
+
+static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	struct kvmppc_sid_map *map;
+	u16 sid_map_mask;
+
+	if (vcpu->arch.msr & MSR_PR)
+		gvsid |= VSID_PR;
+
+	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+	if (map->guest_vsid == gvsid) {
+		dprintk_slb("SLB: Searching 0x%llx -> 0x%llx\n",
+			    gvsid, map->host_vsid);
+		return map;
+	}
+
+	map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask];
+	if (map->guest_vsid == gvsid) {
+		dprintk_slb("SLB: Searching 0x%llx -> 0x%llx\n",
+			    gvsid, map->host_vsid);
+		return map;
+	}
+
+	dprintk_slb("SLB: Searching 0x%llx -> not found\n", gvsid);
+	return NULL;
+}
+
+int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
+{
+	pfn_t hpaddr;
+	ulong hash, hpteg, va;
+	u64 vsid;
+	int ret;
+	int rflags = 0x192;
+	int vflags = 0;
+	int attempt = 0;
+	struct kvmppc_sid_map *map;
+
+	/* Get host physical address for gpa */
+	hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
+	if (kvm_is_error_hva(hpaddr)) {
+		printk(KERN_INFO "Couldn't get guest page for gfn %llx!\n", orig_pte->eaddr);
+		return -EINVAL;
+	}
+	hpaddr <<= PAGE_SHIFT;
+#if PAGE_SHIFT == 12
+#elif PAGE_SHIFT == 16
+	hpaddr |= orig_pte->raddr & 0xf000;
+#else
+#error Unknown page size
+#endif
+
+	/* and write the mapping ea -> hpa into the pt */
+	vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid);
+	map = find_sid_vsid(vcpu, vsid);
+	if (!map) {
+		kvmppc_mmu_map_segment(vcpu, orig_pte->eaddr);
+		map = find_sid_vsid(vcpu, vsid);
+	}
+	BUG_ON(!map);
+
+	vsid = map->host_vsid;
+	va = hpt_va(orig_pte->eaddr, vsid, MMU_SEGSIZE_256M);
+
+	if (!orig_pte->may_write)
+		rflags |= HPTE_R_PP;
+	else
+		mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
+
+	if (!orig_pte->may_execute)
+		rflags |= HPTE_R_N;
+
+	hash = hpt_hash(va, PTE_SIZE, MMU_SEGSIZE_256M);
+
+map_again:
+	hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+	/* In case we tried normal mapping already, let's nuke old entries */
+	if (attempt > 1)
+		if (ppc_md.hpte_remove(hpteg) < 0)
+			return -1;
+
+	ret = ppc_md.hpte_insert(hpteg, va, hpaddr, rflags, vflags, MMU_PAGE_4K, MMU_SEGSIZE_256M);
+
+	if (ret < 0) {
+		/* If we couldn't map a primary PTE, try a secondary */
+#ifdef USE_SECONDARY
+		hash = ~hash;
+		attempt++;
+		if (attempt % 2)
+			vflags = HPTE_V_SECONDARY;
+		else
+			vflags = 0;
+#else
+		attempt = 2;
+#endif
+		goto map_again;
+	} else {
+		int hpte_id = kvmppc_mmu_hpte_cache_next(vcpu);
+		struct hpte_cache *pte = &vcpu->arch.hpte_cache[hpte_id];
+
+		dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%lx (0x%llx) -> %lx\n",
+			    ((rflags & HPTE_R_PP) == 3) ? '-' : 'w',
+			    (rflags & HPTE_R_N) ? '-' : 'x',
+			    orig_pte->eaddr, hpteg, va, orig_pte->vpage, hpaddr);
+
+		pte->slot = hpteg + (ret & 7);
+		pte->host_va = va;
+		pte->pte = *orig_pte;
+		pte->pfn = hpaddr >> PAGE_SHIFT;
+	}
+
+	return 0;
+}
+
+static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	struct kvmppc_sid_map *map;
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	u16 sid_map_mask;
+	static int backwards_map = 0;
+
+	if (vcpu->arch.msr & MSR_PR)
+		gvsid |= VSID_PR;
+
+	/* We might get collisions that trap in preceding order, so let's
+	   map them differently */
+
+	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+	if (backwards_map)
+		sid_map_mask = SID_MAP_MASK - sid_map_mask;
+
+	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+
+	/* Make sure we're taking the other map next time */
+	backwards_map = !backwards_map;
+
+	/* Uh-oh ... out of mappings. Let's flush! */
+	if (vcpu_book3s->vsid_next == vcpu_book3s->vsid_max) {
+		vcpu_book3s->vsid_next = vcpu_book3s->vsid_first;
+		memset(vcpu_book3s->sid_map, 0,
+		       sizeof(struct kvmppc_sid_map) * SID_MAP_NUM);
+		kvmppc_mmu_pte_flush(vcpu, 0, 0);
+		kvmppc_mmu_flush_segments(vcpu);
+	}
+	map->host_vsid = vcpu_book3s->vsid_next++;
+
+	map->guest_vsid = gvsid;
+	map->valid = true;
+
+	return map;
+}
+
+static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
+{
+	int i;
+	int max_slb_size = 64;
+	int found_inval = -1;
+	int r;
+
+	if (!get_paca()->kvm_slb_max)
+		get_paca()->kvm_slb_max = 1;
+
+	/* Are we overwriting? */
+	for (i = 1; i < get_paca()->kvm_slb_max; i++) {
+		if (!(get_paca()->kvm_slb[i].esid & SLB_ESID_V))
+			found_inval = i;
+		else if ((get_paca()->kvm_slb[i].esid & ESID_MASK) == esid)
+			return i;
+	}
+
+	/* Found a spare entry that was invalidated before */
+	if (found_inval > 0)
+		return found_inval;
+
+	/* No spare invalid entry, so create one */
+
+	if (mmu_slb_size < 64)
+		max_slb_size = mmu_slb_size;
+
+	/* Overflowing -> purge */
+	if ((get_paca()->kvm_slb_max) == max_slb_size)
+		kvmppc_mmu_flush_segments(vcpu);
+
+	r = get_paca()->kvm_slb_max;
+	get_paca()->kvm_slb_max++;
+
+	return r;
+}
+
+int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
+{
+	u64 esid = eaddr >> SID_SHIFT;
+	u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V;
+	u64 slb_vsid = SLB_VSID_USER;
+	u64 gvsid;
+	int slb_index;
+	struct kvmppc_sid_map *map;
+
+	slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK);
+
+	if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
+		/* Invalidate an entry */
+		get_paca()->kvm_slb[slb_index].esid = 0;
+		return -ENOENT;
+	}
+
+	map = find_sid_vsid(vcpu, gvsid);
+	if (!map)
+		map = create_sid_map(vcpu, gvsid);
+
+	map->guest_esid = esid;
+
+	slb_vsid |= (map->host_vsid << 12);
+	slb_vsid &= ~SLB_VSID_KP;
+	slb_esid |= slb_index;
+
+	get_paca()->kvm_slb[slb_index].esid = slb_esid;
+	get_paca()->kvm_slb[slb_index].vsid = slb_vsid;
+
+	dprintk_slb("slbmte %#llx, %#llx\n", slb_vsid, slb_esid);
+
+	return 0;
+}
+
+void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
+{
+	get_paca()->kvm_slb_max = 1;
+	get_paca()->kvm_slb[0].esid = 0;
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+}