diff mbox

[1/2] KVM: PPC: Add generic hpte management functions

Message ID 1277127841-32704-1-git-send-email-agraf@suse.de (mailing list archive)
State Not Applicable
Headers show

Commit Message

Alexander Graf June 21, 2010, 1:44 p.m. UTC
Currently the shadow paging code keeps an array of entries it knows about.
Whenever the guest invalidates an entry, we loop through that array,
trying to invalidate matching parts.

While this is a really simple implementation, it is probably the most
inefficient one possible. So instead, let's keep an array of lists around
that are indexed by a hash. This way each PTE can be added by 4 list_add,
removed by 4 list_del invocations and the search only needs to loop through
entries that share the same hash.

This patch implements said lookup and exports generic functions that both
the 32-bit and 64-bit backend can use.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/book3s_mmu_hpte.c |  287 ++++++++++++++++++++++++++++++++++++
 1 files changed, 287 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_mmu_hpte.c

Comments

Avi Kivity June 22, 2010, 12:02 p.m. UTC | #1
On 06/21/2010 04:44 PM, Alexander Graf wrote:
> Currently the shadow paging code keeps an array of entries it knows about.
> Whenever the guest invalidates an entry, we loop through that entry,
> trying to invalidate matching parts.
>
> While this is a really simple implementation, it is probably the most
> ineffective one possible. So instead, let's keep an array of lists around
> that are indexed by a hash. This way each PTE can be added by 4 list_add,
> removed by 4 list_del invocations and the search only needs to loop through
> entries that share the same hash.
>
> This patch implements said lookup and exports generic functions that both
> the 32-bit and 64-bit backend can use.
>    

Mind explaining the all list in there?

>
> +
> +static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) {
> +	return hash_64(eaddr>>  PTE_SIZE, HPTEG_HASH_BITS);
> +}
> +
> +static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) {
> +	return hash_64(vpage&  0xfffffffffULL, HPTEG_HASH_BITS);
> +}
> +
> +static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) {
> +	return hash_64((vpage&  0xffffff000ULL)>>  12, HPTEG_HASH_BITS);
> +}
>    

Please use ordinary formatting for the functions above.

> +/* Flush with mask 0xffffff000 */
> +static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
> +{
> +	struct list_head *list;
> +	struct hpte_cache *pte, *tmp;
> +	u64 vp_mask = 0xffffff000ULL;
> +
> +	list =&vcpu->arch.hpte_hash_vpte_long[kvmppc_mmu_hash_vpte_long(guest_vp)];
> +
> +	/* No entries to flush */
> +	if (!list)
> +		return;
> +
> +	/* Check the list for matching entries */
> +	list_for_each_entry_safe(pte, tmp, list, list_vpte_long)
> +		/* Jump over the helper entry */
> +		if (&pte->list_vpte_long == list)
> +			continue;
> +
> +		if ((pte->pte.vpage&  vp_mask) == guest_vp)
> +			invalidate_pte(vcpu, pte);
> +}
>    

C wants brackets around blocks.
Alexander Graf June 22, 2010, 12:04 p.m. UTC | #2
Avi Kivity wrote:
> On 06/21/2010 04:44 PM, Alexander Graf wrote:
>> Currently the shadow paging code keeps an array of entries it knows
>> about.
>> Whenever the guest invalidates an entry, we loop through that entry,
>> trying to invalidate matching parts.
>>
>> While this is a really simple implementation, it is probably the most
>> ineffective one possible. So instead, let's keep an array of lists
>> around
>> that are indexed by a hash. This way each PTE can be added by 4
>> list_add,
>> removed by 4 list_del invocations and the search only needs to loop
>> through
>> entries that share the same hash.
>>
>> This patch implements said lookup and exports generic functions that
>> both
>> the 32-bit and 64-bit backend can use.
>>    
>
> Mind explaining the all list in there?

The all list is used to flush all entries when we need to get rid of all
entries, for example when we write a BAT.

>
>>
>> +
>> +static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) {
>> +    return hash_64(eaddr>>  PTE_SIZE, HPTEG_HASH_BITS);
>> +}
>> +
>> +static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) {
>> +    return hash_64(vpage&  0xfffffffffULL, HPTEG_HASH_BITS);
>> +}
>> +
>> +static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) {
>> +    return hash_64((vpage&  0xffffff000ULL)>>  12, HPTEG_HASH_BITS);
>> +}
>>    
>
> Please use ordinary formatting for the functions above.

Ouch.

>
>> +/* Flush with mask 0xffffff000 */
>> +static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64
>> guest_vp)
>> +{
>> +    struct list_head *list;
>> +    struct hpte_cache *pte, *tmp;
>> +    u64 vp_mask = 0xffffff000ULL;
>> +
>> +    list
>> =&vcpu->arch.hpte_hash_vpte_long[kvmppc_mmu_hash_vpte_long(guest_vp)];
>> +
>> +    /* No entries to flush */
>> +    if (!list)
>> +        return;
>> +
>> +    /* Check the list for matching entries */
>> +    list_for_each_entry_safe(pte, tmp, list, list_vpte_long)
>> +        /* Jump over the helper entry */
>> +        if (&pte->list_vpte_long == list)
>> +            continue;
>> +
>> +        if ((pte->pte.vpage&  vp_mask) == guest_vp)
>> +            invalidate_pte(vcpu, pte);
>> +}
>>    
>
> C wants brackets around blocks.
>


Even more ouch.


Alex
Avi Kivity June 22, 2010, 12:07 p.m. UTC | #3
On 06/22/2010 03:04 PM, Alexander Graf wrote:
> Avi Kivity wrote:
>    
>> On 06/21/2010 04:44 PM, Alexander Graf wrote:
>>      
>>> Currently the shadow paging code keeps an array of entries it knows
>>> about.
>>> Whenever the guest invalidates an entry, we loop through that entry,
>>> trying to invalidate matching parts.
>>>
>>> While this is a really simple implementation, it is probably the most
>>> ineffective one possible. So instead, let's keep an array of lists
>>> around
>>> that are indexed by a hash. This way each PTE can be added by 4
>>> list_add,
>>> removed by 4 list_del invocations and the search only needs to loop
>>> through
>>> entries that share the same hash.
>>>
>>> This patch implements said lookup and exports generic functions that
>>> both
>>> the 32-bit and 64-bit backend can use.
>>>
>>>        
>> Mind explaining the all list in there?
>>      
> The all list is used to flush all entries when we need to get rid of all
> entries, for example when we write a BAT.
>
>    

Yes, I more or less gathered that when I saw patch 2.  Does it make 
sense to avoid it by looping over all vpte lists in the vpte hash?  More 
effort for a full flush, esp. when the mmu is sparse, but less for 
individual pte operations.
Alexander Graf June 22, 2010, 12:10 p.m. UTC | #4
Avi Kivity wrote:
> On 06/22/2010 03:04 PM, Alexander Graf wrote:
>> Avi Kivity wrote:
>>   
>>> On 06/21/2010 04:44 PM, Alexander Graf wrote:
>>>     
>>>> Currently the shadow paging code keeps an array of entries it knows
>>>> about.
>>>> Whenever the guest invalidates an entry, we loop through that entry,
>>>> trying to invalidate matching parts.
>>>>
>>>> While this is a really simple implementation, it is probably the most
>>>> ineffective one possible. So instead, let's keep an array of lists
>>>> around
>>>> that are indexed by a hash. This way each PTE can be added by 4
>>>> list_add,
>>>> removed by 4 list_del invocations and the search only needs to loop
>>>> through
>>>> entries that share the same hash.
>>>>
>>>> This patch implements said lookup and exports generic functions that
>>>> both
>>>> the 32-bit and 64-bit backend can use.
>>>>
>>>>        
>>> Mind explaining the all list in there?
>>>      
>> The all list is used to flush all entries when we need to get rid of all
>> entries, for example when we write a BAT.
>>
>>    
>
> Yes, I more or less gathered that when I saw patch 2.  Does it make
> sense to avoid it by looping over all vpte lists in the vpte hash? 
> More effort for a full flush, esp. when the mmu is sparse, but less
> for individual pte operations.

Hrm. We could probably make the vpte_long list shorter. Currently all
lists are 1 << 13 entries wide. So we have 8192 lists to loop through.
For vpte_long 1 << 8 = 256 is probably enough. With that it would
probably make sense, yes.

If you have more performance hints, I'll gladly take them :).


Alex
Avi Kivity June 22, 2010, 12:12 p.m. UTC | #5
On 06/22/2010 03:10 PM, Alexander Graf wrote:
> If you have more performance hints, I'll gladly take them :).
>    

Using a cpu that virtualizes the mmu in hardware helps tremendously.
Alexander Graf June 22, 2010, 12:14 p.m. UTC | #6
Avi Kivity wrote:
> On 06/22/2010 03:10 PM, Alexander Graf wrote:
>> If you have more performance hints, I'll gladly take them :).
>>    
>
> Using a cpu that virtualizes the mmu in hardware helps tremendously.
>

PPC never does that. Even with the virtualization extensions the MMU is
still software managed. I was also more thinking of hints like
"kmem_cache_zalloc is slow" or so ;).


Alex
Avi Kivity June 22, 2010, 12:20 p.m. UTC | #7
On 06/22/2010 03:14 PM, Alexander Graf wrote:
> Avi Kivity wrote:
>    
>> On 06/22/2010 03:10 PM, Alexander Graf wrote:
>>      
>>> If you have more performance hints, I'll gladly take them :).
>>>
>>>        
>> Using a cpu that virtualizes the mmu in hardware helps tremendously.
>>
>>      
> PPC never does that. Even with the virtualization extensions the MMU is
> still software managed.

Then mmu intensive loads can expect to be slow.

> I was also more thinking of hints like
> "kmem_cache_zalloc is slow" or so ;).
>    

Stuff like that is usually worthless.  To give real feedback I need to 
understand the hardware, so I'm reduced to coding style and indentation 
review.
Benjamin Herrenschmidt June 26, 2010, 10:58 p.m. UTC | #8
On Tue, 2010-06-22 at 15:20 +0300, Avi Kivity wrote:
> On 06/22/2010 03:14 PM, Alexander Graf wrote:
> > Avi Kivity wrote:
> >    
> >> On 06/22/2010 03:10 PM, Alexander Graf wrote:
> >>      
> >>> If you have more performance hints, I'll gladly take them :).
> >>>
> >>>        
> >> Using a cpu that virtualizes the mmu in hardware helps tremendously.
> >>
> >>      
> > PPC never does that. Even with the virtualization extensions the MMU is
> > still software managed.
> 
> Then mmu intensive loads can expect to be slow.

Well, depends. ppc64 indeed requires the hash to be managed by the
hypervisor, so inserting or invalidating translations will mean a
roundtrip to the hypervisor, though there are ways at least the
insertion could be alleviated (for example, the HV could service the
hash misses directly walking the guest page tables).

But that's due in part to a design choice (whether it's a good one or
not I'm not going to argue here) which favors huge reasonably static
workloads where the hash is expected to contain all translations for
everything.

However, note that BookE (the embedded variant of the architecture) uses
a different model for virtualization, including options in its latest
variant for a HW logical->real translation (via a small dedicated TLB)
and direct access to some TLB ops from the guest.

> > I was also more thinking of hints like
> > "kmem_cache_zalloc is slow" or so ;).
> >    
> 
> Stuff like that is usually worthless.  To give real feedback I need to 
> understand the hardware, so I'm reduced to coding style and indentation 
> review.

In that case, I'd say that BAT manipulation is rare enough (mostly only
at boot time) to warrant indeed speeding up the normal PTE operations &
invalidations at the expense of the BAT change case.

Cheers,
Ben.
Avi Kivity June 27, 2010, 7:53 a.m. UTC | #9
On 06/27/2010 01:58 AM, Benjamin Herrenschmidt wrote:
>
>> Then mmu intensive loads can expect to be slow.
>>      
> Well, depends. ppc64 indeed requires the hash to be managed by the
> hypervisor, so inserting or invalidating translations will mean a
> roundtrip to the hypervisor, though there are ways at least the
> insertion could be alleviated (for example, the HV could service the
> hash misses directly walking the guest page tables).
>    

But the guest page tables are software defined, no?  That means the 
interface will break if the page table format changes.

> But that's due in part to a design choice (whether it's a good one or
> not I'm not going to argue here) which favors huge reasonably static
> workloads where the hash is expected to contain all translations for
> everything.
>    

What about when you have memory pressure?  The hash will have to reflect 
those pte_clear_flush_young(), no?

It seems horribly expensive.

> However, note that BookE (the embedded variant of the architecture) uses
> a different model for virtualization, including options in its latest
> variant for a HW logical->real translation (via a small dedicated TLB)
> and direct access to some TLB ops from the guest.
>    

I'm somewhat familiar with it, yes.
Benjamin Herrenschmidt June 27, 2010, 10:10 p.m. UTC | #10
On Sun, 2010-06-27 at 10:53 +0300, Avi Kivity wrote:
> On 06/27/2010 01:58 AM, Benjamin Herrenschmidt wrote:
> >
> >> Then mmu intensive loads can expect to be slow.
> >>      
> > Well, depends. ppc64 indeed requires the hash to be managed by the
> > hypervisor, so inserting or invalidating translations will mean a
> > roundtrip to the hypervisor, though there are ways at least the
> > insertion could be alleviated (for example, the HV could service the
> > hash misses directly walking the guest page tables).
> >    
> 
> But the guest page tables are software defined, no?  That means the 
> interface will break if the page table format changes.

Yes. Unless the hypervisor or architecture defines the format to be
used :-) IE. That's what Niagara 1 did. But we don't do that indeed
currently.

> > But that's due in part to a design choice (whether it's a good one or
> > not I'm not going to argue here) which favors huge reasonably static
> > workloads where the hash is expected to contain all translations for
> > everything.
> >    
> 
> What about when you have memory pressure?  The hash will have to reflect 
> those pte_clear_flush_young(), no?

Well, our architects would argue that the kind of workloads we target
don't have memory pressure :-)

But yes, I agree, harvesting of dirty and young bits is going to force a
hash flush which can be pretty expensive. Heh, we've been trying to
convince our own architects and designers that the MMU sucks for long
enough...

> It seems horribly expensive.
> 
> > However, note that BookE (the embedded variant of the architecture) uses
> > a different model for virtualization, including options in its latest
> > variant for a HW logical->real translation (via a small dedicated TLB)
> > and direct access to some TLB ops from the guest.
> >    
> 
> I'm somewhat familiar with it, yes.

Cheers,
Ben.
diff mbox

Patch

diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
new file mode 100644
index 0000000..8ee0f1e
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -0,0 +1,287 @@ 
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+
+#define PTE_SIZE	12
+
+/* #define DEBUG_MMU */
+/* #define DEBUG_SLB */
+
+#ifdef DEBUG_MMU
+#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_mmu(a, ...) do { } while(0)
+#endif
+
+#ifdef DEBUG_SLB
+#define dprintk_slb(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_slb(a, ...) do { } while(0)
+#endif
+
+/* Hash an effective address down to an index into the ePTE hash table. */
+static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
+{
+	u64 page = eaddr >> PTE_SIZE;
+
+	return hash_64(page, HPTEG_HASH_BITS);
+}
+
+/* Hash the low 36 bits of a virtual page number into a vPTE table index. */
+static inline u64 kvmppc_mmu_hash_vpte(u64 vpage)
+{
+	u64 key = vpage & 0xfffffffffULL;
+
+	return hash_64(key, HPTEG_HASH_BITS);
+}
+
+/* Hash bits 12..35 of a virtual page number into a vPTE_long table index. */
+static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
+{
+	u64 key = (vpage & 0xffffff000ULL) >> 12;
+
+	return hash_64(key, HPTEG_HASH_BITS);
+}
+
+/*
+ * Link a shadow PTE into every lookup structure: the ePTE, vPTE and
+ * vPTE_long hash buckets selected by its addresses, plus the all list.
+ */
+void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	u64 ea_idx = kvmppc_mmu_hash_pte(pte->pte.eaddr);
+	u64 vp_idx = kvmppc_mmu_hash_vpte(pte->pte.vpage);
+	u64 vpl_idx = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
+
+	/* Bucket keyed by effective address */
+	list_add(&pte->list_pte, &vcpu->arch.hpte_hash_pte[ea_idx]);
+
+	/* Bucket keyed by the full virtual page */
+	list_add(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[vp_idx]);
+
+	/* Bucket keyed by the virtual page without its low 12 bits */
+	list_add(&pte->list_vpte_long, &vcpu->arch.hpte_hash_vpte_long[vpl_idx]);
+
+	/* Unkeyed list of every entry, walked for full and physical flushes */
+	list_add(&pte->list_all, &vcpu->arch.hpte_all);
+}
+
+/*
+ * Tear down a single shadow PTE: invalidate the host mapping, release the
+ * backing page, unlink the entry from all four lists and free it.
+ *
+ * The pointer is invalid once this returns, so list walkers must use the
+ * _safe iteration variants.
+ */
+static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
+		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
+
+	/* Different for 32 and 64 bit */
+	kvmppc_mmu_invalidate_pte(vcpu, pte);
+
+	/* A writable mapping may have dirtied the page */
+	if (pte->pte.may_write)
+		kvm_release_pfn_dirty(pte->pfn);
+	else
+		kvm_release_pfn_clean(pte->pfn);
+
+	list_del(&pte->list_pte);
+	list_del(&pte->list_vpte);
+	list_del(&pte->list_vpte_long);
+	list_del(&pte->list_all);
+
+	/*
+	 * Balance the increment done at allocation time. Without this the
+	 * counter only ever grows, so the HPTEG_CACHE_NUM limit check would
+	 * trigger once and never again.
+	 */
+	vcpu->arch.hpte_cache_count--;
+
+	kmem_cache_free(vcpu->arch.hpte_cache, pte);
+}
+
+/* Invalidate every shadow PTE that is linked into the vcpu's all list. */
+static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
+{
+	struct hpte_cache *pte, *tmp;
+
+	/*
+	 * The iterator stops before reaching the list head, so no check for
+	 * the head is needed. The _safe variant is required because
+	 * invalidate_pte() unlinks and frees the current entry.
+	 */
+	list_for_each_entry_safe(pte, tmp, &vcpu->arch.hpte_all, list_all)
+		invalidate_pte(vcpu, pte);
+}
+
+/*
+ * Flush shadow PTEs by guest effective address.
+ *
+ * @guest_ea: effective address to flush; bits outside @ea_mask are ignored
+ * @ea_mask:  ~0xfff flushes a single page, 0x0ffff000 flushes that page
+ *            offset in each of the 16 256MB segments below 4GB, and 0
+ *            flushes everything. Any other mask triggers a warning.
+ */
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
+{
+	u64 i;
+
+	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
+		    vcpu->arch.hpte_cache_count, guest_ea, ea_mask);
+
+	/*
+	 * Entries are compared against guest_ea with the mask applied, so
+	 * strip the ignored bits here. Otherwise an address with stray low
+	 * bits would never match, and stray segment bits would defeat the
+	 * per-segment loop below.
+	 */
+	guest_ea &= ea_mask;
+
+	switch (ea_mask) {
+	case ~0xfffUL:
+	{
+		struct list_head *list;
+		struct hpte_cache *pte, *tmp;
+
+		/* Find the list of entries in the map */
+		list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
+
+		/*
+		 * Check the list for matching entries. Different pages can
+		 * share a bucket, hence the address comparison.
+		 */
+		list_for_each_entry_safe(pte, tmp, list, list_pte) {
+			/* Invalidate matching PTE */
+			if ((pte->pte.eaddr & ~0xfffULL) == guest_ea)
+				invalidate_pte(vcpu, pte);
+		}
+		break;
+	}
+	case 0x0ffff000:
+		/* 32-bit flush w/o segment, go through all possible segments */
+		for (i = 0; i < 0x100000000ULL; i += 0x10000000ULL)
+			kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL);
+		break;
+	case 0:
+		/* Doing a complete flush -> start from scratch */
+		kvmppc_mmu_pte_flush_all(vcpu);
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+}
+
+/*
+ * Flush with mask 0xfffffffff
+ *
+ * Invalidates every shadow PTE whose virtual page, limited to the low 36
+ * bits, equals guest_vp. The caller passes guest_vp already masked.
+ */
+static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct list_head *list;
+	struct hpte_cache *pte, *tmp;
+	u64 vp_mask = 0xfffffffffULL;
+
+	list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
+
+	/*
+	 * Check the list for matching entries. The iterator never yields the
+	 * list head itself, so only real entries are seen here.
+	 */
+	list_for_each_entry_safe(pte, tmp, list, list_vpte) {
+		/* Invalidate matching PTEs */
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+	}
+}
+
+/*
+ * Flush with mask 0xffffff000
+ *
+ * Invalidates every shadow PTE whose virtual page, limited to bits 12..35,
+ * equals guest_vp. The caller passes guest_vp already masked.
+ */
+static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct list_head *list;
+	struct hpte_cache *pte, *tmp;
+	u64 vp_mask = 0xffffff000ULL;
+
+	/* Address of an array element, so this can never be NULL */
+	list = &vcpu->arch.hpte_hash_vpte_long[kvmppc_mmu_hash_vpte_long(guest_vp)];
+
+	/*
+	 * Check the list for matching entries. The braces matter: without
+	 * them the comparison below would sit outside the loop and run once,
+	 * after the walk, on a pointer that no longer refers to an entry.
+	 */
+	list_for_each_entry_safe(pte, tmp, list, list_vpte_long) {
+		/* Invalidate matching PTEs */
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+	}
+}
+
+/*
+ * Flush shadow PTEs by guest virtual page. Only the two masks the vPTE
+ * hash tables are keyed on are handled; any other mask triggers a warning
+ * and flushes nothing.
+ */
+void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
+{
+	dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
+		    vcpu->arch.hpte_cache_count, guest_vp, vp_mask);
+
+	/* Drop the bits the caller does not want compared */
+	guest_vp &= vp_mask;
+
+	if (vp_mask == 0xfffffffffULL)
+		kvmppc_mmu_pte_vflush_short(vcpu, guest_vp);
+	else if (vp_mask == 0xffffff000ULL)
+		kvmppc_mmu_pte_vflush_long(vcpu, guest_vp);
+	else
+		WARN_ON(1);
+}
+
+/*
+ * Flush every shadow PTE whose real address lies in [pa_start, pa_end).
+ *
+ * There is no lookup table keyed by real address, so this walks the
+ * complete list of entries.
+ */
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+	struct hpte_cache *pte, *tmp;
+
+	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx & 0x%lx\n",
+		    vcpu->arch.hpte_cache_count, pa_start, pa_end);
+
+	/*
+	 * Search in all entries for matching maps. The iterator never yields
+	 * the list head itself, so only real entries are seen here.
+	 */
+	list_for_each_entry_safe(pte, tmp, &vcpu->arch.hpte_all, list_all) {
+		if ((pte->pte.raddr >= pa_start) &&
+		    (pte->pte.raddr < pa_end)) {
+			invalidate_pte(vcpu, pte);
+		}
+	}
+}
+
+/*
+ * Allocate a zeroed shadow PTE from the vcpu's slab cache.
+ *
+ * Returns the new entry, or NULL if the allocation failed. The entry is
+ * not linked into any list here; kvmppc_mmu_hpte_cache_map() does that.
+ */
+struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
+{
+	struct hpte_cache *pte;
+
+	pte = kmem_cache_zalloc(vcpu->arch.hpte_cache, GFP_KERNEL);
+	if (!pte)
+		return NULL;	/* nothing was allocated, so nothing to count */
+
+	vcpu->arch.hpte_cache_count++;
+
+	/*
+	 * Limit reached: drop every linked entry. The entry just allocated
+	 * is not on any list yet, so it survives the flush.
+	 */
+	if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM)
+		kvmppc_mmu_pte_flush_all(vcpu);
+
+	return pte;
+}
+
+/* Tear down the vcpu's shadow PTE state: drop every entry, then the slab. */
+void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu)
+{
+	struct kmem_cache *cache = vcpu->arch.hpte_cache;
+
+	/* A zero mask selects the complete flush */
+	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+	kmem_cache_destroy(cache);
+}
+
+/* Set each of the len buckets in hash_list to an empty list. */
+static void kvmppc_mmu_hpte_init_hash(struct list_head *hash_list, int len)
+{
+	struct list_head *bucket;
+
+	for (bucket = hash_list; len > 0; len--, bucket++)
+		INIT_LIST_HEAD(bucket);
+}
+
+/*
+ * Set up the shadow PTE bookkeeping for a vcpu: empty lookup lists and a
+ * slab cache to allocate struct hpte_cache entries from.
+ *
+ * Returns 0 on success, -ENOMEM if the slab cache could not be created.
+ */
+int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
+{
+	char kmem_name[128];
+
+	/*
+	 * init hpte lookup hashes first, so the lists are valid to walk even
+	 * when the cache creation below fails
+	 */
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
+	INIT_LIST_HEAD(&vcpu->arch.hpte_all);
+
+	/*
+	 * init hpte slab cache
+	 *
+	 * NOTE(review): kmem_name lives on this stack frame; confirm that
+	 * kmem_cache_create() copies the name rather than keeping the pointer.
+	 */
+	snprintf(kmem_name, sizeof(kmem_name), "kvm-spt-%p", vcpu);
+	vcpu->arch.hpte_cache = kmem_cache_create(kmem_name,
+		sizeof(struct hpte_cache), sizeof(struct hpte_cache), 0, NULL);
+	if (!vcpu->arch.hpte_cache)
+		return -ENOMEM;
+
+	return 0;
+}