Patchwork KVM: PPC: Add generic hpte management functions

login
register
mail settings
Submitter Alexander Graf
Date June 25, 2010, 11:16 p.m.
Message ID <1277507817-626-2-git-send-email-agraf@suse.de>
Download mbox | patch
Permalink /patch/57007/
State Not Applicable
Headers show

Comments

Alexander Graf - June 25, 2010, 11:16 p.m.
Currently the shadow paging code keeps an array of entries it knows about.
Whenever the guest invalidates an entry, we loop through that entry,
trying to invalidate matching parts.

While this is a really simple implementation, it is probably the most
inefficient one possible. So instead, let's keep an array of lists around
that are indexed by a hash. This way each PTE can be added by 4 list_add,
removed by 4 list_del invocations and the search only needs to loop through
entries that share the same hash.

This patch implements said lookup and exports generic functions that both
the 32-bit and 64-bit backend can use.

Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - remove hpte_all list
  - lookup all using vpte_long lists
  - decrease size of vpte_long hash
  - fix missing brackets
---
 arch/powerpc/kvm/book3s_mmu_hpte.c |  286 ++++++++++++++++++++++++++++++++++++
 1 files changed, 286 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_mmu_hpte.c
Alexander Graf - June 25, 2010, 11:18 p.m.
On 26.06.2010, at 01:16, Alexander Graf wrote:

> Currently the shadow paging code keeps an array of entries it knows about.
> Whenever the guest invalidates an entry, we loop through that entry,
> trying to invalidate matching parts.
> 
> While this is a really simple implementation, it is probably the most
> ineffective one possible. So instead, let's keep an array of lists around
> that are indexed by a hash. This way each PTE can be added by 4 list_add,
> removed by 4 list_del invocations and the search only needs to loop through
> entries that share the same hash.
> 
> This patch implements said lookup and exports generic functions that both
> the 32-bit and 64-bit backend can use.

Yikes - I forgot -n.

This is patch 1/2.


Alex
Avi Kivity - June 28, 2010, 8:28 a.m.
On 06/26/2010 02:16 AM, Alexander Graf wrote:
> Currently the shadow paging code keeps an array of entries it knows about.
> Whenever the guest invalidates an entry, we loop through that entry,
> trying to invalidate matching parts.
>
> While this is a really simple implementation, it is probably the most
> ineffective one possible. So instead, let's keep an array of lists around
> that are indexed by a hash. This way each PTE can be added by 4 list_add,
> removed by 4 list_del invocations and the search only needs to loop through
> entries that share the same hash.
>
> This patch implements said lookup and exports generic functions that both
> the 32-bit and 64-bit backend can use.
>
>
> +
> +static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) {
> +	return hash_64(eaddr>>  PTE_SIZE, HPTEG_HASH_BITS_PTE);
> +}
> +
> +static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) {
> +	return hash_64(vpage&  0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
> +}
> +
> +static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) {
> +	return hash_64((vpage&  0xffffff000ULL)>>  12,
> +		       HPTEG_HASH_BITS_VPTE_LONG);
> +}
>    

Still with the weird coding style?

> +static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
> +{
> +	struct hpte_cache *pte, *tmp;
> +	int i;
> +
> +	for (i = 0; i<  HPTEG_HASH_NUM_VPTE_LONG; i++) {
> +		struct list_head *list =&vcpu->arch.hpte_hash_vpte_long[i];
> +
> +		list_for_each_entry_safe(pte, tmp, list, list_vpte_long) {
> +			/* Jump over the helper entry */
> +			if (&pte->list_vpte_long == list)
> +				continue;
>    

I don't think l_f_e_e_s() will ever give you the head back.

> +
> +			invalidate_pte(vcpu, pte);
> +		}
>    

Does invalidate_pte() remove the pte?  doesn't seem so, so you can drop 
the _safe iteration.

> +	}
> +}
> +
> +void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
> +{
> +	u64 i;
> +
> +	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx&  0x%lx\n",
> +		    vcpu->arch.hpte_cache_count, guest_ea, ea_mask);
> +
> +	guest_ea&= ea_mask;
> +
> +	switch (ea_mask) {
> +	case ~0xfffUL:
> +	{
> +		struct list_head *list;
> +		struct hpte_cache *pte, *tmp;
> +
> +		/* Find the list of entries in the map */
> +		list =&vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
> +
> +		/* Check the list for matching entries */
> +		list_for_each_entry_safe(pte, tmp, list, list_pte) {
> +			/* Jump over the helper entry */
> +			if (&pte->list_pte == list)
> +				continue;
>    

Same here.

> +
> +			/* Invalidate matching PTE */
> +			if ((pte->pte.eaddr&  ~0xfffUL) == guest_ea)
> +				invalidate_pte(vcpu, pte);
> +		}
> +		break;
> +	}
>    

Would be nice to put this block into a function.

> +	case 0x0ffff000:
> +		/* 32-bit flush w/o segment, go through all possible segments */
> +		for (i = 0; i<  0x100000000ULL; i += 0x10000000ULL)
> +			kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL);
> +		break;
> +	case 0:
> +		/* Doing a complete flush ->  start from scratch */
> +		kvmppc_mmu_pte_flush_all(vcpu);
> +		break;
> +	default:
> +		WARN_ON(1);
> +		break;
> +	}
> +}
> +
> +/* Flush with mask 0xfffffffff */
> +static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
> +{
> +	struct list_head *list;
> +	struct hpte_cache *pte, *tmp;
> +	u64 vp_mask = 0xfffffffffULL;
> +
> +	list =&vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
> +
> +	/* Check the list for matching entries */
> +	list_for_each_entry_safe(pte, tmp, list, list_vpte) {
> +		/* Jump over the helper entry */
> +		if (&pte->list_vpte == list)
> +			continue;
>    

list cannot contain list.  Or maybe I don't understand the data 
structure.  Isn't it multiple hash tables with lists holding matching ptes?

> +
> +		/* Invalidate matching PTEs */
> +		if ((pte->pte.vpage&  vp_mask) == guest_vp)
> +			invalidate_pte(vcpu, pte);
> +	}
> +}
> +
>
> +
> +void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
> +{
> +	struct hpte_cache *pte, *tmp;
> +	int i;
> +
> +	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n",
> +		    vcpu->arch.hpte_cache_count, pa_start, pa_end);
> +
> +	for (i = 0; i<  HPTEG_HASH_NUM_VPTE_LONG; i++) {
> +		struct list_head *list =&vcpu->arch.hpte_hash_vpte_long[i];
> +
> +		list_for_each_entry_safe(pte, tmp, list, list_vpte_long) {
> +			/* Jump over the helper entry */
> +			if (&pte->list_vpte_long == list)
> +				continue;
> +
> +			if ((pte->pte.raddr>= pa_start)&&
> +			    (pte->pte.raddr<  pa_end)) {
> +				invalidate_pte(vcpu, pte);
> +			}
>    

Extra braces.

> +		}
> +	}
> +}
> +
>
> +
> +static void kvmppc_mmu_hpte_init_hash(struct list_head *hash_list, int len)
> +{
> +	int i;
> +
> +	for (i = 0; i<  len; i++) {
> +		INIT_LIST_HEAD(&hash_list[i]);
> +	}
> +}
>    

Extra braces.

> +
> +int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
> +{
> +	char kmem_name[128];
> +
> +	/* init hpte slab cache */
> +	snprintf(kmem_name, 128, "kvm-spt-%p", vcpu);
> +	vcpu->arch.hpte_cache = kmem_cache_create(kmem_name,
> +		sizeof(struct hpte_cache), sizeof(struct hpte_cache), 0, NULL);
>    

Why not one global cache?

> +
> +	/* init hpte lookup hashes */
> +	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
> +				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
> +	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
> +				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
> +	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
> +				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
> +
> +	return 0;
> +}
>
Alexander Graf - June 28, 2010, 8:55 a.m.
On 28.06.2010, at 10:28, Avi Kivity wrote:

> On 06/26/2010 02:16 AM, Alexander Graf wrote:
>> Currently the shadow paging code keeps an array of entries it knows about.
>> Whenever the guest invalidates an entry, we loop through that entry,
>> trying to invalidate matching parts.
>> 
>> While this is a really simple implementation, it is probably the most
>> ineffective one possible. So instead, let's keep an array of lists around
>> that are indexed by a hash. This way each PTE can be added by 4 list_add,
>> removed by 4 list_del invocations and the search only needs to loop through
>> entries that share the same hash.
>> 
>> This patch implements said lookup and exports generic functions that both
>> the 32-bit and 64-bit backend can use.
>> 
>> 
>> +
>> +static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) {
>> +	return hash_64(eaddr>>  PTE_SIZE, HPTEG_HASH_BITS_PTE);
>> +}
>> +
>> +static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) {
>> +	return hash_64(vpage&  0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
>> +}
>> +
>> +static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) {
>> +	return hash_64((vpage&  0xffffff000ULL)>>  12,
>> +		       HPTEG_HASH_BITS_VPTE_LONG);
>> +}
>>   
> 
> Still with the wierd coding style?

Not sure what's going on there. My editor displays it normally. Weird.

> 
>> +static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
>> +{
>> +	struct hpte_cache *pte, *tmp;
>> +	int i;
>> +
>> +	for (i = 0; i<  HPTEG_HASH_NUM_VPTE_LONG; i++) {
>> +		struct list_head *list =&vcpu->arch.hpte_hash_vpte_long[i];
>> +
>> +		list_for_each_entry_safe(pte, tmp, list, list_vpte_long) {
>> +			/* Jump over the helper entry */
>> +			if (&pte->list_vpte_long == list)
>> +				continue;
>>   
> 
> I don't think l_f_e_e_s() will ever give you the head back.

Uh. Usually you have struct list_head in a struct and you point to the first entry to loop over all. So if it doesn't return the first entry, that would seem very counter-intuitive.

> 
>> +
>> +			invalidate_pte(vcpu, pte);
>> +		}
>>   
> 
> Does invalidate_pte() remove the pte?  doesn't seem so, so you can drop the _safe iteration.

Yes, it does.

> 
>> +	}
>> +}
>> +
>> +void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
>> +{
>> +	u64 i;
>> +
>> +	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx&  0x%lx\n",
>> +		    vcpu->arch.hpte_cache_count, guest_ea, ea_mask);
>> +
>> +	guest_ea&= ea_mask;
>> +
>> +	switch (ea_mask) {
>> +	case ~0xfffUL:
>> +	{
>> +		struct list_head *list;
>> +		struct hpte_cache *pte, *tmp;
>> +
>> +		/* Find the list of entries in the map */
>> +		list =&vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
>> +
>> +		/* Check the list for matching entries */
>> +		list_for_each_entry_safe(pte, tmp, list, list_pte) {
>> +			/* Jump over the helper entry */
>> +			if (&pte->list_pte == list)
>> +				continue;
>>   
> 
> Same here.
> 
>> +
>> +			/* Invalidate matching PTE */
>> +			if ((pte->pte.eaddr&  ~0xfffUL) == guest_ea)
>> +				invalidate_pte(vcpu, pte);
>> +		}
>> +		break;
>> +	}
>>   
> 
> Would be nice to put this block into a function.

Yup.

> 
>> +	case 0x0ffff000:
>> +		/* 32-bit flush w/o segment, go through all possible segments */
>> +		for (i = 0; i<  0x100000000ULL; i += 0x10000000ULL)
>> +			kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL);
>> +		break;
>> +	case 0:
>> +		/* Doing a complete flush ->  start from scratch */
>> +		kvmppc_mmu_pte_flush_all(vcpu);
>> +		break;
>> +	default:
>> +		WARN_ON(1);
>> +		break;
>> +	}
>> +}
>> +
>> +/* Flush with mask 0xfffffffff */
>> +static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
>> +{
>> +	struct list_head *list;
>> +	struct hpte_cache *pte, *tmp;
>> +	u64 vp_mask = 0xfffffffffULL;
>> +
>> +	list =&vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
>> +
>> +	/* Check the list for matching entries */
>> +	list_for_each_entry_safe(pte, tmp, list, list_vpte) {
>> +		/* Jump over the helper entry */
>> +		if (&pte->list_vpte == list)
>> +			continue;
>>   
> 
> list cannot contain list.  Or maybe I don't understand the data structure.  Isn't it multiple hash tables with lists holding matching ptes?

It is multiple hash tables with list_heads that are one element of a list that contains the matching ptes. Usually you'd have

struct x {
  struct list_head;
  int foo;
  char bar;
};

and you loop through each of those elements. What we have here is

struct list_head hash[..];

and some loose struct x's. The hash's "next" element is a struct x.

The "normal" way would be to have "struct x hash[..];" but I figured that eats too much space.

> 
>> +
>> +		/* Invalidate matching PTEs */
>> +		if ((pte->pte.vpage&  vp_mask) == guest_vp)
>> +			invalidate_pte(vcpu, pte);
>> +	}
>> +}
>> +
>> 
>> +
>> +void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
>> +{
>> +	struct hpte_cache *pte, *tmp;
>> +	int i;
>> +
>> +	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n",
>> +		    vcpu->arch.hpte_cache_count, pa_start, pa_end);
>> +
>> +	for (i = 0; i<  HPTEG_HASH_NUM_VPTE_LONG; i++) {
>> +		struct list_head *list =&vcpu->arch.hpte_hash_vpte_long[i];
>> +
>> +		list_for_each_entry_safe(pte, tmp, list, list_vpte_long) {
>> +			/* Jump over the helper entry */
>> +			if (&pte->list_vpte_long == list)
>> +				continue;
>> +
>> +			if ((pte->pte.raddr>= pa_start)&&
>> +			    (pte->pte.raddr<  pa_end)) {
>> +				invalidate_pte(vcpu, pte);
>> +			}
>>   
> 
> Extra braces.

Yeah, for two-lined if's I find it more readable that way. Is it forbidden?

> 
>> +		}
>> +	}
>> +}
>> +
>> 
>> +
>> +static void kvmppc_mmu_hpte_init_hash(struct list_head *hash_list, int len)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i<  len; i++) {
>> +		INIT_LIST_HEAD(&hash_list[i]);
>> +	}
>> +}
>>   
> 
> Extra braces.

Yup.

> 
>> +
>> +int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
>> +{
>> +	char kmem_name[128];
>> +
>> +	/* init hpte slab cache */
>> +	snprintf(kmem_name, 128, "kvm-spt-%p", vcpu);
>> +	vcpu->arch.hpte_cache = kmem_cache_create(kmem_name,
>> +		sizeof(struct hpte_cache), sizeof(struct hpte_cache), 0, NULL);
>>   
> 
> Why not one global cache?

You mean over all vcpus? Or over all VMs? Because this way they don't interfere. An operation on one vCPU doesn't inflict anything on another. There's also no locking necessary this way.


Alex
Avi Kivity - June 28, 2010, 9:12 a.m.
On 06/28/2010 11:55 AM, Alexander Graf wrote:
>
>>> +
>>> +static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) {
>>> +	return hash_64(eaddr>>   PTE_SIZE, HPTEG_HASH_BITS_PTE);
>>> +}
>>> +
>>> +static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) {
>>> +	return hash_64(vpage&   0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
>>> +}
>>> +
>>> +static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) {
>>> +	return hash_64((vpage&   0xffffff000ULL)>>   12,
>>> +		       HPTEG_HASH_BITS_VPTE_LONG);
>>> +}
>>>
>>>        
>> Still with the wierd coding style?
>>      
> Not sure what's going on there. My editor displays it normally. Weird.
>    

Try hitting 'save'.

>>> +static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
>>> +{
>>> +	struct hpte_cache *pte, *tmp;
>>> +	int i;
>>> +
>>> +	for (i = 0; i<   HPTEG_HASH_NUM_VPTE_LONG; i++) {
>>> +		struct list_head *list =&vcpu->arch.hpte_hash_vpte_long[i];
>>> +
>>> +		list_for_each_entry_safe(pte, tmp, list, list_vpte_long) {
>>> +			/* Jump over the helper entry */
>>> +			if (&pte->list_vpte_long == list)
>>> +				continue;
>>>
>>>        
>> I don't think l_f_e_e_s() will ever give you the head back.
>>      
> Uh. Usually you have struct list_head in a struct and you point to the first entry to loop over all. So if it doesn't return the first entry, that would seem very counter-intuitive.
>    

Linux list_heads aren't intuitive.  The same structure is used for the 
container and for the nodes.  Would have been better (and more typesafe) 
to have separate list_heads and list_nodes.

>>> +
>>> +			invalidate_pte(vcpu, pte);
>>> +		}
>>>
>>>        
>> Does invalidate_pte() remove the pte?  doesn't seem so, so you can drop the _safe iteration.
>>      
> Yes, it does.
>    

I don't see it?

> static void invalidate_pte(struct hpte_cache *pte)
> {
>     dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
>             pte->pte.eaddr, pte->pte.vpage, pte->host_va);
>
>     ppc_md.hpte_invalidate(pte->slot, pte->host_va,
>                    MMU_PAGE_4K, MMU_SEGSIZE_256M,
>                    false);
>     pte->host_va = 0;
>
>     if (pte->pte.may_write)
>         kvm_release_pfn_dirty(pte->pfn);
>     else
>         kvm_release_pfn_clean(pte->pfn);
> }

Am I looking at old code?

>>> +
>>> +/* Flush with mask 0xfffffffff */
>>> +static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
>>> +{
>>> +	struct list_head *list;
>>> +	struct hpte_cache *pte, *tmp;
>>> +	u64 vp_mask = 0xfffffffffULL;
>>> +
>>> +	list =&vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
>>> +
>>> +	/* Check the list for matching entries */
>>> +	list_for_each_entry_safe(pte, tmp, list, list_vpte) {
>>> +		/* Jump over the helper entry */
>>> +		if (&pte->list_vpte == list)
>>> +			continue;
>>>
>>>        
>> list cannot contain list.  Or maybe I don't understand the data structure.  Isn't it multiple hash tables with lists holding matching ptes?
>>      
> It is multiple hash tables with list_heads that are one element of a list that contains the matching ptes. Usually you'd have
>
> struct x {
>    struct list_head;
>    int foo;
>    char bar;
> };
>
> and you loop through each of those elements. What we have here is
>
> struct list_head hash[..];
>
> and some loose struct x's. The hash's "next" element is a struct x.
>
> The "normal" way would be to have "struct x hash[..];" but I figured that eats too much space.
>    

No, what you describe is quite normal.  In fact, x86 kvm mmu is exactly 
like that, except we only have a single hash:

>     struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];

(another difference is using struct hlist_head instead of list_head, 
which I recommend since it saves space)

>>> +
>>> +			if ((pte->pte.raddr>= pa_start)&&
>>> +			    (pte->pte.raddr<   pa_end)) {
>>> +				invalidate_pte(vcpu, pte);
>>> +			}
>>>
>>>        
>> Extra braces.
>>      
> Yeah, for two-lined if's I find it more readable that way. Is it forbidden?
>    

It's not forbidden, but it tends to attract "cleanup" patches, which are 
annoying.  Best to conform to the coding style if there isn't a good 
reason not to.

Personally I prefer braces for one-liners (yes they're ugly, but they're 
safer and easier to patch).

>>> +int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
>>> +{
>>> +	char kmem_name[128];
>>> +
>>> +	/* init hpte slab cache */
>>> +	snprintf(kmem_name, 128, "kvm-spt-%p", vcpu);
>>> +	vcpu->arch.hpte_cache = kmem_cache_create(kmem_name,
>>> +		sizeof(struct hpte_cache), sizeof(struct hpte_cache), 0, NULL);
>>>
>>>        
>> Why not one global cache?
>>      
> You mean over all vcpus? Or over all VMs?

Totally global.  As in 'static struct kmem_cache *kvm_hpte_cache;'.

> Because this way they don't interfere. An operation on one vCPU doesn't inflict anything on another. There's also no locking necessary this way.
>    

The slab writers have solved this for everyone, not just us.  
kmem_cache_alloc() will usually allocate from a per-cpu cache, so no 
interference and/or locking.  See ____cache_alloc().

If there's a problem in kmem_cache_alloc(), solve it there, don't 
introduce workarounds.
Alexander Graf - June 28, 2010, 9:27 a.m.
Am 28.06.2010 um 11:12 schrieb Avi Kivity <avi@redhat.com>:

> On 06/28/2010 11:55 AM, Alexander Graf wrote:
>>
>>>> +
>>>> +static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) {
>>>> +    return hash_64(eaddr>>   PTE_SIZE, HPTEG_HASH_BITS_PTE);
>>>> +}
>>>> +
>>>> +static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) {
>>>> +    return hash_64(vpage&   0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
>>>> +}
>>>> +
>>>> +static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) {
>>>> +    return hash_64((vpage&   0xffffff000ULL)>>   12,
>>>> +               HPTEG_HASH_BITS_VPTE_LONG);
>>>> +}
>>>>
>>>>
>>> Still with the wierd coding style?
>>>
>> Not sure what's going on there. My editor displays it normally.  
>> Weird.
>>
>
> Try hitting 'save'.

Thanks for the hint :). No really, no idea what's going on here.

>
>>>> +static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +    struct hpte_cache *pte, *tmp;
>>>> +    int i;
>>>> +
>>>> +    for (i = 0; i<   HPTEG_HASH_NUM_VPTE_LONG; i++) {
>>>> +        struct list_head *list =&vcpu->arch.hpte_hash_vpte_long 
>>>> [i];
>>>> +
>>>> +        list_for_each_entry_safe(pte, tmp, list, list_vpte_long) {
>>>> +            /* Jump over the helper entry */
>>>> +            if (&pte->list_vpte_long == list)
>>>> +                continue;
>>>>
>>>>
>>> I don't think l_f_e_e_s() will ever give you the head back.
>>>
>> Uh. Usually you have struct list_head in a struct and you point to  
>> the first entry to loop over all. So if it doesn't return the first  
>> entry, that would seem very counter-intuitive.
>>
>
> Linux list_heads aren't intuitive.  The same structure is used for  
> the container and for the nodes.  Would have been better (and more  
> typesafe) to have separate list_heads and list_nodes.

Hrm. Ok, I'll check by reading the source.

>
>>>> +
>>>> +            invalidate_pte(vcpu, pte);
>>>> +        }
>>>>
>>>>
>>> Does invalidate_pte() remove the pte?  doesn't seem so, so you can  
>>> drop the _safe iteration.
>>>
>> Yes, it does.
>>
>
> I don't see it?
>
>> static void invalidate_pte(struct hpte_cache *pte)
>> {
>>    dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
>>            pte->pte.eaddr, pte->pte.vpage, pte->host_va);
>>
>>    ppc_md.hpte_invalidate(pte->slot, pte->host_va,
>>                   MMU_PAGE_4K, MMU_SEGSIZE_256M,
>>                   false);
>>    pte->host_va = 0;
>>
>>    if (pte->pte.may_write)
>>        kvm_release_pfn_dirty(pte->pfn);
>>    else
>>        kvm_release_pfn_clean(pte->pfn);
>> }
>
> Am I looking at old code?

Apparently. Check book3s_mmu_*.c

>
>>>> +
>>>> +/* Flush with mask 0xfffffffff */
>>>> +static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu,  
>>>> u64 guest_vp)
>>>> +{
>>>> +    struct list_head *list;
>>>> +    struct hpte_cache *pte, *tmp;
>>>> +    u64 vp_mask = 0xfffffffffULL;
>>>> +
>>>> +    list =&vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte 
>>>> (guest_vp)];
>>>> +
>>>> +    /* Check the list for matching entries */
>>>> +    list_for_each_entry_safe(pte, tmp, list, list_vpte) {
>>>> +        /* Jump over the helper entry */
>>>> +        if (&pte->list_vpte == list)
>>>> +            continue;
>>>>
>>>>
>>> list cannot contain list.  Or maybe I don't understand the data  
>>> structure.  Isn't it multiple hash tables with lists holding  
>>> matching ptes?
>>>
>> It is multiple hash tables with list_heads that are one element of  
>> a list that contains the matching ptes. Usually you'd have
>>
>> struct x {
>>   struct list_head;
>>   int foo;
>>   char bar;
>> };
>>
>> and you loop through each of those elements. What we have here is
>>
>> struct list_head hash[..];
>>
>> and some loose struct x's. The hash's "next" element is a struct x.
>>
>> The "normal" way would be to have "struct x hash[..];" but I  
>> figured that eats too much space.
>>
>
> No, what you describe is quite normal.  In fact, x86 kvm mmu is  
> exactly like that, except we only have a single hash:
>
>>    struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];

I see.

>
> (another difference is using struct hlist_head instead of list_head,  
> which I recommend since it saves space)

Hrm. I thought about this quite a bit before too, but that makes  
invalidation more complicated, no? We always need to remember the  
previous entry in a list.

>
>>>> +
>>>> +            if ((pte->pte.raddr>= pa_start)&&
>>>> +                (pte->pte.raddr<   pa_end)) {
>>>> +                invalidate_pte(vcpu, pte);
>>>> +            }
>>>>
>>>>
>>> Extra braces.
>>>
>> Yeah, for two-lined if's I find it more readable that way. Is it  
>> forbidden?
>>
>
> It's not forbidden, but it tends to attract "cleanup" patches, which  
> are annoying.  Best to conform to the coding style if there isn't a  
> good reason not to.
>
> Personally I prefer braces for one-liners (yes they're ugly, but  
> they're safer and easier to patch).
>
>>>> +int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +    char kmem_name[128];
>>>> +
>>>> +    /* init hpte slab cache */
>>>> +    snprintf(kmem_name, 128, "kvm-spt-%p", vcpu);
>>>> +    vcpu->arch.hpte_cache = kmem_cache_create(kmem_name,
>>>> +        sizeof(struct hpte_cache), sizeof(struct hpte_cache), 0,  
>>>> NULL);
>>>>
>>>>
>>> Why not one global cache?
>>>
>> You mean over all vcpus? Or over all VMs?
>
> Totally global.  As in 'static struct kmem_cache *kvm_hpte_cache;'.

What would be the benefit?

>
>> Because this way they don't interfere. An operation on one vCPU  
>> doesn't inflict anything on another. There's also no locking  
>> necessary this way.
>>
>
> The slab writers have solved this for everyone, not just us.   
> kmem_cache_alloc() will usually allocate from a per-cpu cache, so no  
> interference and/or locking.  See ____cache_alloc().
>
> If there's a problem in kmem_cache_alloc(), solve it there, don't  
> introduce workarounds.

So you would still keep different hash arrays and everything, just  
allocate the objects from a global pool? I still fail to see how that  
benefits anyone.

Alex
Avi Kivity - June 28, 2010, 9:34 a.m.
On 06/28/2010 12:27 PM, Alexander Graf wrote:
>> Am I looking at old code?
>
>
> Apparently. Check book3s_mmu_*.c

I don't have that pattern.

>
>>
>> (another difference is using struct hlist_head instead of list_head, 
>> which I recommend since it saves space)
>
> Hrm. I thought about this quite a bit before too, but that makes 
> invalidation more complicated, no? We always need to remember the 
> previous entry in a list.

hlist_for_each_entry_safe() does that.

>>
>>>>> +int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +    char kmem_name[128];
>>>>> +
>>>>> +    /* init hpte slab cache */
>>>>> +    snprintf(kmem_name, 128, "kvm-spt-%p", vcpu);
>>>>> +    vcpu->arch.hpte_cache = kmem_cache_create(kmem_name,
>>>>> +        sizeof(struct hpte_cache), sizeof(struct hpte_cache), 0, 
>>>>> NULL);
>>>>>
>>>>>
>>>> Why not one global cache?
>>>>
>>> You mean over all vcpus? Or over all VMs?
>>
>> Totally global.  As in 'static struct kmem_cache *kvm_hpte_cache;'.
>
> What would be the benefit?

Less and simpler code, better reporting through slabtop, less wastage of 
partially allocated slab pages.

>>> Because this way they don't interfere. An operation on one vCPU 
>>> doesn't inflict anything on another. There's also no locking 
>>> necessary this way.
>>>
>>
>> The slab writers have solved this for everyone, not just us.  
>> kmem_cache_alloc() will usually allocate from a per-cpu cache, so no 
>> interference and/or locking.  See ____cache_alloc().
>>
>> If there's a problem in kmem_cache_alloc(), solve it there, don't 
>> introduce workarounds.
>
> So you would still keep different hash arrays and everything, just 
> allocate the objects from a global pool? 

Yes.

> I still fail to see how that benefits anyone.

See above.
Alexander Graf - June 28, 2010, 9:55 a.m.
Avi Kivity wrote:
> On 06/28/2010 12:27 PM, Alexander Graf wrote:
>>> Am I looking at old code?
>>
>>
>> Apparently. Check book3s_mmu_*.c
>
> I don't have that pattern.

It's in this patch.

> +static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
> +{
> +	dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
> +		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
> +
> +	/* Different for 32 and 64 bit */
> +	kvmppc_mmu_invalidate_pte(vcpu, pte);
> +
> +	if (pte->pte.may_write)
> +		kvm_release_pfn_dirty(pte->pfn);
> +	else
> +		kvm_release_pfn_clean(pte->pfn);
> +
> +	list_del(&pte->list_pte);
> +	list_del(&pte->list_vpte);
> +	list_del(&pte->list_vpte_long);
> +	list_del(&pte->list_all);
> +
> +	kmem_cache_free(vcpu->arch.hpte_cache, pte);
> +}
> +

>
>>
>>>
>>> (another difference is using struct hlist_head instead of list_head,
>>> which I recommend since it saves space)
>>
>> Hrm. I thought about this quite a bit before too, but that makes
>> invalidation more complicated, no? We always need to remember the
>> previous entry in a list.
>
> hlist_for_each_entry_safe() does that.

Oh - very nice. So all I need to do is pass the previous list entry to
invalidate_pte too and I'm good. I guess I'll give it a shot.

>
>>>
>>>>>> +int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
>>>>>> +{
>>>>>> +    char kmem_name[128];
>>>>>> +
>>>>>> +    /* init hpte slab cache */
>>>>>> +    snprintf(kmem_name, 128, "kvm-spt-%p", vcpu);
>>>>>> +    vcpu->arch.hpte_cache = kmem_cache_create(kmem_name,
>>>>>> +        sizeof(struct hpte_cache), sizeof(struct hpte_cache), 0,
>>>>>> NULL);
>>>>>>
>>>>>>
>>>>> Why not one global cache?
>>>>>
>>>> You mean over all vcpus? Or over all VMs?
>>>
>>> Totally global.  As in 'static struct kmem_cache *kvm_hpte_cache;'.
>>
>> What would be the benefit?
>
> Less and simpler code, better reporting through slabtop, less wastage
> of partially allocated slab pages.

But it also means that one VM can spill the global slab cache and kill
another VM's mm performance, no?


Alex
Avi Kivity - June 28, 2010, 10:01 a.m.
On 06/28/2010 12:55 PM, Alexander Graf wrote:
> Avi Kivity wrote:
>    
>> On 06/28/2010 12:27 PM, Alexander Graf wrote:
>>      
>>>> Am I looking at old code?
>>>>          
>>>
>>> Apparently. Check book3s_mmu_*.c
>>>        
>> I don't have that pattern.
>>      
> It's in this patch.
>    

Yes.  Silly me.

>> +static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
>> +{
>> +	dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) ->  0x%llx\n",
>> +		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
>> +
>> +	/* Different for 32 and 64 bit */
>> +	kvmppc_mmu_invalidate_pte(vcpu, pte);
>> +
>> +	if (pte->pte.may_write)
>> +		kvm_release_pfn_dirty(pte->pfn);
>> +	else
>> +		kvm_release_pfn_clean(pte->pfn);
>> +
>> +	list_del(&pte->list_pte);
>> +	list_del(&pte->list_vpte);
>> +	list_del(&pte->list_vpte_long);
>> +	list_del(&pte->list_all);
>> +
>> +	kmem_cache_free(vcpu->arch.hpte_cache, pte);
>> +}
>> +
>>      

(that's the old one with list_all - better check what's going on here)


>>>> (another difference is using struct hlist_head instead of list_head,
>>>> which I recommend since it saves space)
>>>>          
>>> Hrm. I thought about this quite a bit before too, but that makes
>>> invalidation more complicated, no? We always need to remember the
>>> previous entry in a list.
>>>        
>> hlist_for_each_entry_safe() does that.
>>      
> Oh - very nice. So all I need to do is pass the previous list entry to
> invalide_pte too and I'm good. I guess I'll give it a shot.
>    

No, just the for_each cursor.

>> Less and simpler code, better reporting through slabtop, less wastage
>> of partially allocated slab pages.
>>      
> But it also means that one VM can spill the global slab cache and kill
> another VM's mm performance, no?
>    

What do you mean by spill?

btw, in the midst of the nit-picking frenzy I forgot to ask how the 
individual hash chain lengths as well as the per-vm allocation were limited.

On x86 we have a per-vm limit and we allow the mm shrinker to reduce 
shadow mmu data structures dynamically.
Alexander Graf - June 28, 2010, 1:25 p.m.
Avi Kivity wrote:
> On 06/28/2010 12:55 PM, Alexander Graf wrote:
>> Avi Kivity wrote:
>>   
>>> On 06/28/2010 12:27 PM, Alexander Graf wrote:
>>>     
>>>>> Am I looking at old code?
>>>>>          
>>>>
>>>> Apparently. Check book3s_mmu_*.c
>>>>        
>>> I don't have that pattern.
>>>      
>> It's in this patch.
>>    
>
> Yes.  Silly me.
>
>>> +static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache
>>> *pte)
>>> +{
>>> +    dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) ->  0x%llx\n",
>>> +            pte->pte.eaddr, pte->pte.vpage, pte->host_va);
>>> +
>>> +    /* Different for 32 and 64 bit */
>>> +    kvmppc_mmu_invalidate_pte(vcpu, pte);
>>> +
>>> +    if (pte->pte.may_write)
>>> +        kvm_release_pfn_dirty(pte->pfn);
>>> +    else
>>> +        kvm_release_pfn_clean(pte->pfn);
>>> +
>>> +    list_del(&pte->list_pte);
>>> +    list_del(&pte->list_vpte);
>>> +    list_del(&pte->list_vpte_long);
>>> +    list_del(&pte->list_all);
>>> +
>>> +    kmem_cache_free(vcpu->arch.hpte_cache, pte);
>>> +}
>>> +
>>>      
>
> (that's the old one with list_all - better check what's going on here)

Yeah, I just searched my inbox for the first patch. Obviously it was the
old version :(.

>
>
>>>>> (another difference is using struct hlist_head instead of list_head,
>>>>> which I recommend since it saves space)
>>>>>          
>>>> Hrm. I thought about this quite a bit before too, but that makes
>>>> invalidation more complicated, no? We always need to remember the
>>>> previous entry in a list.
>>>>        
>>> hlist_for_each_entry_safe() does that.
>>>      
>> Oh - very nice. So all I need to do is pass the previous list entry to
>> invalidate_pte too and I'm good. I guess I'll give it a shot.
>>    
>
> No, just the for_each cursor.
>
>>> Less and simpler code, better reporting through slabtop, less wastage
>>> of partially allocated slab pages.
>>>      
>> But it also means that one VM can spill the global slab cache and kill
>> another VM's mm performance, no?
>>    
>
> What do you mean by spill?
>
> btw, in the midst of the nit-picking frenzy I forgot to ask how the
> individual hash chain lengths as well as the per-vm allocation were
> limited.
>
> On x86 we have a per-vm limit and we allow the mm shrinker to reduce
> shadow mmu data structures dynamically.
>

Very simple. I keep an int with the number of allocated entries around
and if that hits a define'd threshold, I flush all shadow pages.


Alex
Avi Kivity - June 28, 2010, 1:30 p.m.
On 06/28/2010 04:25 PM, Alexander Graf wrote:
>>
>>>> Less and simpler code, better reporting through slabtop, less wastage
>>>> of partially allocated slab pages.
>>>>
>>>>          
>>> But it also means that one VM can spill the global slab cache and kill
>>> another VM's mm performance, no?
>>>
>>>        
>> What do you mean by spill?
>>      


Well?

>> btw, in the midst of the nit-picking frenzy I forgot to ask how the
>> individual hash chain lengths as well as the per-vm allocation were
>> limited.
>>
>> On x86 we have a per-vm limit and we allow the mm shrinker to reduce
>> shadow mmu data structures dynamically.
>>
>>      
> Very simple. I keep an int with the number of allocated entries around
> and if that hits a define'd threshold, I flush all shadow pages.
>    

A truly nefarious guest will make all ptes hash to the same chain, 
making some operations very long (O(n^2) in the x86 mmu, don't know 
about ppc) under a spinlock.  So we had to limit hash chains, not just 
the number of entries.

But your mmu is per-cpu, no?  In that case, no spinlock, and any damage 
the guest does is limited to itself.
Alexander Graf - June 28, 2010, 1:32 p.m.
Avi Kivity wrote:
> On 06/28/2010 04:25 PM, Alexander Graf wrote:
>>>
>>>>> Less and simpler code, better reporting through slabtop, less wastage
>>>>> of partially allocated slab pages.
>>>>>
>>>>>          
>>>> But it also means that one VM can spill the global slab cache and kill
>>>> another VM's mm performance, no?
>>>>
>>>>        
>>> What do you mean by spill?
>>>      
>
>
> Well?

I was thinking of a global capping, but I guess I could still do it
per-vcpu. So yes, doing a global slab doesn't hurt.

>
>>> btw, in the midst of the nit-picking frenzy I forgot to ask how the
>>> individual hash chain lengths as well as the per-vm allocation were
>>> limited.
>>>
>>> On x86 we have a per-vm limit and we allow the mm shrinker to reduce
>>> shadow mmu data structures dynamically.
>>>
>>>      
>> Very simple. I keep an int with the number of allocated entries around
>> and if that hits a define'd threshold, I flush all shadow pages.
>>    
>
> A truly nefarious guest will make all ptes hash to the same chain,
> making some operations very long (O(n^2) in the x86 mmu, don't know
> about ppc) under a spinlock.  So we had to limit hash chains, not just
> the number of entries.
>
> But your mmu is per-cpu, no?  In that case, no spinlock, and any
> damage the guest does is limited to itself.

Yes, it is. No locking. The vcpu can kill its own performance, but I
don't care about that.


Alex
Alexander Graf - June 29, 2010, 12:56 p.m.
Avi Kivity wrote:
> On 06/28/2010 11:55 AM, Alexander Graf wrote:
>>
>>>> +
>>>> +static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) {
>>>> +    return hash_64(eaddr>>   PTE_SIZE, HPTEG_HASH_BITS_PTE);
>>>> +}
>>>> +
>>>> +static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) {
>>>> +    return hash_64(vpage&   0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
>>>> +}
>>>> +
>>>> +static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) {
>>>> +    return hash_64((vpage&   0xffffff000ULL)>>   12,
>>>> +               HPTEG_HASH_BITS_VPTE_LONG);
>>>> +}
>>>>
>>>>        
>>> Still with the weird coding style?
>>>      
>> Not sure what's going on there. My editor displays it normally. Weird.
>>    
>
> Try hitting 'save'.

hexdump -C on the respective section in the exact patch file I submitted
above shows:

00000a80  75 72 6e 20 68 61 73 68  5f 36 34 28 65 61 64 64  |urn
hash_64(eadd|
00000a90  72 20 3e 3e 20 50 54 45  5f 53 49 5a 45 2c 20 48  |r >>
PTE_SIZE, H|


Maybe your mail client breaks it?


Alex
Avi Kivity - June 29, 2010, 1:05 p.m.
On 06/29/2010 03:56 PM, Alexander Graf wrote:
> Avi Kivity wrote:
>    
>> On 06/28/2010 11:55 AM, Alexander Graf wrote:
>>      
>>>        
>>>>> +
>>>>> +static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) {
>>>>> +    return hash_64(eaddr>>    PTE_SIZE, HPTEG_HASH_BITS_PTE);
>>>>> +}
>>>>> +
>>>>> +static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) {
>>>>> +    return hash_64(vpage&    0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
>>>>> +}
>>>>> +
>>>>> +static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) {
>>>>> +    return hash_64((vpage&    0xffffff000ULL)>>    12,
>>>>> +               HPTEG_HASH_BITS_VPTE_LONG);
>>>>> +}
>>>>>
>>>>>
>>>>>            
>>>> Still with the weird coding style?
>>>>
>>>>          
>>> Not sure what's going on there. My editor displays it normally. Weird.
>>>
>>>        
>> Try hitting 'save'.
>>      
> hexdump -C on the respective section in the exact patch file I submitted
> above shows:
>
> 00000a80  75 72 6e 20 68 61 73 68  5f 36 34 28 65 61 64 64  |urn
> hash_64(eadd|
> 00000a90  72 20 3e 3e 20 50 54 45  5f 53 49 5a 45 2c 20 48  |r>>
> PTE_SIZE, H|
>
>
> Maybe your mail client breaks it?
>    

The list archives too:

   http://www.mail-archive.com/kvm@vger.kernel.org/msg37093.html

Looks like a cache coherency bug.  What processor are you using?
Alexander Graf - June 29, 2010, 1:06 p.m.
Avi Kivity wrote:
> On 06/29/2010 03:56 PM, Alexander Graf wrote:
>> Avi Kivity wrote:
>>   
>>> On 06/28/2010 11:55 AM, Alexander Graf wrote:
>>>     
>>>>       
>>>>>> +
>>>>>> +static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) {
>>>>>> +    return hash_64(eaddr>>    PTE_SIZE, HPTEG_HASH_BITS_PTE);
>>>>>> +}
>>>>>> +
>>>>>> +static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) {
>>>>>> +    return hash_64(vpage&    0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
>>>>>> +}
>>>>>> +
>>>>>> +static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) {
>>>>>> +    return hash_64((vpage&    0xffffff000ULL)>>    12,
>>>>>> +               HPTEG_HASH_BITS_VPTE_LONG);
>>>>>> +}
>>>>>>
>>>>>>
>>>>>>            
>>>>> Still with the weird coding style?
>>>>>
>>>>>          
>>>> Not sure what's going on there. My editor displays it normally. Weird.
>>>>
>>>>        
>>> Try hitting 'save'.
>>>      
>> hexdump -C on the respective section in the exact patch file I submitted
>> above shows:
>>
>> 00000a80  75 72 6e 20 68 61 73 68  5f 36 34 28 65 61 64 64  |urn
>> hash_64(eadd|
>> 00000a90  72 20 3e 3e 20 50 54 45  5f 53 49 5a 45 2c 20 48  |r>>
>> PTE_SIZE, H|
>>
>>
>> Maybe your mail client breaks it?
>>    
>
> The list archives too:
>
>   http://www.mail-archive.com/kvm@vger.kernel.org/msg37093.html
>
> Looks like a cache coherency bug.  What processor are you using?

Are we looking at the same link? Looks good to me there.

Alex
Avi Kivity - June 29, 2010, 1:13 p.m.
On 06/29/2010 04:06 PM, Alexander Graf wrote:
>
> Are we looking at the same link? Looks good to me there.
>
>    


We're probably looking at the same link but looking at different 
things.  I'm whining about

     static u64 f() {
         ...
     }

as opposed to the more sober

     static u64 f()
     {
         ...
     }

for f in [kvmppc_mmu_hash_pte, kvmppc_mmu_hash_vpte, 
kvmppc_mmu_hash_vpte_long].

Patch

diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
new file mode 100644
index 0000000..5826e61
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -0,0 +1,286 @@ 
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+
/* Shift (in bits) from an effective address to its page number */
#define PTE_SIZE	12

/* #define DEBUG_MMU */

#ifdef DEBUG_MMU
#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
#else
/* do { } while (0) swallows the trailing semicolon safely */
#define dprintk_mmu(a, ...) do { } while (0)
#endif
+
+static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) {
+	return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE);
+}
+
+static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) {
+	return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
+}
+
+static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) {
+	return hash_64((vpage & 0xffffff000ULL) >> 12,
+		       HPTEG_HASH_BITS_VPTE_LONG);
+}
+
+void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	u64 index;
+
+	/* Add to ePTE list */
+	index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
+	list_add(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
+
+	/* Add to vPTE list */
+	index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
+	list_add(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
+
+	/* Add to vPTE_long list */
+	index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
+	list_add(&pte->list_vpte_long, &vcpu->arch.hpte_hash_vpte_long[index]);
+}
+
+static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
+		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
+
+	/* Different for 32 and 64 bit */
+	kvmppc_mmu_invalidate_pte(vcpu, pte);
+
+	if (pte->pte.may_write)
+		kvm_release_pfn_dirty(pte->pfn);
+	else
+		kvm_release_pfn_clean(pte->pfn);
+
+	list_del(&pte->list_pte);
+	list_del(&pte->list_vpte);
+	list_del(&pte->list_vpte_long);
+
+	kmem_cache_free(vcpu->arch.hpte_cache, pte);
+}
+
+static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
+{
+	struct hpte_cache *pte, *tmp;
+	int i;
+
+	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
+		struct list_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+
+		list_for_each_entry_safe(pte, tmp, list, list_vpte_long) {
+			/* Jump over the helper entry */
+			if (&pte->list_vpte_long == list)
+				continue;
+
+			invalidate_pte(vcpu, pte);
+		}
+	}
+}
+
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
+{
+	u64 i;
+
+	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
+		    vcpu->arch.hpte_cache_count, guest_ea, ea_mask);
+
+	guest_ea &= ea_mask;
+
+	switch (ea_mask) {
+	case ~0xfffUL:
+	{
+		struct list_head *list;
+		struct hpte_cache *pte, *tmp;
+
+		/* Find the list of entries in the map */
+		list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
+
+		/* Check the list for matching entries */
+		list_for_each_entry_safe(pte, tmp, list, list_pte) {
+			/* Jump over the helper entry */
+			if (&pte->list_pte == list)
+				continue;
+
+			/* Invalidate matching PTE */
+			if ((pte->pte.eaddr & ~0xfffUL) == guest_ea)
+				invalidate_pte(vcpu, pte);
+		}
+		break;
+	}
+	case 0x0ffff000:
+		/* 32-bit flush w/o segment, go through all possible segments */
+		for (i = 0; i < 0x100000000ULL; i += 0x10000000ULL)
+			kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL);
+		break;
+	case 0:
+		/* Doing a complete flush -> start from scratch */
+		kvmppc_mmu_pte_flush_all(vcpu);
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+}
+
+/* Flush with mask 0xfffffffff */
+static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct list_head *list;
+	struct hpte_cache *pte, *tmp;
+	u64 vp_mask = 0xfffffffffULL;
+
+	list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
+
+	/* Check the list for matching entries */
+	list_for_each_entry_safe(pte, tmp, list, list_vpte) {
+		/* Jump over the helper entry */
+		if (&pte->list_vpte == list)
+			continue;
+
+		/* Invalidate matching PTEs */
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+	}
+}
+
+/* Flush with mask 0xffffff000 */
+static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct list_head *list;
+	struct hpte_cache *pte, *tmp;
+	u64 vp_mask = 0xffffff000ULL;
+
+	list = &vcpu->arch.hpte_hash_vpte_long[
+		kvmppc_mmu_hash_vpte_long(guest_vp)];
+
+	/* Check the list for matching entries */
+	list_for_each_entry_safe(pte, tmp, list, list_vpte_long) {
+		/* Jump over the helper entry */
+		if (&pte->list_vpte_long == list)
+			continue;
+
+		/* Invalidate matching PTEs */
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+	}
+}
+
+void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
+{
+	dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
+		    vcpu->arch.hpte_cache_count, guest_vp, vp_mask);
+	guest_vp &= vp_mask;
+
+	switch(vp_mask) {
+	case 0xfffffffffULL:
+		kvmppc_mmu_pte_vflush_short(vcpu, guest_vp);
+		break;
+	case 0xffffff000ULL:
+		kvmppc_mmu_pte_vflush_long(vcpu, guest_vp);
+		break;
+	default:
+		WARN_ON(1);
+		return;
+	}
+}
+
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+	struct hpte_cache *pte, *tmp;
+	int i;
+
+	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n",
+		    vcpu->arch.hpte_cache_count, pa_start, pa_end);
+
+	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
+		struct list_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+
+		list_for_each_entry_safe(pte, tmp, list, list_vpte_long) {
+			/* Jump over the helper entry */
+			if (&pte->list_vpte_long == list)
+				continue;
+
+			if ((pte->pte.raddr >= pa_start) &&
+			    (pte->pte.raddr < pa_end)) {
+				invalidate_pte(vcpu, pte);
+			}
+		}
+	}
+}
+
+struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
+{
+	struct hpte_cache *pte;
+
+	pte = kmem_cache_zalloc(vcpu->arch.hpte_cache, GFP_KERNEL);
+	vcpu->arch.hpte_cache_count++;
+
+	if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM)
+		kvmppc_mmu_pte_flush_all(vcpu);
+
+	return pte;
+}
+
/* Free all shadow PTEs for this vcpu and tear down the slab cache. */
void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu)
{
	/* ea_mask of 0 means "flush everything" */
	kvmppc_mmu_pte_flush(vcpu, 0, 0);
	kmem_cache_destroy(vcpu->arch.hpte_cache);
}
+
+static void kvmppc_mmu_hpte_init_hash(struct list_head *hash_list, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++) {
+		INIT_LIST_HEAD(&hash_list[i]);
+	}
+}
+
+int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
+{
+	char kmem_name[128];
+
+	/* init hpte slab cache */
+	snprintf(kmem_name, 128, "kvm-spt-%p", vcpu);
+	vcpu->arch.hpte_cache = kmem_cache_create(kmem_name,
+		sizeof(struct hpte_cache), sizeof(struct hpte_cache), 0, NULL);
+
+	/* init hpte lookup hashes */
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
+
+	return 0;
+}