diff mbox

[-V5] target-ppc: Fix page table lookup with kvm enabled

Message ID 1381490016-13557-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com
State New
Headers show

Commit Message

Aneesh Kumar K.V Oct. 11, 2013, 11:13 a.m. UTC
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

With kvm enabled, we store the hash page table information in the hypervisor.
Use ioctl to read the htab contents. Without this we get the below error when
trying to read the guest address

 (gdb) x/10 do_fork
 0xc000000000098660 <do_fork>:   Cannot access memory at address 0xc000000000098660
 (gdb)

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
Changes from V4:
* Rewrite to avoid two code paths for doing hash lookups

 hw/ppc/spapr_hcall.c    | 44 ++++++++++++++++++++-----------
 target-ppc/kvm.c        | 47 +++++++++++++++++++++++++++++++++
 target-ppc/kvm_ppc.h    | 16 +++++++++++
 target-ppc/mmu-hash64.c | 70 +++++++++++++++++++++++++++++++++++++++----------
 target-ppc/mmu-hash64.h | 33 ++++++++++++++++-------
 5 files changed, 170 insertions(+), 40 deletions(-)

Comments

Alexander Graf Oct. 11, 2013, 1:36 p.m. UTC | #1
On 11.10.2013, at 13:13, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:

> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> With kvm enabled, we store the hash page table information in the hypervisor.
> Use ioctl to read the htab contents. Without this we get the below error when
> trying to read the guest address
> 
> (gdb) x/10 do_fork
> 0xc000000000098660 <do_fork>:   Cannot access memory at address 0xc000000000098660
> (gdb)
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
> Changes from V4:
> * Rewrite to avoid two code paths for doing hash lookups
> 
> hw/ppc/spapr_hcall.c    | 44 ++++++++++++++++++++-----------
> target-ppc/kvm.c        | 47 +++++++++++++++++++++++++++++++++
> target-ppc/kvm_ppc.h    | 16 +++++++++++
> target-ppc/mmu-hash64.c | 70 +++++++++++++++++++++++++++++++++++++++----------
> target-ppc/mmu-hash64.h | 33 ++++++++++++++++-------
> 5 files changed, 170 insertions(+), 40 deletions(-)
> 
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index f10ba8a..96348e3 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -52,6 +52,7 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>     target_ulong raddr;
>     target_ulong i;
>     hwaddr hpte;
> +    struct ppc_hash64_hpte_token *token;
> 
>     /* only handle 4k and 16M pages for now */
>     if (pteh & HPTE64_V_LARGE) {
> @@ -94,25 +95,32 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>     if ((pte_index * HASH_PTE_SIZE_64) & ~env->htab_mask) {
>         return H_PARAMETER;
>     }
> +
> +    i = 0;
> +    hpte = pte_index * HASH_PTE_SIZE_64;
>     if (likely((flags & H_EXACT) == 0)) {
>         pte_index &= ~7ULL;
> -        hpte = pte_index * HASH_PTE_SIZE_64;
> -        for (i = 0; ; ++i) {
> +        token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
> +        do {

Why convert this into a while loop?

>             if (i == 8) {
> +                ppc_hash64_stop_access(token);
>                 return H_PTEG_FULL;
>             }
> -            if ((ppc_hash64_load_hpte0(env, hpte) & HPTE64_V_VALID) == 0) {
> +            if ((ppc_hash64_load_hpte0(token, i) & HPTE64_V_VALID) == 0) {
>                 break;
>             }
> -            hpte += HASH_PTE_SIZE_64;
> -        }
> +        } while (i++);
> +        ppc_hash64_stop_access(token);
>     } else {
> -        i = 0;
> -        hpte = pte_index * HASH_PTE_SIZE_64;
> -        if (ppc_hash64_load_hpte0(env, hpte) & HPTE64_V_VALID) {
> +        token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
> +        if (ppc_hash64_load_hpte0(token, 0) & HPTE64_V_VALID) {
> +            ppc_hash64_stop_access(token);
>             return H_PTEG_FULL;
>         }
> +        ppc_hash64_stop_access(token);
>     }
> +    hpte += i * HASH_PTE_SIZE_64;
> +
>     ppc_hash64_store_hpte1(env, hpte, ptel);
>     /* eieio();  FIXME: need some sort of barrier for smp? */
>     ppc_hash64_store_hpte0(env, hpte, pteh | HPTE64_V_HPTE_DIRTY);
> @@ -135,15 +143,16 @@ static RemoveResult remove_hpte(CPUPPCState *env, target_ulong ptex,
> {
>     hwaddr hpte;
>     target_ulong v, r, rb;
> +    struct ppc_hash64_hpte_token  *token;
> 
>     if ((ptex * HASH_PTE_SIZE_64) & ~env->htab_mask) {
>         return REMOVE_PARM;
>     }
> 
> -    hpte = ptex * HASH_PTE_SIZE_64;
> -
> -    v = ppc_hash64_load_hpte0(env, hpte);
> -    r = ppc_hash64_load_hpte1(env, hpte);
> +    token = ppc_hash64_start_access(ppc_env_get_cpu(env), ptex);
> +    v = ppc_hash64_load_hpte0(token, 0);
> +    r = ppc_hash64_load_hpte1(token, 0);
> +    ppc_hash64_stop_access(token);
> 
>     if ((v & HPTE64_V_VALID) == 0 ||
>         ((flags & H_AVPN) && (v & ~0x7fULL) != avpn) ||
> @@ -152,6 +161,7 @@ static RemoveResult remove_hpte(CPUPPCState *env, target_ulong ptex,
>     }
>     *vp = v;
>     *rp = r;
> +    hpte = ptex * HASH_PTE_SIZE_64;
>     ppc_hash64_store_hpte0(env, hpte, HPTE64_V_HPTE_DIRTY);
>     rb = compute_tlbie_rb(v, r, ptex);
>     ppc_tlb_invalidate_one(env, rb);
> @@ -261,15 +271,16 @@ static target_ulong h_protect(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>     target_ulong avpn = args[2];
>     hwaddr hpte;
>     target_ulong v, r, rb;
> +    struct ppc_hash64_hpte_token *token;
> 
>     if ((pte_index * HASH_PTE_SIZE_64) & ~env->htab_mask) {
>         return H_PARAMETER;
>     }
> 
> -    hpte = pte_index * HASH_PTE_SIZE_64;
> -
> -    v = ppc_hash64_load_hpte0(env, hpte);
> -    r = ppc_hash64_load_hpte1(env, hpte);
> +    token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
> +    v = ppc_hash64_load_hpte0(token, 0);
> +    r = ppc_hash64_load_hpte1(token, 0);
> +    ppc_hash64_stop_access(token);
> 
>     if ((v & HPTE64_V_VALID) == 0 ||
>         ((flags & H_AVPN) && (v & ~0x7fULL) != avpn)) {
> @@ -282,6 +293,7 @@ static target_ulong h_protect(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>     r |= (flags << 48) & HPTE64_R_KEY_HI;
>     r |= flags & (HPTE64_R_PP | HPTE64_R_N | HPTE64_R_KEY_LO);
>     rb = compute_tlbie_rb(v, r, pte_index);
> +    hpte = pte_index * HASH_PTE_SIZE_64;
>     ppc_hash64_store_hpte0(env, hpte, (v & ~HPTE64_V_VALID) | HPTE64_V_HPTE_DIRTY);
>     ppc_tlb_invalidate_one(env, rb);
>     ppc_hash64_store_hpte1(env, hpte, r);
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index ac70efbf..3ad3283 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -1783,6 +1783,11 @@ bool kvmppc_has_cap_epr(void)
>     return cap_epr;
> }
> 
> +bool kvmppc_has_cap_htab_fd(void)
> +{
> +    return cap_htab_fd;
> +}
> +
> static int kvm_ppc_register_host_cpu_type(void)
> {
>     TypeInfo type_info = {
> @@ -1888,3 +1893,45 @@ int kvm_arch_on_sigbus(int code, void *addr)
> void kvm_arch_init_irq_routing(KVMState *s)
> {
> }
> +
> +int kvm_ppc_hash64_start_access(PowerPCCPU *cpu, unsigned long pte_index,
> +                                struct ppc_hash64_hpte_token *token)
> +{
> +    int htab_fd;
> +    int hpte_group_size;
> +    struct kvm_get_htab_fd ghf;
> +    struct kvm_get_htab_buf {
> +        struct kvm_get_htab_header header;
> +        /*
> +         * We required one extra byte for read
> +         */
> +        unsigned long hpte[(HPTES_PER_GROUP * 2) + 1];
> +    } hpte_buf;;

Double semicolon?

> +
> +    ghf.flags = 0;
> +    ghf.start_index = pte_index;
> +    htab_fd = kvm_vm_ioctl(kvm_state, KVM_PPC_GET_HTAB_FD, &ghf);
> +    if (htab_fd < 0) {
> +        goto error_out;
> +    }
> +    memset(&hpte_buf, 0, sizeof(hpte_buf));
> +    /*
> +     * Read the hpte group
> +     */
> +    if (read(htab_fd, &hpte_buf, sizeof(hpte_buf)) < 0) {
> +        goto out_close;
> +    }
> +
> +    hpte_group_size = sizeof(unsigned long) * 2 * HPTES_PER_GROUP;
> +    token->hpte_group = g_malloc(hpte_group_size);
> +    token->free_hpte_group = 1;
> +    token->external = 1;
> +    memcpy(token->hpte_group, hpte_buf.hpte, hpte_group_size);
> +    close(htab_fd);
> +    return 0;
> +
> +out_close:
> +    close(htab_fd);
> +error_out:
> +    return -1;
> +}
> diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
> index 4ae7bf2..31df330 100644
> --- a/target-ppc/kvm_ppc.h
> +++ b/target-ppc/kvm_ppc.h
> @@ -12,6 +12,7 @@
> #define TYPE_HOST_POWERPC_CPU "host-" TYPE_POWERPC_CPU
> 
> void kvmppc_init(void);
> +struct ppc_hash64_hpte_token;
> 
> #ifdef CONFIG_KVM
> 
> @@ -38,10 +39,13 @@ uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift);
> #endif /* !CONFIG_USER_ONLY */
> int kvmppc_fixup_cpu(PowerPCCPU *cpu);
> bool kvmppc_has_cap_epr(void);
> +bool kvmppc_has_cap_htab_fd(void);
> int kvmppc_get_htab_fd(bool write);
> int kvmppc_save_htab(QEMUFile *f, int fd, size_t bufsize, int64_t max_ns);
> int kvmppc_load_htab_chunk(QEMUFile *f, int fd, uint32_t index,
>                            uint16_t n_valid, uint16_t n_invalid);
> +int kvm_ppc_hash64_start_access(PowerPCCPU *cpu, hwaddr hash,
> +                                struct ppc_hash64_hpte_token *token);
> 
> #else
> 
> @@ -164,6 +168,11 @@ static inline bool kvmppc_has_cap_epr(void)
>     return false;
> }
> 
> +static inline bool kvmppc_has_cap_htab_fd(void)
> +{
> +    return false;
> +}
> +
> static inline int kvmppc_get_htab_fd(bool write)
> {
>     return -1;
> @@ -181,6 +190,13 @@ static inline int kvmppc_load_htab_chunk(QEMUFile *f, int fd, uint32_t index,
>     abort();
> }
> 
> +static inline int kvm_ppc_hash64_start_access(PowerPCCPU *cpu,
> +                                        unsigned long pte_index,
> +                                        struct ppc_hash64_hpte_token *token)
> +{
> +    abort();
> +}
> +
> #endif
> 
> #ifndef CONFIG_KVM
> diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
> index 67fc1b5..aeb4593 100644
> --- a/target-ppc/mmu-hash64.c
> +++ b/target-ppc/mmu-hash64.c
> @@ -302,29 +302,73 @@ static int ppc_hash64_amr_prot(CPUPPCState *env, ppc_hash_pte64_t pte)
>     return prot;
> }
> 
> -static hwaddr ppc_hash64_pteg_search(CPUPPCState *env, hwaddr pteg_off,
> +struct ppc_hash64_hpte_token *ppc_hash64_start_access(PowerPCCPU *cpu,
> +                                                      unsigned long pte_index)

How about you also pass in the number of PTEs you want to access? Let's call it "pte_num" for now. Then if you only care about one PTE you can indicate so, otherwise it's clear that you want to access 8 PTEs beginning from the one you're pointing at.

> +{
> +    hwaddr pte_offset;
> +    struct ppc_hash64_hpte_token *token;

void *token = NULL;

if (kvmppc_uses_htab_fd(cpu)) {
    /* HTAB is controlled by KVM. Fetch the PTEG into a new buffer. */

    int hpte_group_size = sizeof(unsigned long) * 2 * pte_num;
    token = g_malloc(hpte_group_size);
    if (kvm_ppc_hash64_read_pteg(cpu, pte_index, token)) {
        free(token);
        return NULL;
    }
} else {
    /* HTAB is controlled by QEMU. Just point to the internally accessible PTEG. */
    hwaddr pte_offset;

    pte_offset = pte_index * HASH_PTE_SIZE_64;
    if (cpu->env.external_htab) {
        token = cpu->env.external_htab + pte_offset;
    } else {
        token = (uint8_t *) cpu->env.htab_base + pte_offset;
    }
}

return token;

This way it's more obvious which path the "normal code flow" would be. We also only clearly choose what to do depending on in-kernel HTAB or now. As a big plus we don't need a struct that we need to dynamically allocate even in the TCG case (malloc is slow).

You also want to cache kvmppc_uses_htab_fd(cpu) so it doesn't have to call an ioctl every time we try to read a PTE. I guess the best spot for it would be to put it into env.


> +
> +    token = g_malloc(sizeof(struct ppc_hash64_hpte_token));
> +
> +    pte_offset = pte_index * HASH_PTE_SIZE_64;
> +    if (!kvmppc_has_cap_htab_fd()) {
> +        token->free_hpte_group = 0;
> +        token->external = !!cpu->env.external_htab;
> +        if (token->external) {
> +            token->hpte_group = cpu->env.external_htab + pte_offset;
> +        } else {
> +            token->hpte_group = (uint8_t *) cpu->env.htab_base + pte_offset;
> +        }
> +        return token;
> +    }
> +
> +    if (!kvm_ppc_hash64_start_access(cpu, pte_index, token)) {
> +        return token;
> +    }
> +    free(token);
> +    return NULL;
> +}
> +
> +void ppc_hash64_stop_access(struct ppc_hash64_hpte_token *token)
> +{

Pass in cpu as well, then simply only do

if (kvmppc_uses_htab_fd(cpu)) {
    g_free(token);
}

> +    if (token->free_hpte_group) {
> +        free(token->hpte_group);
> +    }
> +    free(token);
> +    return;
> +}
> +
> +static hwaddr ppc_hash64_pteg_search(CPUPPCState *env, hwaddr hash,
>                                      bool secondary, target_ulong ptem,
>                                      ppc_hash_pte64_t *pte)
> {
> -    hwaddr pte_offset = pteg_off;
> -    target_ulong pte0, pte1;
>     int i;
> +    target_ulong pte0, pte1;
> +    unsigned long pte_index;
> +    struct ppc_hash64_hpte_token *token;
> 
> +    pte_index = (hash * HPTES_PER_GROUP) & env->htab_mask;
> +    token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
> +    if (!token) {
> +        return -1;
> +    }
>     for (i = 0; i < HPTES_PER_GROUP; i++) {
> -        pte0 = ppc_hash64_load_hpte0(env, pte_offset);
> -        pte1 = ppc_hash64_load_hpte1(env, pte_offset);
> +        pte0 = ppc_hash64_load_hpte0(token, i);
> +        pte1 = ppc_hash64_load_hpte1(token, i);
> 
>         if ((pte0 & HPTE64_V_VALID)
>             && (secondary == !!(pte0 & HPTE64_V_SECONDARY))
>             && HPTE64_V_COMPARE(pte0, ptem)) {
>             pte->pte0 = pte0;
>             pte->pte1 = pte1;
> -            return pte_offset;
> +            ppc_hash64_stop_access(token);
> +            return (pte_index + i) * HASH_PTE_SIZE_64;
>         }
> -
> -        pte_offset += HASH_PTE_SIZE_64;
>     }
> -
> +    ppc_hash64_stop_access(token);
> +    /*
> +     * We didn't find a valid entry.
> +     */
>     return -1;
> }
> 
> @@ -332,7 +376,7 @@ static hwaddr ppc_hash64_htab_lookup(CPUPPCState *env,
>                                      ppc_slb_t *slb, target_ulong eaddr,
>                                      ppc_hash_pte64_t *pte)
> {
> -    hwaddr pteg_off, pte_offset;
> +    hwaddr pte_offset;
>     hwaddr hash;
>     uint64_t vsid, epnshift, epnmask, epn, ptem;
> 
> @@ -367,8 +411,7 @@ static hwaddr ppc_hash64_htab_lookup(CPUPPCState *env,
>             " vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx
>             " hash=" TARGET_FMT_plx "\n",
>             env->htab_base, env->htab_mask, vsid, ptem,  hash);
> -    pteg_off = (hash * HASH_PTEG_SIZE_64) & env->htab_mask;
> -    pte_offset = ppc_hash64_pteg_search(env, pteg_off, 0, ptem, pte);
> +    pte_offset = ppc_hash64_pteg_search(env, hash, 0, ptem, pte);
> 
>     if (pte_offset == -1) {
>         /* Secondary PTEG lookup */
> @@ -377,8 +420,7 @@ static hwaddr ppc_hash64_htab_lookup(CPUPPCState *env,
>                 " hash=" TARGET_FMT_plx "\n", env->htab_base,
>                 env->htab_mask, vsid, ptem, ~hash);
> 
> -        pteg_off = (~hash * HASH_PTEG_SIZE_64) & env->htab_mask;
> -        pte_offset = ppc_hash64_pteg_search(env, pteg_off, 1, ptem, pte);
> +        pte_offset = ppc_hash64_pteg_search(env, ~hash, 1, ptem, pte);
>     }
> 
>     return pte_offset;
> diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h
> index 55f5a23..0b44321 100644
> --- a/target-ppc/mmu-hash64.h
> +++ b/target-ppc/mmu-hash64.h
> @@ -75,23 +75,36 @@ int ppc_hash64_handle_mmu_fault(CPUPPCState *env, target_ulong address, int rw,
> #define HPTE64_V_1TB_SEG        0x4000000000000000ULL
> #define HPTE64_V_VRMA_MASK      0x4001ffffff000000ULL
> 
> -static inline target_ulong ppc_hash64_load_hpte0(CPUPPCState *env,
> -                                                 hwaddr pte_offset)
> +
> +struct ppc_hash64_hpte_token {
> +    bool free_hpte_group;
> +    bool external;
> +    uint8_t *hpte_group;
> +};
> +
> +struct ppc_hash64_hpte_token *ppc_hash64_start_access(PowerPCCPU *cpu,
> +                                                      unsigned long pte_index);
> +void ppc_hash64_stop_access(struct ppc_hash64_hpte_token *token);
> +
> +static inline target_ulong ppc_hash64_load_hpte0(struct ppc_hash64_hpte_token *token,
> +                                                 int index)

If you pass in env or cpu in here as well you can do the decision whether to use ldq_p or ldq_phys based on cpu->env.external_htab. Whenever the HTAB is controlled by KVM, external_htab is true as well.


Alex

> {
> -    if (env->external_htab) {
> -        return  ldq_p(env->external_htab + pte_offset);
> +    index *= HASH_PTE_SIZE_64;
> +    if (token->external) {
> +        return  ldq_p(token->hpte_group + index);
>     } else {
> -        return ldq_phys(env->htab_base + pte_offset);
> +        return ldq_phys((uint64_t)(token->hpte_group + index));
>     }
> }
> 
> -static inline target_ulong ppc_hash64_load_hpte1(CPUPPCState *env,
> -                                                 hwaddr pte_offset)
> +static inline target_ulong ppc_hash64_load_hpte1(struct ppc_hash64_hpte_token *token,
> +                                                 int index)
> {
> -    if (env->external_htab) {
> -        return ldq_p(env->external_htab + pte_offset + HASH_PTE_SIZE_64/2);
> +    index = index * HASH_PTE_SIZE_64;
> +    if (token->external) {
> +        return  ldq_p(token->hpte_group + index + HASH_PTE_SIZE_64/2);
>     } else {
> -        return ldq_phys(env->htab_base + pte_offset + HASH_PTE_SIZE_64/2);
> +        return ldq_phys((uint64_t)(token->hpte_group + index + HASH_PTE_SIZE_64/2));
>     }
> }
> 
> -- 
> 1.8.1.2
>
Aneesh Kumar K.V Oct. 11, 2013, 4:58 p.m. UTC | #2
Alexander Graf <agraf@suse.de> writes:

> On 11.10.2013, at 13:13, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>
>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>> 
>> With kvm enabled, we store the hash page table information in the hypervisor.
>> Use ioctl to read the htab contents. Without this we get the below error when
>> trying to read the guest address
>> 
>> (gdb) x/10 do_fork
>> 0xc000000000098660 <do_fork>:   Cannot access memory at address 0xc000000000098660
>> (gdb)
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>> ---
>> Changes from V4:
>> * Rewrite to avoid two code paths for doing hash lookups

....

>> +
>> +    i = 0;
>> +    hpte = pte_index * HASH_PTE_SIZE_64;
>>     if (likely((flags & H_EXACT) == 0)) {
>>         pte_index &= ~7ULL;
>> -        hpte = pte_index * HASH_PTE_SIZE_64;
>> -        for (i = 0; ; ++i) {
>> +        token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
>> +        do {
>
> Why convert this into a while loop?

I am moving i = 0 outside the loop. Hence I found while () better than for (;;++i).

>
>>             if (i == 8) {
>> +                ppc_hash64_stop_access(token);
>>                 return H_PTEG_FULL;
>>             }
>> -            if ((ppc_hash64_load_hpte0(env, hpte) & HPTE64_V_VALID) == 0) {
>> +            if ((ppc_hash64_load_hpte0(token, i) & HPTE64_V_VALID) == 0) {
>>                 break;
>>             }
>> -            hpte += HASH_PTE_SIZE_64;
>> -        }
>> +        } while (i++);
>> +        ppc_hash64_stop_access(token);

....


>> +
>> +int kvm_ppc_hash64_start_access(PowerPCCPU *cpu, unsigned long pte_index,
>> +                                struct ppc_hash64_hpte_token *token)
>> +{
>> +    int htab_fd;
>> +    int hpte_group_size;
>> +    struct kvm_get_htab_fd ghf;
>> +    struct kvm_get_htab_buf {
>> +        struct kvm_get_htab_header header;
>> +        /*
>> +         * We required one extra byte for read
>> +         */
>> +        unsigned long hpte[(HPTES_PER_GROUP * 2) + 1];
>> +    } hpte_buf;;
>
> Double semicolon?

Will fix

>
>> +
>> +    ghf.flags = 0;
>> +    ghf.start_index = pte_index;
>> +    htab_fd = kvm_vm_ioctl(kvm_state, KVM_PPC_GET_HTAB_FD, &ghf);
>> +    if (htab_fd < 0) {
>> +        goto error_out;
>> +    }
>> +    memset(&hpte_buf, 0, sizeof(hpte_buf));

....

>> diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
>> index 67fc1b5..aeb4593 100644
>> --- a/target-ppc/mmu-hash64.c
>> +++ b/target-ppc/mmu-hash64.c
>> @@ -302,29 +302,73 @@ static int ppc_hash64_amr_prot(CPUPPCState *env, ppc_hash_pte64_t pte)
>>     return prot;
>> }
>> 
>> -static hwaddr ppc_hash64_pteg_search(CPUPPCState *env, hwaddr pteg_off,
>> +struct ppc_hash64_hpte_token *ppc_hash64_start_access(PowerPCCPU *cpu,
>> +                                                      unsigned long pte_index)
>
> How about you also pass in the number of PTEs you want to access?
> Let's call it "pte_num" for now. Then if you only care about one PTE
> you can indicate so, otherwise it's clear that you want to access 8
> PTEs beginning from the one you're pointing at.

So if we want to pass pte_num, then it can be any number: 1, 8, 10. That
would make the code complex, because now we need to make the buffer
passed to read() of variable size. Also I would need another allocation
for the return buffer. I can do tricks like making the token hold the
pointer to the actual buffer, skipping the header. But ppc_hash64_stop_access then
would have to know about the kvm htab read header, which I found not nice.
We can possibly update the function name to indicate that it will always
read an hpte group starting from the pte_index. Something like ppc64_start_hpteg_access()?

>
>> +{
>> +    hwaddr pte_offset;
>> +    struct ppc_hash64_hpte_token *token;
>
> void *token = NULL;
>
> if (kvmppc_uses_htab_fd(cpu)) {
>     /* HTAB is controlled by KVM. Fetch the PTEG into a new buffer. */
>
>     int hpte_group_size = sizeof(unsigned long) * 2 * pte_num;
>     token = g_malloc(hpte_group_size);
>     if (kvm_ppc_hash64_read_pteg(cpu, pte_index, token)) {

That is the tricky part: the read buffer needs to have a header at the
beginning. Maybe I can do kvm_ppc_hash64_stop_access(void *token) that
does the pointer math to get to the head of the token and frees it. Will try that.

>         free(token);
>         return NULL;
>     }
> } else {
>     /* HTAB is controlled by QEMU. Just point to the internally accessible PTEG. */
>     hwaddr pte_offset;
>
>     pte_offset = pte_index * HASH_PTE_SIZE_64;
>     if (cpu->env.external_htab) {
>         token = cpu->env.external_htab + pte_offset;
>     } else {
>         token = (uint8_t *) cpu->env.htab_base + pte_offset;
>     }
> }
>
> return token;
>
> This way it's more obvious which path the "normal code flow" would be. We also only clearly choose what to do depending on in-kernel HTAB or now. As a big plus we don't need a struct that we need to dynamically allocate even in the TCG case (malloc is slow).
>
> You also want to cache kvmppc_uses_htab_fd(cpu) so it doesn't have to
> call an ioctl every time we try to read a PTE. I guess the best spot
> for it would be to put it into env.

didn't get that. We already cache that value as 

bool kvmppc_has_cap_htab_fd(void)
{
    return cap_htab_fd;
}

Looking at the kernel capability check I do find an issue: with both
hv and pr loaded as modules, that would always return true. Now that is
not what we want here. Ideally we should have all these as VM ioctls and not
device ioctls. I had asked for that as a fixme in the HV PR patchset.


>
>
>> +
>> +    token = g_malloc(sizeof(struct ppc_hash64_hpte_token));
>> +
>> +    pte_offset = pte_index * HASH_PTE_SIZE_64;
>> +    if (!kvmppc_has_cap_htab_fd()) {
>> +        token->free_hpte_group = 0;
>> +        token->external = !!cpu->env.external_htab;
>> +        if (token->external) {
>> +            token->hpte_group = cpu->env.external_htab + pte_offset;
>> +        } else {
>> +            token->hpte_group = (uint8_t *) cpu->env.htab_base + pte_offset;
>> +        }
>> +        return token;
>> +    }
>> +
>> +    if (!kvm_ppc_hash64_start_access(cpu, pte_index, token)) {
>> +        return token;
>> +    }
>> +    free(token);
>> +    return NULL;
>> +}
>> +
>> +void ppc_hash64_stop_access(struct ppc_hash64_hpte_token *token)
>> +{
>
> Pass in cpu as well, then simply only do
>
> if (kvmppc_uses_htab_fd(cpu)) {
>     g_free(token);
> }
>
>> +    if (token->free_hpte_group) {
>> +        free(token->hpte_group);
>> +    }
>> +    free(token);
>> +    return;
>> +}


......


>> +
>> +struct ppc_hash64_hpte_token {
>> +    bool free_hpte_group;
>> +    bool external;
>> +    uint8_t *hpte_group;
>> +};
>> +
>> +struct ppc_hash64_hpte_token *ppc_hash64_start_access(PowerPCCPU *cpu,
>> +                                                      unsigned long pte_index);
>> +void ppc_hash64_stop_access(struct ppc_hash64_hpte_token *token);
>> +
>> +static inline target_ulong ppc_hash64_load_hpte0(struct ppc_hash64_hpte_token *token,
>> +                                                 int index)
>
> If you pass in env or cpu in here as well you can do the decision
> whether to use ldq_p or ldq_phys based on
> cpu->env.external_htab. Whenever the HTAB is controlled by KVM,
> external_htab is true as well.

Will do

>
>
> Alex
>
>> {
>> -    if (env->external_htab) {
>> -        return  ldq_p(env->external_htab + pte_offset);
>> +    index *= HASH_PTE_SIZE_64;
>> +    if (token->external) {
>> +        return  ldq_p(token->hpte_group + index);
>>     } else {
>> -        return ldq_phys(env->htab_base + pte_offset);
>> +        return ldq_phys((uint64_t)(token->hpte_group + index));
>>     }
>> }


-aneesh
Alexander Graf Oct. 27, 2013, 6:29 p.m. UTC | #3
On 11.10.2013, at 09:58, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:

> Alexander Graf <agraf@suse.de> writes:
> 
>> On 11.10.2013, at 13:13, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>> 
>>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>>> 
>>> With kvm enabled, we store the hash page table information in the hypervisor.
>>> Use ioctl to read the htab contents. Without this we get the below error when
>>> trying to read the guest address
>>> 
>>> (gdb) x/10 do_fork
>>> 0xc000000000098660 <do_fork>:   Cannot access memory at address 0xc000000000098660
>>> (gdb)
>>> 
>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>>> ---
>>> Changes from V4:
>>> * Rewrite to avoid two code paths for doing hash lookups
> 
> ....
> 
>>> +
>>> +    i = 0;
>>> +    hpte = pte_index * HASH_PTE_SIZE_64;
>>>    if (likely((flags & H_EXACT) == 0)) {
>>>        pte_index &= ~7ULL;
>>> -        hpte = pte_index * HASH_PTE_SIZE_64;
>>> -        for (i = 0; ; ++i) {
>>> +        token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
>>> +        do {
>> 
>> Why convert this into a while loop?
> 
> I am moving i = 0 outside the loop. Hence found while () better than for(;;++i) 

Outside of what loop? You're only moving it outside of the if().

> 
>> 
>>>            if (i == 8) {
>>> +                ppc_hash64_stop_access(token);
>>>                return H_PTEG_FULL;
>>>            }
>>> -            if ((ppc_hash64_load_hpte0(env, hpte) & HPTE64_V_VALID) == 0) {
>>> +            if ((ppc_hash64_load_hpte0(token, i) & HPTE64_V_VALID) == 0) {
>>>                break;
>>>            }
>>> -            hpte += HASH_PTE_SIZE_64;
>>> -        }
>>> +        } while (i++);
>>> +        ppc_hash64_stop_access(token);
> 
> ....
> 
> 
>>> +
>>> +int kvm_ppc_hash64_start_access(PowerPCCPU *cpu, unsigned long pte_index,
>>> +                                struct ppc_hash64_hpte_token *token)
>>> +{
>>> +    int htab_fd;
>>> +    int hpte_group_size;
>>> +    struct kvm_get_htab_fd ghf;
>>> +    struct kvm_get_htab_buf {
>>> +        struct kvm_get_htab_header header;
>>> +        /*
>>> +         * We required one extra byte for read
>>> +         */
>>> +        unsigned long hpte[(HPTES_PER_GROUP * 2) + 1];
>>> +    } hpte_buf;;
>> 
>> Double semicolon?
> 
> Will fix
> 
>> 
>>> +
>>> +    ghf.flags = 0;
>>> +    ghf.start_index = pte_index;
>>> +    htab_fd = kvm_vm_ioctl(kvm_state, KVM_PPC_GET_HTAB_FD, &ghf);
>>> +    if (htab_fd < 0) {
>>> +        goto error_out;
>>> +    }
>>> +    memset(&hpte_buf, 0, sizeof(hpte_buf));
> 
> ....
> 
>>> diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
>>> index 67fc1b5..aeb4593 100644
>>> --- a/target-ppc/mmu-hash64.c
>>> +++ b/target-ppc/mmu-hash64.c
>>> @@ -302,29 +302,73 @@ static int ppc_hash64_amr_prot(CPUPPCState *env, ppc_hash_pte64_t pte)
>>>    return prot;
>>> }
>>> 
>>> -static hwaddr ppc_hash64_pteg_search(CPUPPCState *env, hwaddr pteg_off,
>>> +struct ppc_hash64_hpte_token *ppc_hash64_start_access(PowerPCCPU *cpu,
>>> +                                                      unsigned long pte_index)
>> 
>> How about you also pass in the number of PTEs you want to access?
>> Let's call it "pte_num" for now. Then if you only care about one PTE
>> you can indicate so, otherwise it's clear that you want to access 8
>> PTEs beginning from the one you're pointing at.
> 
> So if we want to pass pte_num, then i can be any number, 1, 8, 10. That
> would make the code complex, because now we need to make the buffer
> passed to read() of variable size.Also i would need another allocation
> for the return buffer. I can do tricks like make the token handle the
> pointer to actual buffer skipping the header. But ppc_hash64=stop_acess then
> would have to know about kvm htab read header which i found not nice.
> We can possibly update the function name to indicate that it will always
> read hptegroup from the pte_index. Something like ppc64_start_hpteg_access() ?. 

Just abort() if pte_num is not 1 or 8.

> 
>> 
>>> +{
>>> +    hwaddr pte_offset;
>>> +    struct ppc_hash64_hpte_token *token;
>> 
>> void *token = NULL;
>> 
>> if (kvmppc_uses_htab_fd(cpu)) {
>>    /* HTAB is controlled by KVM. Fetch the PTEG into a new buffer. */
>> 
>>    int hpte_group_size = sizeof(unsigned long) * 2 * pte_num;
>>    token = g_malloc(hpte_group_size);
>>    if (kvm_ppc_hash64_read_pteg(cpu, pte_index, token)) {
> 
> That is the tricky part, the read buffer need to have a header in the
> beginning. May be i can do kvm_ppc_hash64_stop_access(void *token) that
> does the pointer match gets to the head of token and free. Will try that.
> 
>>        free(token);
>>        return NULL;
>>    }
>> } else {
>>    /* HTAB is controlled by QEMU. Just point to the internally accessible PTEG. */
>>    hwaddr pte_offset;
>> 
>>    pte_offset = pte_index * HASH_PTE_SIZE_64;
>>    if (cpu->env.external_htab) {
>>        token = cpu->env.external_htab + pte_offset;
>>    } else {
>>        token = (uint8_t *) cpu->env.htab_base + pte_offset;
>>    }
>> }
>> 
>> return token;
>> 
>> This way it's more obvious which path the "normal code flow" would be. We also only clearly choose what to do depending on in-kernel HTAB or now. As a big plus we don't need a struct that we need to dynamically allocate even in the TCG case (malloc is slow).
>> 
>> You also want to cache kvmppc_uses_htab_fd(cpu) so it doesn't have to
>> call an ioctl every time we try to read a PTE. I guess the best spot
>> for it would be to put it into env.
> 
> didn't get that. We already cache that value as 
> 
> bool kvmppc_has_cap_htab_fd(void)
> {
>    return cap_htab_fd;
> }
> 
> Looking at the kernel capability check I do find an issue, So with both
> hv and pr loaded as module, that would always return true. Now that is
> not we want here. Ideally we should have all these as VM ioctl and not
> device ioctl. I had asked that as a fixme in the HV PR patchset.

As you've realized we only cache the ability for an htab fd, not whether we are actually using one. That needs to be a separate variable.


Alex
Aneesh Kumar K.V Nov. 6, 2013, 5:08 p.m. UTC | #4
Alexander Graf <agraf@suse.de> writes:

> On 11.10.2013, at 09:58, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>
>> Alexander Graf <agraf@suse.de> writes:
>> 
>>> On 11.10.2013, at 13:13, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>>> 
>>>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>>>> 
>>>> With kvm enabled, we store the hash page table information in the hypervisor.
>>>> Use ioctl to read the htab contents. Without this we get the below error when
>>>> trying to read the guest address
>>>> 
>>>> (gdb) x/10 do_fork
>>>> 0xc000000000098660 <do_fork>:   Cannot access memory at address 0xc000000000098660
>>>> (gdb)
>>>> 
>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>>>> ---
>>>> Changes from V4:
>>>> * Rewrite to avoid two code paths for doing hash lookups
>> 
>> ....
>> 
>>>> +
>>>> +    i = 0;
>>>> +    hpte = pte_index * HASH_PTE_SIZE_64;
>>>>    if (likely((flags & H_EXACT) == 0)) {
>>>>        pte_index &= ~7ULL;
>>>> -        hpte = pte_index * HASH_PTE_SIZE_64;
>>>> -        for (i = 0; ; ++i) {
>>>> +        token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
>>>> +        do {
>>> 
>>> Why convert this into a while loop?
>> 
>> I am moving i = 0 outside the loop. Hence found while () better than for(;;++i) 
>
> Outside of what loop? You're only moving it outside of the if().

I meant moving i = 0 outside of the for loop's initialization part.


here is how it would look like with for

i = 0;
if ()
    for(;;i++) {
      if () {
         break;
     }
   }
} else {

}
hpte += i * HASH_PTE_SIZE_64;



I found do { } while (i++)  better than for loop


>
>> 
>>> 
>>>>            if (i == 8) {
>>>> +                ppc_hash64_stop_access(token);
>>>>                return H_PTEG_FULL;
>>>>            }
>>>> -            if ((ppc_hash64_load_hpte0(env, hpte) & HPTE64_V_VALID) == 0) {
>>>> +            if ((ppc_hash64_load_hpte0(token, i) & HPTE64_V_VALID) == 0) {
>>>>                break;
>>>>            }
>>>> -            hpte += HASH_PTE_SIZE_64;
>>>> -        }
>>>> +        } while (i++);
>>>> +        ppc_hash64_stop_access(token);
>> 
>> ....

....

>>> You also want to cache kvmppc_uses_htab_fd(cpu) so it doesn't have to
>>> call an ioctl every time we try to read a PTE. I guess the best spot
>>> for it would be to put it into env.
>> 
>> didn't get that. We already cache that value as 
>> 
>> bool kvmppc_has_cap_htab_fd(void)
>> {
>>    return cap_htab_fd;
>> }
>> 
>> Looking at the kernel capability check I do find an issue, So with both
>> hv and pr loaded as module, that would always return true. Now that is
>> not we want here. Ideally we should have all these as VM ioctl and not
>> device ioctl. I had asked that as a fixme in the HV PR patchset.
>
> As you've realized we only cache the ability for an htab fd, not whether we are actually using one. That needs to be a separate variable.
>

As mentioned earlier, we can't really cache the htab fd, because the
interface doesn't support random reads of the hash page table.


-aneesh
Alexander Graf Nov. 6, 2013, 5:18 p.m. UTC | #5
On 06.11.2013, at 18:08, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:

> Alexander Graf <agraf@suse.de> writes:
> 
>> On 11.10.2013, at 09:58, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>> 
>>> Alexander Graf <agraf@suse.de> writes:
>>> 
>>>> On 11.10.2013, at 13:13, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>>>> 
>>>>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>>>>> 
>>>>> With kvm enabled, we store the hash page table information in the hypervisor.
>>>>> Use ioctl to read the htab contents. Without this we get the below error when
>>>>> trying to read the guest address
>>>>> 
>>>>> (gdb) x/10 do_fork
>>>>> 0xc000000000098660 <do_fork>:   Cannot access memory at address 0xc000000000098660
>>>>> (gdb)
>>>>> 
>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>>>>> ---
>>>>> Changes from V4:
>>>>> * Rewrite to avoid two code paths for doing hash lookups
>>> 
>>> ....
>>> 
>>>>> +
>>>>> +    i = 0;
>>>>> +    hpte = pte_index * HASH_PTE_SIZE_64;
>>>>>   if (likely((flags & H_EXACT) == 0)) {
>>>>>       pte_index &= ~7ULL;
>>>>> -        hpte = pte_index * HASH_PTE_SIZE_64;
>>>>> -        for (i = 0; ; ++i) {
>>>>> +        token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
>>>>> +        do {
>>>> 
>>>> Why convert this into a while loop?
>>> 
>>> I am moving i = 0 outside the loop. Hence found while () better than for(;;++i) 
>> 
>> Outside of what loop? You're only moving it outside of the if().
> 
> I meant moving i = 0 outside of for loop initialization part
> 
> 
> here is how it would look like with for
> 
> i = 0;
> if ()
>    for(;;i++) {
>      if () {
>         break;
>     }
>   }
> } else {
> 
> }
> hpte += i * HASH_PTE_SIZE_64;
> 
> 
> 
> I found do { } while (i++)  better than a for loop

Just make it a different, separate variable. That way it's obvious that we're using it beyond the scope of the loop.

> 
> 
>> 
>>> 
>>>> 
>>>>>           if (i == 8) {
>>>>> +                ppc_hash64_stop_access(token);
>>>>>               return H_PTEG_FULL;
>>>>>           }
>>>>> -            if ((ppc_hash64_load_hpte0(env, hpte) & HPTE64_V_VALID) == 0) {
>>>>> +            if ((ppc_hash64_load_hpte0(token, i) & HPTE64_V_VALID) == 0) {
>>>>>               break;
>>>>>           }
>>>>> -            hpte += HASH_PTE_SIZE_64;
>>>>> -        }
>>>>> +        } while (i++);
>>>>> +        ppc_hash64_stop_access(token);
>>> 
>>> ....
> 
> ....
> 
>>>> You also want to cache kvmppc_uses_htab_fd(cpu) so it doesn't have to
>>>> call an ioctl every time we try to read a PTE. I guess the best spot
>>>> for it would be to put it into env.
>>> 
>>> didn't get that. We already cache that value as 
>>> 
>>> bool kvmppc_has_cap_htab_fd(void)
>>> {
>>>   return cap_htab_fd;
>>> }
>>> 
>>> Looking at the kernel capability check I do find an issue. So with both
>>> hv and pr loaded as modules, that would always return true. Now that is
>>> not what we want here. Ideally we should have all these as VM ioctls and not
>>> device ioctls. I had asked that as a fixme in the HV PR patchset.
>> 
>> As you've realized we only cache the ability for an htab fd, not whether we are actually using one. That needs to be a separate variable.
>> 
> 
> As mentioned earlier we can't really cache the htab fd, because the
> interface doesn't support random reads of the hash page table.

We don't need to cache the fd itself, only the fact that the cap is actually available and not just potentially available.


Alex
diff mbox

Patch

diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index f10ba8a..96348e3 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -52,6 +52,7 @@  static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     target_ulong raddr;
     target_ulong i;
     hwaddr hpte;
+    struct ppc_hash64_hpte_token *token;
 
     /* only handle 4k and 16M pages for now */
     if (pteh & HPTE64_V_LARGE) {
@@ -94,25 +95,32 @@  static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     if ((pte_index * HASH_PTE_SIZE_64) & ~env->htab_mask) {
         return H_PARAMETER;
     }
+
+    i = 0;
+    hpte = pte_index * HASH_PTE_SIZE_64;
     if (likely((flags & H_EXACT) == 0)) {
         pte_index &= ~7ULL;
-        hpte = pte_index * HASH_PTE_SIZE_64;
-        for (i = 0; ; ++i) {
+        token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
+        do {
             if (i == 8) {
+                ppc_hash64_stop_access(token);
                 return H_PTEG_FULL;
             }
-            if ((ppc_hash64_load_hpte0(env, hpte) & HPTE64_V_VALID) == 0) {
+            if ((ppc_hash64_load_hpte0(token, i) & HPTE64_V_VALID) == 0) {
                 break;
             }
-            hpte += HASH_PTE_SIZE_64;
-        }
+        } while (i++);
+        ppc_hash64_stop_access(token);
     } else {
-        i = 0;
-        hpte = pte_index * HASH_PTE_SIZE_64;
-        if (ppc_hash64_load_hpte0(env, hpte) & HPTE64_V_VALID) {
+        token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
+        if (ppc_hash64_load_hpte0(token, 0) & HPTE64_V_VALID) {
+            ppc_hash64_stop_access(token);
             return H_PTEG_FULL;
         }
+        ppc_hash64_stop_access(token);
     }
+    hpte += i * HASH_PTE_SIZE_64;
+
     ppc_hash64_store_hpte1(env, hpte, ptel);
     /* eieio();  FIXME: need some sort of barrier for smp? */
     ppc_hash64_store_hpte0(env, hpte, pteh | HPTE64_V_HPTE_DIRTY);
@@ -135,15 +143,16 @@  static RemoveResult remove_hpte(CPUPPCState *env, target_ulong ptex,
 {
     hwaddr hpte;
     target_ulong v, r, rb;
+    struct ppc_hash64_hpte_token  *token;
 
     if ((ptex * HASH_PTE_SIZE_64) & ~env->htab_mask) {
         return REMOVE_PARM;
     }
 
-    hpte = ptex * HASH_PTE_SIZE_64;
-
-    v = ppc_hash64_load_hpte0(env, hpte);
-    r = ppc_hash64_load_hpte1(env, hpte);
+    token = ppc_hash64_start_access(ppc_env_get_cpu(env), ptex);
+    v = ppc_hash64_load_hpte0(token, 0);
+    r = ppc_hash64_load_hpte1(token, 0);
+    ppc_hash64_stop_access(token);
 
     if ((v & HPTE64_V_VALID) == 0 ||
         ((flags & H_AVPN) && (v & ~0x7fULL) != avpn) ||
@@ -152,6 +161,7 @@  static RemoveResult remove_hpte(CPUPPCState *env, target_ulong ptex,
     }
     *vp = v;
     *rp = r;
+    hpte = ptex * HASH_PTE_SIZE_64;
     ppc_hash64_store_hpte0(env, hpte, HPTE64_V_HPTE_DIRTY);
     rb = compute_tlbie_rb(v, r, ptex);
     ppc_tlb_invalidate_one(env, rb);
@@ -261,15 +271,16 @@  static target_ulong h_protect(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     target_ulong avpn = args[2];
     hwaddr hpte;
     target_ulong v, r, rb;
+    struct ppc_hash64_hpte_token *token;
 
     if ((pte_index * HASH_PTE_SIZE_64) & ~env->htab_mask) {
         return H_PARAMETER;
     }
 
-    hpte = pte_index * HASH_PTE_SIZE_64;
-
-    v = ppc_hash64_load_hpte0(env, hpte);
-    r = ppc_hash64_load_hpte1(env, hpte);
+    token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
+    v = ppc_hash64_load_hpte0(token, 0);
+    r = ppc_hash64_load_hpte1(token, 0);
+    ppc_hash64_stop_access(token);
 
     if ((v & HPTE64_V_VALID) == 0 ||
         ((flags & H_AVPN) && (v & ~0x7fULL) != avpn)) {
@@ -282,6 +293,7 @@  static target_ulong h_protect(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     r |= (flags << 48) & HPTE64_R_KEY_HI;
     r |= flags & (HPTE64_R_PP | HPTE64_R_N | HPTE64_R_KEY_LO);
     rb = compute_tlbie_rb(v, r, pte_index);
+    hpte = pte_index * HASH_PTE_SIZE_64;
     ppc_hash64_store_hpte0(env, hpte, (v & ~HPTE64_V_VALID) | HPTE64_V_HPTE_DIRTY);
     ppc_tlb_invalidate_one(env, rb);
     ppc_hash64_store_hpte1(env, hpte, r);
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index ac70efbf..3ad3283 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -1783,6 +1783,11 @@  bool kvmppc_has_cap_epr(void)
     return cap_epr;
 }
 
+bool kvmppc_has_cap_htab_fd(void)
+{
+    return cap_htab_fd;
+}
+
 static int kvm_ppc_register_host_cpu_type(void)
 {
     TypeInfo type_info = {
@@ -1888,3 +1893,45 @@  int kvm_arch_on_sigbus(int code, void *addr)
 void kvm_arch_init_irq_routing(KVMState *s)
 {
 }
+
+int kvm_ppc_hash64_start_access(PowerPCCPU *cpu, unsigned long pte_index,
+                                struct ppc_hash64_hpte_token *token)
+{
+    int htab_fd;
+    int hpte_group_size;
+    struct kvm_get_htab_fd ghf;
+    struct kvm_get_htab_buf {
+        struct kvm_get_htab_header header;
+        /*
+         * We required one extra byte for read
+         */
+        unsigned long hpte[(HPTES_PER_GROUP * 2) + 1];
+    } hpte_buf;;
+
+    ghf.flags = 0;
+    ghf.start_index = pte_index;
+    htab_fd = kvm_vm_ioctl(kvm_state, KVM_PPC_GET_HTAB_FD, &ghf);
+    if (htab_fd < 0) {
+        goto error_out;
+    }
+    memset(&hpte_buf, 0, sizeof(hpte_buf));
+    /*
+     * Read the hpte group
+     */
+    if (read(htab_fd, &hpte_buf, sizeof(hpte_buf)) < 0) {
+        goto out_close;
+    }
+
+    hpte_group_size = sizeof(unsigned long) * 2 * HPTES_PER_GROUP;
+    token->hpte_group = g_malloc(hpte_group_size);
+    token->free_hpte_group = 1;
+    token->external = 1;
+    memcpy(token->hpte_group, hpte_buf.hpte, hpte_group_size);
+    close(htab_fd);
+    return 0;
+
+out_close:
+    close(htab_fd);
+error_out:
+    return -1;
+}
diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
index 4ae7bf2..31df330 100644
--- a/target-ppc/kvm_ppc.h
+++ b/target-ppc/kvm_ppc.h
@@ -12,6 +12,7 @@ 
 #define TYPE_HOST_POWERPC_CPU "host-" TYPE_POWERPC_CPU
 
 void kvmppc_init(void);
+struct ppc_hash64_hpte_token;
 
 #ifdef CONFIG_KVM
 
@@ -38,10 +39,13 @@  uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift);
 #endif /* !CONFIG_USER_ONLY */
 int kvmppc_fixup_cpu(PowerPCCPU *cpu);
 bool kvmppc_has_cap_epr(void);
+bool kvmppc_has_cap_htab_fd(void);
 int kvmppc_get_htab_fd(bool write);
 int kvmppc_save_htab(QEMUFile *f, int fd, size_t bufsize, int64_t max_ns);
 int kvmppc_load_htab_chunk(QEMUFile *f, int fd, uint32_t index,
                            uint16_t n_valid, uint16_t n_invalid);
+int kvm_ppc_hash64_start_access(PowerPCCPU *cpu, hwaddr hash,
+                                struct ppc_hash64_hpte_token *token);
 
 #else
 
@@ -164,6 +168,11 @@  static inline bool kvmppc_has_cap_epr(void)
     return false;
 }
 
+static inline bool kvmppc_has_cap_htab_fd(void)
+{
+    return false;
+}
+
 static inline int kvmppc_get_htab_fd(bool write)
 {
     return -1;
@@ -181,6 +190,13 @@  static inline int kvmppc_load_htab_chunk(QEMUFile *f, int fd, uint32_t index,
     abort();
 }
 
+static inline int kvm_ppc_hash64_start_access(PowerPCCPU *cpu,
+                                        unsigned long pte_index,
+                                        struct ppc_hash64_hpte_token *token)
+{
+    abort();
+}
+
 #endif
 
 #ifndef CONFIG_KVM
diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
index 67fc1b5..aeb4593 100644
--- a/target-ppc/mmu-hash64.c
+++ b/target-ppc/mmu-hash64.c
@@ -302,29 +302,73 @@  static int ppc_hash64_amr_prot(CPUPPCState *env, ppc_hash_pte64_t pte)
     return prot;
 }
 
-static hwaddr ppc_hash64_pteg_search(CPUPPCState *env, hwaddr pteg_off,
+struct ppc_hash64_hpte_token *ppc_hash64_start_access(PowerPCCPU *cpu,
+                                                      unsigned long pte_index)
+{
+    hwaddr pte_offset;
+    struct ppc_hash64_hpte_token *token;
+
+    token = g_malloc(sizeof(struct ppc_hash64_hpte_token));
+
+    pte_offset = pte_index * HASH_PTE_SIZE_64;
+    if (!kvmppc_has_cap_htab_fd()) {
+        token->free_hpte_group = 0;
+        token->external = !!cpu->env.external_htab;
+        if (token->external) {
+            token->hpte_group = cpu->env.external_htab + pte_offset;
+        } else {
+            token->hpte_group = (uint8_t *) cpu->env.htab_base + pte_offset;
+        }
+        return token;
+    }
+
+    if (!kvm_ppc_hash64_start_access(cpu, pte_index, token)) {
+        return token;
+    }
+    free(token);
+    return NULL;
+}
+
+void ppc_hash64_stop_access(struct ppc_hash64_hpte_token *token)
+{
+    if (token->free_hpte_group) {
+        free(token->hpte_group);
+    }
+    free(token);
+    return;
+}
+
+static hwaddr ppc_hash64_pteg_search(CPUPPCState *env, hwaddr hash,
                                      bool secondary, target_ulong ptem,
                                      ppc_hash_pte64_t *pte)
 {
-    hwaddr pte_offset = pteg_off;
-    target_ulong pte0, pte1;
     int i;
+    target_ulong pte0, pte1;
+    unsigned long pte_index;
+    struct ppc_hash64_hpte_token *token;
 
+    pte_index = (hash * HPTES_PER_GROUP) & env->htab_mask;
+    token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
+    if (!token) {
+        return -1;
+    }
     for (i = 0; i < HPTES_PER_GROUP; i++) {
-        pte0 = ppc_hash64_load_hpte0(env, pte_offset);
-        pte1 = ppc_hash64_load_hpte1(env, pte_offset);
+        pte0 = ppc_hash64_load_hpte0(token, i);
+        pte1 = ppc_hash64_load_hpte1(token, i);
 
         if ((pte0 & HPTE64_V_VALID)
             && (secondary == !!(pte0 & HPTE64_V_SECONDARY))
             && HPTE64_V_COMPARE(pte0, ptem)) {
             pte->pte0 = pte0;
             pte->pte1 = pte1;
-            return pte_offset;
+            ppc_hash64_stop_access(token);
+            return (pte_index + i) * HASH_PTE_SIZE_64;
         }
-
-        pte_offset += HASH_PTE_SIZE_64;
     }
-
+    ppc_hash64_stop_access(token);
+    /*
+     * We didn't find a valid entry.
+     */
     return -1;
 }
 
@@ -332,7 +376,7 @@  static hwaddr ppc_hash64_htab_lookup(CPUPPCState *env,
                                      ppc_slb_t *slb, target_ulong eaddr,
                                      ppc_hash_pte64_t *pte)
 {
-    hwaddr pteg_off, pte_offset;
+    hwaddr pte_offset;
     hwaddr hash;
     uint64_t vsid, epnshift, epnmask, epn, ptem;
 
@@ -367,8 +411,7 @@  static hwaddr ppc_hash64_htab_lookup(CPUPPCState *env,
             " vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx
             " hash=" TARGET_FMT_plx "\n",
             env->htab_base, env->htab_mask, vsid, ptem,  hash);
-    pteg_off = (hash * HASH_PTEG_SIZE_64) & env->htab_mask;
-    pte_offset = ppc_hash64_pteg_search(env, pteg_off, 0, ptem, pte);
+    pte_offset = ppc_hash64_pteg_search(env, hash, 0, ptem, pte);
 
     if (pte_offset == -1) {
         /* Secondary PTEG lookup */
@@ -377,8 +420,7 @@  static hwaddr ppc_hash64_htab_lookup(CPUPPCState *env,
                 " hash=" TARGET_FMT_plx "\n", env->htab_base,
                 env->htab_mask, vsid, ptem, ~hash);
 
-        pteg_off = (~hash * HASH_PTEG_SIZE_64) & env->htab_mask;
-        pte_offset = ppc_hash64_pteg_search(env, pteg_off, 1, ptem, pte);
+        pte_offset = ppc_hash64_pteg_search(env, ~hash, 1, ptem, pte);
     }
 
     return pte_offset;
diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h
index 55f5a23..0b44321 100644
--- a/target-ppc/mmu-hash64.h
+++ b/target-ppc/mmu-hash64.h
@@ -75,23 +75,36 @@  int ppc_hash64_handle_mmu_fault(CPUPPCState *env, target_ulong address, int rw,
 #define HPTE64_V_1TB_SEG        0x4000000000000000ULL
 #define HPTE64_V_VRMA_MASK      0x4001ffffff000000ULL
 
-static inline target_ulong ppc_hash64_load_hpte0(CPUPPCState *env,
-                                                 hwaddr pte_offset)
+
+struct ppc_hash64_hpte_token {
+    bool free_hpte_group;
+    bool external;
+    uint8_t *hpte_group;
+};
+
+struct ppc_hash64_hpte_token *ppc_hash64_start_access(PowerPCCPU *cpu,
+                                                      unsigned long pte_index);
+void ppc_hash64_stop_access(struct ppc_hash64_hpte_token *token);
+
+static inline target_ulong ppc_hash64_load_hpte0(struct ppc_hash64_hpte_token *token,
+                                                 int index)
 {
-    if (env->external_htab) {
-        return  ldq_p(env->external_htab + pte_offset);
+    index *= HASH_PTE_SIZE_64;
+    if (token->external) {
+        return  ldq_p(token->hpte_group + index);
     } else {
-        return ldq_phys(env->htab_base + pte_offset);
+        return ldq_phys((uint64_t)(token->hpte_group + index));
     }
 }
 
-static inline target_ulong ppc_hash64_load_hpte1(CPUPPCState *env,
-                                                 hwaddr pte_offset)
+static inline target_ulong ppc_hash64_load_hpte1(struct ppc_hash64_hpte_token *token,
+                                                 int index)
 {
-    if (env->external_htab) {
-        return ldq_p(env->external_htab + pte_offset + HASH_PTE_SIZE_64/2);
+    index = index * HASH_PTE_SIZE_64;
+    if (token->external) {
+        return  ldq_p(token->hpte_group + index + HASH_PTE_SIZE_64/2);
     } else {
-        return ldq_phys(env->htab_base + pte_offset + HASH_PTE_SIZE_64/2);
+        return ldq_phys((uint64_t)(token->hpte_group + index + HASH_PTE_SIZE_64/2));
     }
 }