
[V7,3/3] target-ppc: Fix page table lookup with kvm enabled

Message ID 1383834662-12473-3-git-send-email-aneesh.kumar@linux.vnet.ibm.com
State New

Commit Message

Aneesh Kumar K.V Nov. 7, 2013, 2:31 p.m. UTC
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>

With kvm enabled, we store the hash page table information in the hypervisor.
Use ioctl to read the htab contents. Without this we get the below error when
trying to read the guest address

 (gdb) x/10 do_fork
 0xc000000000098660 <do_fork>:   Cannot access memory at address 0xc000000000098660
 (gdb)

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
Changes from V6:
* drop htab_fd argument and use global variable kvmppc_kern_htab instead

 hw/ppc/spapr.c          |  1 +
 hw/ppc/spapr_hcall.c    | 50 +++++++++++++++++++------------
 target-ppc/kvm.c        | 53 +++++++++++++++++++++++++++++++++
 target-ppc/kvm_ppc.h    | 19 ++++++++++++
 target-ppc/mmu-hash64.c | 78 ++++++++++++++++++++++++++++++++++++++++---------
 target-ppc/mmu-hash64.h | 23 ++++++++++-----
 6 files changed, 183 insertions(+), 41 deletions(-)
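
The heart of the change is a token-based access protocol: rather than loading HPTE words directly from a pte_offset, callers now bracket their loads with start/stop calls so that the KVM case can fetch the whole PTEG through the htab fd. Condensed from the hunks below, the calling pattern is:

    token = ppc_hash64_start_access(cpu, pte_index); /* KVM: read PTEG via fd */
    pte0 = ppc_hash64_load_hpte0(env, token, i);     /* i-th HPTE of the group */
    pte1 = ppc_hash64_load_hpte1(env, token, i);
    ppc_hash64_stop_access(token);                   /* KVM: free PTEG buffer */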

Comments

Alexander Graf Dec. 18, 2013, 9:49 a.m. UTC | #1
On 07.11.2013, at 15:31, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:

> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
> 
> ............
> 
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index f10ba8a..f9ea691 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -50,8 +50,9 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>     target_ulong ptel = args[3];
>     target_ulong page_shift = 12;
>     target_ulong raddr;
> -    target_ulong i;
> +    target_ulong index;
>     hwaddr hpte;
> +    void *token;
> 
>     /* only handle 4k and 16M pages for now */
>     if (pteh & HPTE64_V_LARGE) {
> @@ -94,30 +95,37 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr,
>     if ((pte_index * HASH_PTE_SIZE_64) & ~env->htab_mask) {
>         return H_PARAMETER;
>     }
> +
> +    index = 0;
> +    hpte = pte_index * HASH_PTE_SIZE_64;
>     if (likely((flags & H_EXACT) == 0)) {
>         pte_index &= ~7ULL;
> -        hpte = pte_index * HASH_PTE_SIZE_64;
> -        for (i = 0; ; ++i) {
> -            if (i == 8) {
> +        token = ppc_hash64_start_access(cpu, pte_index);
> +        do {
> +            if (index == 8) {
> +                ppc_hash64_stop_access(token);
>                 return H_PTEG_FULL;
>             }
> -            if ((ppc_hash64_load_hpte0(env, hpte) & HPTE64_V_VALID) == 0) {
> +            if ((ppc_hash64_load_hpte0(env, token, index) & HPTE64_V_VALID) == 0) {
>                 break;
>             }
> -            hpte += HASH_PTE_SIZE_64;
> -        }
> +        } while (index++);
> +        ppc_hash64_stop_access(token);
>     } else {
> -        i = 0;
> -        hpte = pte_index * HASH_PTE_SIZE_64;
> -        if (ppc_hash64_load_hpte0(env, hpte) & HPTE64_V_VALID) {
> +        token = ppc_hash64_start_access(cpu, pte_index);
> +        if (ppc_hash64_load_hpte0(env, token, 0) & HPTE64_V_VALID) {
> +            ppc_hash64_stop_access(token);
>             return H_PTEG_FULL;
>         }
> +        ppc_hash64_stop_access(token);
>     }
> +    hpte += index * HASH_PTE_SIZE_64;
> +
>     ppc_hash64_store_hpte1(env, hpte, ptel);

I'm not a big fan of fixing the read part, but leaving the write part broken. However I can see value in read only already, so I'm fine if the write part follows later.
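
A write-side counterpart could presumably build on the kvmppc_get_htab_fd() helper that kvm_ppc.h already declares; a minimal sketch (hypothetical helper name, assuming the htab fd accepts a kvm_get_htab_header followed by n_valid HPTEs on write, the same record format kvmppc_load_htab_chunk deals in):

    static int kvmppc_hash64_write_pte(unsigned long pte_index,
                                       uint64_t pte0, uint64_t pte1)
    {
        int ret, htab_fd;
        struct {
            struct kvm_get_htab_header hdr;
            uint64_t hpte[2];
        } buf;

        htab_fd = kvmppc_get_htab_fd(true);  /* open the fd for writing */
        if (htab_fd < 0) {
            return htab_fd;
        }
        buf.hdr.index = pte_index;           /* HPTE slot to update */
        buf.hdr.n_valid = 1;                 /* one valid entry follows */
        buf.hdr.n_invalid = 0;
        buf.hpte[0] = pte0;
        buf.hpte[1] = pte1;
        ret = write(htab_fd, &buf, sizeof(buf));
        close(htab_fd);
        return ret < 0 ? -1 : 0;
    }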

>     /* eieio();  FIXME: need some sort of barrier for smp? */
>     ppc_hash64_store_hpte0(env, hpte, pteh | HPTE64_V_HPTE_DIRTY);
> 
> -    args[0] = pte_index + i;
> +    args[0] = pte_index + index;
>     return H_SUCCESS;
> }
> 
> .....
> 
> +void *ppc_hash64_start_access(PowerPCCPU *cpu, unsigned long pte_index)
> +{
> +    void *token = NULL;
> +    hwaddr pte_offset;
> +
> +    pte_offset = pte_index * HASH_PTE_SIZE_64;
> +    if (kvmppc_kern_htab) {
> +        /*
> +         * HTAB is controlled by KVM. Fetch the PTEG into a new buffer.
> +         */
> +        token = kvmppc_hash64_read_pteg(cpu, pte_index);
> +        if (token) {
> +            return token;
> +        }
> +        /*
> +         * pteg read failed, even though we have allocated htab via
> +         * kvmppc_reset_htab.
> +         */
> +        return NULL;
> +    }
> +    /*
> +     * HTAB is controlled by QEMU. Just point to the internally
> +     * accessible PTEG.
> +     */
> +    if (cpu->env.external_htab) {
> +        token = cpu->env.external_htab + pte_offset;
> +    } else if (cpu->env.htab_base) {
> +        token = (uint8_t *) cpu->env.htab_base + pte_offset;

This breaks if you run a 64-bit guest on a 32-bit host trying to access memory beyond 4GB. In that case htab_base is hwaddr (64bit) while uint8_t is only 32bit wide.

Just pass a 64bit token around. That makes it safe and easy.


Alex
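
To make the truncation concrete: on a 32-bit host a pointer is only 32 bits wide, so the cast silently drops the upper half of the 64-bit hwaddr. A minimal sketch, with an illustrative address:

    hwaddr base = 0x100000000ULL;  /* guest-physical address at 4GB */
    uint8_t *p = (uint8_t *)base;  /* on a 32-bit host this truncates to 0 */

Carrying the address around as a plain 64-bit integer token sidesteps the lossy conversion.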

> +    }
> +    return token;
> +}
> 
> .....
Aneesh Kumar K.V Dec. 19, 2013, 6:55 a.m. UTC | #2
Alexander Graf <agraf@suse.de> writes:

> On 07.11.2013, at 15:31, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>

............

>> +    hpte += index * HASH_PTE_SIZE_64;
>> +
>>     ppc_hash64_store_hpte1(env, hpte, ptel);
>
> I'm not a big fan of fixing the read part, but leaving the write part
> broken. However I can see value in read only already, so I'm fine if
> the write part follows later.

When would we want per-HPTE store support w.r.t. the hash page table
maintained by the hypervisor? I didn't find a use case while doing this,
hence I left that part out.

>
>>     /* eieio();  FIXME: need some sort of barrier for smp? */
>>     ppc_hash64_store_hpte0(env, hpte, pteh | HPTE64_V_HPTE_DIRTY);
>> 
>> -    args[0] = pte_index + i;
>> +    args[0] = pte_index + index;
>>     return H_SUCCESS;
>> }
>> 
.....

>> +    /*
>> +     * HTAB is controlled by QEMU. Just point to the internally
>> +     * accessible PTEG.
>> +     */
>> +    if (cpu->env.external_htab) {
>> +        token = cpu->env.external_htab + pte_offset;
>> +    } else if (cpu->env.htab_base) {
>> +        token = (uint8_t *) cpu->env.htab_base + pte_offset;
>
> This breaks if you run a 64-bit guest on a 32-bit host trying to
> access memory beyond 4GB. In that case htab_base is hwaddr (64bit)
> while uint8_t is only 32bit wide.

Wow!! I didn't know that was possible.

>
> Just pass a 64bit token around. That makes it safe and easy.
>

I converted that to uint64_t. How about patches 1 and 2? Are they OK to go
upstream?

-aneesh
Paul Mackerras Dec. 19, 2013, 8:41 a.m. UTC | #3
On Thu, Dec 19, 2013 at 12:25:57PM +0530, Aneesh Kumar K.V wrote:
> Alexander Graf <agraf@suse.de> writes:
> 
> > This breaks if you run a 64-bit guest on a 32-bit host trying to
> > access memory beyond 4GB. In that case htab_base is hwaddr (64bit)
> > while uint8_t is only 32bit wide.
> 
> Wow!! I didn't know that was possible.

I don't believe it is.  All of the 64-bit instructions would trap and
we don't have code to emulate them (and even if we did, it would be
painfully slow given that there would be a lot of them).

Paul.
Alexander Graf Dec. 19, 2013, 3:27 p.m. UTC | #4
On 19.12.2013, at 07:55, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:

> Alexander Graf <agraf@suse.de> writes:
> 
>> On 07.11.2013, at 15:31, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>> 
> 
> ............
> 
>>> +    hpte += index * HASH_PTE_SIZE_64;
>>> +
>>>    ppc_hash64_store_hpte1(env, hpte, ptel);
>> 
>> I'm not a big fan of fixing the read part, but leaving the write part
>> broken. However I can see value in read only already, so I'm fine if
>> the write part follows later.
> 
> When would we want per-HPTE store support w.r.t. the hash page table
> maintained by the hypervisor? I didn't find a use case while doing this,
> hence I left that part out.

Well, it's mostly a matter of consistency. The only case where it ever matters is if we wanted to emulate H_ENTER in QEMU - for tracing or debugging purposes. But if we leave it halfway implemented like this, you might get the impression it could work, but it doesn't.

> 
>> 
>>>    /* eieio();  FIXME: need some sort of barrier for smp? */
>>>    ppc_hash64_store_hpte0(env, hpte, pteh | HPTE64_V_HPTE_DIRTY);
>>> 
>>> -    args[0] = pte_index + i;
>>> +    args[0] = pte_index + index;
>>>    return H_SUCCESS;
>>> }
>>> 
> .....
> 
>>> +    /*
>>> +     * HTAB is controlled by QEMU. Just point to the internally
>>> +     * accessible PTEG.
>>> +     */
>>> +    if (cpu->env.external_htab) {
>>> +        token = cpu->env.external_htab + pte_offset;
>>> +    } else if (cpu->env.htab_base) {
>>> +        token = (uint8_t *) cpu->env.htab_base + pte_offset;
>> 
>> This breaks if you run a 64-bit guest on a 32-bit host trying to
>> access memory beyond 4GB. In that case htab_base is hwaddr (64bit)
>> while uint8_t is only 32bit wide.
> 
> Wow!! I didn't know that was possible.

Well, with TCG you can run a 64-bit guest on a 32-bit host system, right?

> 
>> 
>> Just pass a 64bit token around. That makes it safe and easy.
>> 
> 
> I converted that to uint64_t. How about patches 1 and 2? Are they OK to go
> upstream?

Yup. It took me a while to figure out that the patches are doing "the right thing". That code could really use some comments that explain what's going on - like the semantics of htab_mask (mask of the PTEG, will be shifted by 7 for physical address later), what htab_shift really is (log2 of the HTAB size), etc :).


Alex
Aneesh Kumar K.V Dec. 20, 2013, 5:24 a.m. UTC | #5
Alexander Graf <agraf@suse.de> writes:

> On 19.12.2013, at 07:55, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>
>> Alexander Graf <agraf@suse.de> writes:
>> 
>>> On 07.11.2013, at 15:31, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>>> 
>> 
>> ............
>> 
>>>> +    hpte += index * HASH_PTE_SIZE_64;
>>>> +
>>>>    ppc_hash64_store_hpte1(env, hpte, ptel);
>>> 
>>> I'm not a big fan of fixing the read part, but leaving the write part
>>> broken. However I can see value in read only already, so I'm fine if
>>> the write part follows later.
>> 
>> When would we want per-HPTE store support w.r.t. the hash page table
>> maintained by the hypervisor? I didn't find a use case while doing this,
>> hence I left that part out.
>
> Well, it's mostly a matter of consistency. The only case where it ever
> matters is if we wanted to emulate H_ENTER in QEMU - for tracing or
> debugging purposes. But if we leave it halfway implemented like this,
> you might get the impression it could work, but it doesn't.

>
>> 
>>> 
>>>>    /* eieio();  FIXME: need some sort of barrier for smp? */
>>>>    ppc_hash64_store_hpte0(env, hpte, pteh | HPTE64_V_HPTE_DIRTY);
>>>> 
>>>> -    args[0] = pte_index + i;
>>>> +    args[0] = pte_index + index;
>>>>    return H_SUCCESS;
>>>> }
>>>> 
>> .....
>> 
>>>> +    /*
>>>> +     * HTAB is controlled by QEMU. Just point to the internally
>>>> +     * accessible PTEG.
>>>> +     */
>>>> +    if (cpu->env.external_htab) {
>>>> +        token = cpu->env.external_htab + pte_offset;
>>>> +    } else if (cpu->env.htab_base) {
>>>> +        token = (uint8_t *) cpu->env.htab_base + pte_offset;
>>> 
>>> This breaks if you run a 64-bit guest on a 32-bit host trying to
>>> access memory beyond 4GB. In that case htab_base is hwaddr (64bit)
>>> while uint8_t is only 32bit wide.
>> 
>> Wow!! I didn't know that was possible.
>
> Well, with TCG you can run a 64-bit guest on a 32-bit host system, right?
>
>> 
>>> 
>>> Just pass a 64bit token around. That makes it safe and easy.
>>> 
>> 
>> I converted that to uint64_t. How about patches 1 and 2? Are they OK to go
>> upstream?
>
> Yup. It took me a while to figure out that the patches are doing "the
> right thing". That code could really use some comments that explain
> what's going on - like the semantics of htab_mask (mask of the PTEG,
> will be shifted by 7 for physical address later), what htab_shift
> really is (log2 of the HTAB size), etc :).


How about

    /*
     * htab_mask is the mask used to normalize a hash value to a PTEG index.
     * htab_shift is log2 of the hash table size.
     * We have 8 HPTEs per group and each HPTE is 16 bytes,
     * i.e. 128 bytes per PTEG.
     */
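
Spelled out against the lookup in this patch (and the htab_mask semantics Alex describes above), the relationship is roughly:

    env->htab_mask = (1ULL << (htab_shift - 7)) - 1;       /* #PTEGs - 1 */
    pte_index = (hash & env->htab_mask) * HPTES_PER_GROUP; /* 8 HPTEs per PTEG */
    pteg_addr = env->htab_base +
                (hash & env->htab_mask) * HASH_PTEG_SIZE_64; /* 128 bytes each */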

 -aneesh
Alexander Graf Dec. 20, 2013, 10:53 a.m. UTC | #6
On 20.12.2013, at 06:24, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:

> Alexander Graf <agraf@suse.de> writes:
> 
>> On 19.12.2013, at 07:55, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>> 
>>> Alexander Graf <agraf@suse.de> writes:
>>> 
>>>> On 07.11.2013, at 15:31, Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>>>> 
>>> 
>>> ............
>>> 
>>>>> +    hpte += index * HASH_PTE_SIZE_64;
>>>>> +
>>>>>   ppc_hash64_store_hpte1(env, hpte, ptel);
>>>> 
>>>> I'm not a big fan of fixing the read part, but leaving the write part
>>>> broken. However I can see value in read only already, so I'm fine if
>>>> the write part follows later.
>>> 
>>> When would we want per-HPTE store support w.r.t. the hash page table
>>> maintained by the hypervisor? I didn't find a use case while doing this,
>>> hence I left that part out.
>> 
>> Well, it's mostly a matter of consistency. The only case where it ever
>> matters is if we wanted to emulate H_ENTER in QEMU - for tracing or
>> debugging purposes. But if we leave it halfway implemented like this,
>> you might get the impression it could work, but it doesn't.
> 
>> 
>>> 
>>>> 
>>>>>   /* eieio();  FIXME: need some sort of barrier for smp? */
>>>>>   ppc_hash64_store_hpte0(env, hpte, pteh | HPTE64_V_HPTE_DIRTY);
>>>>> 
>>>>> -    args[0] = pte_index + i;
>>>>> +    args[0] = pte_index + index;
>>>>>   return H_SUCCESS;
>>>>> }
>>>>> 
>>> .....
>>> 
>>>>> +    /*
>>>>> +     * HTAB is controlled by QEMU. Just point to the internally
>>>>> +     * accessible PTEG.
>>>>> +     */
>>>>> +    if (cpu->env.external_htab) {
>>>>> +        token = cpu->env.external_htab + pte_offset;
>>>>> +    } else if (cpu->env.htab_base) {
>>>>> +        token = (uint8_t *) cpu->env.htab_base + pte_offset;
>>>> 
>>>> This breaks if you run a 64-bit guest on a 32-bit host trying to
>>>> access memory beyond 4GB. In that case htab_base is hwaddr (64bit)
>>>> while uint8_t is only 32bit wide.
>>> 
>>> Wow!! I didn't know that was possible.
>> 
>> Well, with TCG you can run a 64-bit guest on a 32-bit host system, right?
>> 
>>> 
>>>> 
>>>> Just pass a 64bit token around. That makes it safe and easy.
>>>> 
>>> 
>>> I converted that to uint64_t. How about patches 1 and 2? Are they OK to go
>>> upstream?
>> 
>> Yup. It took me a while to figure out that the patches are doing "the
>> right thing". That code could really use some comments that explain
>> what's going on - like the semantics of htab_mask (mask of the PTEG,
>> will be shifted by 7 for physical address later), what htab_shift
>> really is (log2 of the HTAB size), etc :).
> 
> 
> How about
> 
>    /*
>     * htab_mask is the mask used to normalize a hash value to a PTEG index.
>     * htab_shift is log2 of the hash table size.
>     * We have 8 HPTEs per group and each HPTE is 16 bytes,
>     * i.e. 128 bytes per PTEG.
>     */

I was thinking that maybe the htab_shift and htab_mask names are just confusing. But I haven't managed to come up with really good names either.

Just put some comments on the variable definitions and let's hope that makes us remember what they were supposed to mean the next time we have to dig through this code.


Alex

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index d4f3502..8bf886e 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -662,6 +662,7 @@  static void spapr_reset_htab(sPAPREnvironment *spapr)
     if (shift > 0) {
         /* Kernel handles htab, we don't need to allocate one */
         spapr->htab_shift = shift;
+        kvmppc_kern_htab = true;
     } else {
         if (!spapr->htab) {
             /* Allocate an htab if we don't yet have one */
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index f10ba8a..f9ea691 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -50,8 +50,9 @@  static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     target_ulong ptel = args[3];
     target_ulong page_shift = 12;
     target_ulong raddr;
-    target_ulong i;
+    target_ulong index;
     hwaddr hpte;
+    void *token;
 
     /* only handle 4k and 16M pages for now */
     if (pteh & HPTE64_V_LARGE) {
@@ -94,30 +95,37 @@  static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     if ((pte_index * HASH_PTE_SIZE_64) & ~env->htab_mask) {
         return H_PARAMETER;
     }
+
+    index = 0;
+    hpte = pte_index * HASH_PTE_SIZE_64;
     if (likely((flags & H_EXACT) == 0)) {
         pte_index &= ~7ULL;
-        hpte = pte_index * HASH_PTE_SIZE_64;
-        for (i = 0; ; ++i) {
-            if (i == 8) {
+        token = ppc_hash64_start_access(cpu, pte_index);
+        do {
+            if (index == 8) {
+                ppc_hash64_stop_access(token);
                 return H_PTEG_FULL;
             }
-            if ((ppc_hash64_load_hpte0(env, hpte) & HPTE64_V_VALID) == 0) {
+            if ((ppc_hash64_load_hpte0(env, token, index) & HPTE64_V_VALID) == 0) {
                 break;
             }
-            hpte += HASH_PTE_SIZE_64;
-        }
+        } while (index++);
+        ppc_hash64_stop_access(token);
     } else {
-        i = 0;
-        hpte = pte_index * HASH_PTE_SIZE_64;
-        if (ppc_hash64_load_hpte0(env, hpte) & HPTE64_V_VALID) {
+        token = ppc_hash64_start_access(cpu, pte_index);
+        if (ppc_hash64_load_hpte0(env, token, 0) & HPTE64_V_VALID) {
+            ppc_hash64_stop_access(token);
             return H_PTEG_FULL;
         }
+        ppc_hash64_stop_access(token);
     }
+    hpte += index * HASH_PTE_SIZE_64;
+
     ppc_hash64_store_hpte1(env, hpte, ptel);
     /* eieio();  FIXME: need some sort of barrier for smp? */
     ppc_hash64_store_hpte0(env, hpte, pteh | HPTE64_V_HPTE_DIRTY);
 
-    args[0] = pte_index + i;
+    args[0] = pte_index + index;
     return H_SUCCESS;
 }
 
@@ -134,16 +142,17 @@  static RemoveResult remove_hpte(CPUPPCState *env, target_ulong ptex,
                                 target_ulong *vp, target_ulong *rp)
 {
     hwaddr hpte;
+    void *token;
     target_ulong v, r, rb;
 
     if ((ptex * HASH_PTE_SIZE_64) & ~env->htab_mask) {
         return REMOVE_PARM;
     }
 
-    hpte = ptex * HASH_PTE_SIZE_64;
-
-    v = ppc_hash64_load_hpte0(env, hpte);
-    r = ppc_hash64_load_hpte1(env, hpte);
+    token = ppc_hash64_start_access(ppc_env_get_cpu(env), ptex);
+    v = ppc_hash64_load_hpte0(env, token, 0);
+    r = ppc_hash64_load_hpte1(env, token, 0);
+    ppc_hash64_stop_access(token);
 
     if ((v & HPTE64_V_VALID) == 0 ||
         ((flags & H_AVPN) && (v & ~0x7fULL) != avpn) ||
@@ -152,6 +161,7 @@  static RemoveResult remove_hpte(CPUPPCState *env, target_ulong ptex,
     }
     *vp = v;
     *rp = r;
+    hpte = ptex * HASH_PTE_SIZE_64;
     ppc_hash64_store_hpte0(env, hpte, HPTE64_V_HPTE_DIRTY);
     rb = compute_tlbie_rb(v, r, ptex);
     ppc_tlb_invalidate_one(env, rb);
@@ -260,16 +270,17 @@  static target_ulong h_protect(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     target_ulong pte_index = args[1];
     target_ulong avpn = args[2];
     hwaddr hpte;
+    void *token;
     target_ulong v, r, rb;
 
     if ((pte_index * HASH_PTE_SIZE_64) & ~env->htab_mask) {
         return H_PARAMETER;
     }
 
-    hpte = pte_index * HASH_PTE_SIZE_64;
-
-    v = ppc_hash64_load_hpte0(env, hpte);
-    r = ppc_hash64_load_hpte1(env, hpte);
+    token = ppc_hash64_start_access(cpu, pte_index);
+    v = ppc_hash64_load_hpte0(env, token, 0);
+    r = ppc_hash64_load_hpte1(env, token, 0);
+    ppc_hash64_stop_access(token);
 
     if ((v & HPTE64_V_VALID) == 0 ||
         ((flags & H_AVPN) && (v & ~0x7fULL) != avpn)) {
@@ -282,6 +293,7 @@  static target_ulong h_protect(PowerPCCPU *cpu, sPAPREnvironment *spapr,
     r |= (flags << 48) & HPTE64_R_KEY_HI;
     r |= flags & (HPTE64_R_PP | HPTE64_R_N | HPTE64_R_KEY_LO);
     rb = compute_tlbie_rb(v, r, pte_index);
+    hpte = pte_index * HASH_PTE_SIZE_64;
     ppc_hash64_store_hpte0(env, hpte, (v & ~HPTE64_V_VALID) | HPTE64_V_HPTE_DIRTY);
     ppc_tlb_invalidate_one(env, rb);
     ppc_hash64_store_hpte1(env, hpte, r);
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index ac70efbf..b8f9544 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -1783,6 +1783,11 @@  bool kvmppc_has_cap_epr(void)
     return cap_epr;
 }
 
+bool kvmppc_has_cap_htab_fd(void)
+{
+    return cap_htab_fd;
+}
+
 static int kvm_ppc_register_host_cpu_type(void)
 {
     TypeInfo type_info = {
@@ -1888,3 +1893,51 @@  int kvm_arch_on_sigbus(int code, void *addr)
 void kvm_arch_init_irq_routing(KVMState *s)
 {
 }
+
+struct kvm_get_htab_buf {
+    struct kvm_get_htab_header header;
+    /*
+     * We required one extra byte for read
+     */
+    unsigned long hpte[(HPTES_PER_GROUP * 2) + 1];
+};
+
+void *kvmppc_hash64_read_pteg(PowerPCCPU *cpu, unsigned long pte_index)
+{
+    int htab_fd;
+    struct kvm_get_htab_fd ghf;
+    struct kvm_get_htab_buf  *hpte_buf;
+
+    ghf.flags = 0;
+    ghf.start_index = pte_index;
+    htab_fd = kvm_vm_ioctl(kvm_state, KVM_PPC_GET_HTAB_FD, &ghf);
+    if (htab_fd < 0) {
+        goto error_out;
+    }
+
+    hpte_buf = g_malloc0(sizeof(*hpte_buf));
+    /*
+     * Read the hpte group
+     */
+    if (read(htab_fd, hpte_buf, sizeof(*hpte_buf)) < 0) {
+        goto out_close;
+    }
+
+    close(htab_fd);
+    return hpte_buf->hpte;
+
+out_close:
+    g_free(hpte_buf);
+    close(htab_fd);
+error_out:
+    return NULL;
+}
+
+void kvmppc_hash64_free_pteg(void *token)
+{
+    struct kvm_get_htab_buf *htab_buf;
+
+    htab_buf = container_of(token, struct kvm_get_htab_buf, hpte);
+    g_free(htab_buf);
+    return;
+}
diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
index 4ae7bf2..9981e34 100644
--- a/target-ppc/kvm_ppc.h
+++ b/target-ppc/kvm_ppc.h
@@ -38,10 +38,13 @@  uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift);
 #endif /* !CONFIG_USER_ONLY */
 int kvmppc_fixup_cpu(PowerPCCPU *cpu);
 bool kvmppc_has_cap_epr(void);
+bool kvmppc_has_cap_htab_fd(void);
 int kvmppc_get_htab_fd(bool write);
 int kvmppc_save_htab(QEMUFile *f, int fd, size_t bufsize, int64_t max_ns);
 int kvmppc_load_htab_chunk(QEMUFile *f, int fd, uint32_t index,
                            uint16_t n_valid, uint16_t n_invalid);
+void *kvmppc_hash64_read_pteg(PowerPCCPU *cpu, unsigned long pte_index);
+void kvmppc_hash64_free_pteg(void *token);
 
 #else
 
@@ -164,6 +167,11 @@  static inline bool kvmppc_has_cap_epr(void)
     return false;
 }
 
+static inline bool kvmppc_has_cap_htab_fd(void)
+{
+    return false;
+}
+
 static inline int kvmppc_get_htab_fd(bool write)
 {
     return -1;
@@ -181,6 +189,17 @@  static inline int kvmppc_load_htab_chunk(QEMUFile *f, int fd, uint32_t index,
     abort();
 }
 
+static inline void *kvmppc_hash64_read_pteg(PowerPCCPU *cpu,
+                                            unsigned long pte_index)
+{
+    abort();
+}
+
+static inline void kvmppc_hash64_free_pteg(void *token)
+{
+    abort();
+}
+
 #endif
 
 #ifndef CONFIG_KVM
diff --git a/target-ppc/mmu-hash64.c b/target-ppc/mmu-hash64.c
index 67fc1b5..f59d199 100644
--- a/target-ppc/mmu-hash64.c
+++ b/target-ppc/mmu-hash64.c
@@ -41,6 +41,11 @@ 
 #endif
 
 /*
+ * Used to indicate whether we have allocated htab in the
+ * host kernel
+ */
+bool kvmppc_kern_htab;
+/*
  * SLB handling
  */
 
@@ -302,29 +307,76 @@  static int ppc_hash64_amr_prot(CPUPPCState *env, ppc_hash_pte64_t pte)
     return prot;
 }
 
-static hwaddr ppc_hash64_pteg_search(CPUPPCState *env, hwaddr pteg_off,
+void *ppc_hash64_start_access(PowerPCCPU *cpu, unsigned long pte_index)
+{
+    void *token = NULL;
+    hwaddr pte_offset;
+
+    pte_offset = pte_index * HASH_PTE_SIZE_64;
+    if (kvmppc_kern_htab) {
+        /*
+         * HTAB is controlled by KVM. Fetch the PTEG into a new buffer.
+         */
+        token = kvmppc_hash64_read_pteg(cpu, pte_index);
+        if (token) {
+            return token;
+        }
+        /*
+         * pteg read failed, even though we have allocated htab via
+         * kvmppc_reset_htab.
+         */
+        return NULL;
+    }
+    /*
+     * HTAB is controlled by QEMU. Just point to the internally
+     * accessible PTEG.
+     */
+    if (cpu->env.external_htab) {
+        token = cpu->env.external_htab + pte_offset;
+    } else if (cpu->env.htab_base) {
+        token = (uint8_t *) cpu->env.htab_base + pte_offset;
+    }
+    return token;
+}
+
+void ppc_hash64_stop_access(void *token)
+{
+    if (kvmppc_kern_htab) {
+        return kvmppc_hash64_free_pteg(token);
+    }
+}
+
+static hwaddr ppc_hash64_pteg_search(CPUPPCState *env, hwaddr hash,
                                      bool secondary, target_ulong ptem,
                                      ppc_hash_pte64_t *pte)
 {
-    hwaddr pte_offset = pteg_off;
-    target_ulong pte0, pte1;
     int i;
+    void *token;
+    target_ulong pte0, pte1;
+    unsigned long pte_index;
 
+    pte_index = (hash & env->htab_mask) * HPTES_PER_GROUP;
+    token = ppc_hash64_start_access(ppc_env_get_cpu(env), pte_index);
+    if (!token) {
+        return -1;
+    }
     for (i = 0; i < HPTES_PER_GROUP; i++) {
-        pte0 = ppc_hash64_load_hpte0(env, pte_offset);
-        pte1 = ppc_hash64_load_hpte1(env, pte_offset);
+        pte0 = ppc_hash64_load_hpte0(env, token, i);
+        pte1 = ppc_hash64_load_hpte1(env, token, i);
 
         if ((pte0 & HPTE64_V_VALID)
             && (secondary == !!(pte0 & HPTE64_V_SECONDARY))
             && HPTE64_V_COMPARE(pte0, ptem)) {
             pte->pte0 = pte0;
             pte->pte1 = pte1;
-            return pte_offset;
+            ppc_hash64_stop_access(token);
+            return (pte_index + i) * HASH_PTE_SIZE_64;
         }
-
-        pte_offset += HASH_PTE_SIZE_64;
     }
-
+    ppc_hash64_stop_access(token);
+    /*
+     * We didn't find a valid entry.
+     */
     return -1;
 }
 
@@ -332,7 +384,7 @@  static hwaddr ppc_hash64_htab_lookup(CPUPPCState *env,
                                      ppc_slb_t *slb, target_ulong eaddr,
                                      ppc_hash_pte64_t *pte)
 {
-    hwaddr pteg_off, pte_offset;
+    hwaddr pte_offset;
     hwaddr hash;
     uint64_t vsid, epnshift, epnmask, epn, ptem;
 
@@ -367,8 +419,7 @@  static hwaddr ppc_hash64_htab_lookup(CPUPPCState *env,
             " vsid=" TARGET_FMT_lx " ptem=" TARGET_FMT_lx
             " hash=" TARGET_FMT_plx "\n",
             env->htab_base, env->htab_mask, vsid, ptem,  hash);
-    pteg_off = (hash * HASH_PTEG_SIZE_64) & env->htab_mask;
-    pte_offset = ppc_hash64_pteg_search(env, pteg_off, 0, ptem, pte);
+    pte_offset = ppc_hash64_pteg_search(env, hash, 0, ptem, pte);
 
     if (pte_offset == -1) {
         /* Secondary PTEG lookup */
@@ -377,8 +428,7 @@  static hwaddr ppc_hash64_htab_lookup(CPUPPCState *env,
                 " hash=" TARGET_FMT_plx "\n", env->htab_base,
                 env->htab_mask, vsid, ptem, ~hash);
 
-        pteg_off = (~hash * HASH_PTEG_SIZE_64) & env->htab_mask;
-        pte_offset = ppc_hash64_pteg_search(env, pteg_off, 1, ptem, pte);
+        pte_offset = ppc_hash64_pteg_search(env, ~hash, 1, ptem, pte);
     }
 
     return pte_offset;
diff --git a/target-ppc/mmu-hash64.h b/target-ppc/mmu-hash64.h
index 55f5a23..c6f64f6 100644
--- a/target-ppc/mmu-hash64.h
+++ b/target-ppc/mmu-hash64.h
@@ -75,23 +75,30 @@  int ppc_hash64_handle_mmu_fault(CPUPPCState *env, target_ulong address, int rw,
 #define HPTE64_V_1TB_SEG        0x4000000000000000ULL
 #define HPTE64_V_VRMA_MASK      0x4001ffffff000000ULL
 
-static inline target_ulong ppc_hash64_load_hpte0(CPUPPCState *env,
-                                                 hwaddr pte_offset)
+
+extern bool kvmppc_kern_htab;
+void *ppc_hash64_start_access(PowerPCCPU *cpu, unsigned long pte_index);
+void ppc_hash64_stop_access(void *token);
+
+static inline target_ulong ppc_hash64_load_hpte0(CPUPPCState *env, void *token,
+                                                 int index)
 {
+    index *= HASH_PTE_SIZE_64;
     if (env->external_htab) {
-        return  ldq_p(env->external_htab + pte_offset);
+        return  ldq_p(token + index);
     } else {
-        return ldq_phys(env->htab_base + pte_offset);
+        return ldq_phys((uint64_t)(token + index));
     }
 }
 
-static inline target_ulong ppc_hash64_load_hpte1(CPUPPCState *env,
-                                                 hwaddr pte_offset)
+static inline target_ulong ppc_hash64_load_hpte1(CPUPPCState *env, void *token,
+                                                 int index)
 {
+    index *= HASH_PTE_SIZE_64;
     if (env->external_htab) {
-        return ldq_p(env->external_htab + pte_offset + HASH_PTE_SIZE_64/2);
+        return  ldq_p(token + index + HASH_PTE_SIZE_64/2);
     } else {
-        return ldq_phys(env->htab_base + pte_offset + HASH_PTE_SIZE_64/2);
+        return ldq_phys((uint64_t)(token + index + HASH_PTE_SIZE_64/2));
     }
 }