diff mbox

[15/26] Virtual hash page table handling on pSeries machine

Message ID 1300251423-6715-16-git-send-email-david@gibson.dropbear.id.au
State New
Headers show

Commit Message

David Gibson March 16, 2011, 4:56 a.m. UTC
On pSeries logical partitions, excepting the old POWER4-style full system
partitions, the guest does not have direct access to the hardware page
table.  Instead, the pagetable exists in hypervisor memory, and the guest
must manipulate it with hypercalls.

However, our current pSeries emulation more closely resembles the old
style where the guest must set up and handle the pagetables itself.  This
patch converts it to act like a modern partition.

This involves two things: first, the hash translation path is modified to
permit the hash table to be stored externally to the emulated machine's
RAM.  The pSeries machine init code configures the CPUs to use this mode.

Secondly, we emulate the PAPR hypercalls for manipulating the external
hashed page table.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
---
 hw/spapr.c          |   32 ++++++-
 hw/spapr_hcall.c    |  247 +++++++++++++++++++++++++++++++++++++++++++++++++++
 target-ppc/cpu.h    |    2 +
 target-ppc/helper.c |   36 ++++++--
 4 files changed, 305 insertions(+), 12 deletions(-)

Comments

Alexander Graf March 16, 2011, 3:03 p.m. UTC | #1
On 03/16/2011 05:56 AM, David Gibson wrote:
> On pSeries logical partitions, excepting the old POWER4-style full system
> partitions, the guest does not have direct access to the hardware page
> table.  Instead, the pagetable exists in hypervisor memory, and the guest
> must manipulate it with hypercalls.
>
> However, our current pSeries emulation more closely resembles the old
> style where the guest must set up and handle the pagetables itself.  This
> patch converts it to act like a modern partition.
>
> This involves two things: first, the hash translation path is modified to
> permit the has table to be stored externally to the emulated machine's
> RAM.  The pSeries machine init code configures the CPUs to use this mode.
>
> Secondly, we emulate the PAPR hypercalls for manipulating the external
> hashed page table.
>
> Signed-off-by: David Gibson<dwg@au1.ibm.com>
> ---
>   hw/spapr.c          |   32 ++++++-
>   hw/spapr_hcall.c    |  247 +++++++++++++++++++++++++++++++++++++++++++++++++++
>   target-ppc/cpu.h    |    2 +
>   target-ppc/helper.c |   36 ++++++--
>   4 files changed, 305 insertions(+), 12 deletions(-)
>
> diff --git a/hw/spapr.c b/hw/spapr.c
> index 25e4a9e..c3d9286 100644
> --- a/hw/spapr.c
> +++ b/hw/spapr.c
> @@ -50,12 +50,15 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>                                 sPAPREnvironment *spapr,
>                                 target_phys_addr_t initrd_base,
>                                 target_phys_addr_t initrd_size,
> -                              const char *kernel_cmdline)
> +                              const char *kernel_cmdline,
> +                              long hash_shift)
>   {
>       void *fdt;
>       uint64_t mem_reg_property[] = { 0, cpu_to_be64(ramsize) };
>       uint32_t start_prop = cpu_to_be32(initrd_base);
>       uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
> +    uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
> +    char hypertas_prop[] = "hcall-pft\0hcall-term";
>       int i;
>       char *modelname;
>       int ret;
> @@ -138,6 +141,7 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>            * full emu, for kvm we should copy it from the host */
>           _FDT((fdt_property_cell(fdt, "clock-frequency", 1000000000)));
>           _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
> +        _FDT((fdt_property(fdt, "ibm,pft-size", pft_size_prop, sizeof(pft_size_prop))));
>           _FDT((fdt_property_string(fdt, "status", "okay")));
>           _FDT((fdt_property(fdt, "64-bit", NULL, 0)));
>
> @@ -153,6 +157,14 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>
>       _FDT((fdt_end_node(fdt)));
>
> +    /* RTAS */
> +    _FDT((fdt_begin_node(fdt, "rtas")));
> +
> +    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
> +                       sizeof(hypertas_prop))));
> +
> +    _FDT((fdt_end_node(fdt)));
> +
>       /* vdevice */
>       _FDT((fdt_begin_node(fdt, "vdevice")));
>
> @@ -203,12 +215,13 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>                              const char *cpu_model)
>   {
>       CPUState *envs[MAX_CPUS];
> -    void *fdt;
> +    void *fdt, *htab;
>       int i;
>       ram_addr_t ram_offset;
>       target_phys_addr_t fdt_addr;
>       uint32_t kernel_base, initrd_base;
> -    long kernel_size, initrd_size;
> +    long kernel_size, initrd_size, htab_size;
> +    long pteg_shift = 17;
>       int fdt_size;
>       sPAPREnvironment *spapr;
>
> @@ -248,6 +261,16 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>       ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", ram_size);
>       cpu_register_physical_memory(0, ram_size, ram_offset);
>
> +    /* allocate hash page table */
> +    htab_size = 1ULL<<  (pteg_shift + 7);

Linux makes the htab size depend on the provided amount of ram. 
Shouldn't we do the same?

> +    htab = qemu_mallocz(htab_size);
> +
> +    for (i = 0; i<  smp_cpus; i++) {
> +        envs[i]->external_htab = htab;
> +        envs[i]->htab_base = -1;
> +        envs[i]->htab_mask = htab_size - 1;
> +    }
> +
>       spapr->vio_bus = spapr_vio_bus_init();
>
>       for (i = 0; i<  MAX_SERIAL_PORTS; i++) {
> @@ -293,7 +316,8 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>
>       /* Prepare the device tree */
>       fdt = spapr_create_fdt(&fdt_size, ram_size, cpu_model, envs, spapr,
> -                           initrd_base, initrd_size, kernel_cmdline);
> +                           initrd_base, initrd_size, kernel_cmdline,
> +                           pteg_shift + 7);
>       if (!fdt) {
>           hw_error("Couldn't create pSeries device tree\n");
>           exit(1);
> diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c
> index 6ddac00..2b14000 100644
> --- a/hw/spapr_hcall.c
> +++ b/hw/spapr_hcall.c
> @@ -1,8 +1,246 @@
>   #include "sysemu.h"
>   #include "cpu.h"
>   #include "qemu-char.h"
> +#include "sysemu.h"
> +#include "qemu-char.h"
> +#include "exec-all.h"
>   #include "hw/spapr.h"
>
> +#define HPTES_PER_GROUP 8
> +
> +#define HPTE_V_SSIZE_SHIFT      62
> +#define HPTE_V_AVPN_SHIFT       7
> +#define HPTE_V_AVPN             0x3fffffffffffff80ULL
> +#define HPTE_V_AVPN_VAL(x)      (((x)&  HPTE_V_AVPN)>>  HPTE_V_AVPN_SHIFT)
> +#define HPTE_V_COMPARE(x,y)     (!(((x) ^ (y))&  0xffffffffffffff80UL))
> +#define HPTE_V_BOLTED           0x0000000000000010ULL
> +#define HPTE_V_LOCK             0x0000000000000008ULL
> +#define HPTE_V_LARGE            0x0000000000000004ULL
> +#define HPTE_V_SECONDARY        0x0000000000000002ULL
> +#define HPTE_V_VALID            0x0000000000000001ULL
> +
> +#define HPTE_R_PP0              0x8000000000000000ULL
> +#define HPTE_R_TS               0x4000000000000000ULL
> +#define HPTE_R_KEY_HI           0x3000000000000000ULL
> +#define HPTE_R_RPN_SHIFT        12
> +#define HPTE_R_RPN              0x3ffffffffffff000ULL
> +#define HPTE_R_FLAGS            0x00000000000003ffULL
> +#define HPTE_R_PP               0x0000000000000003ULL
> +#define HPTE_R_N                0x0000000000000004ULL
> +#define HPTE_R_G                0x0000000000000008ULL
> +#define HPTE_R_M                0x0000000000000010ULL
> +#define HPTE_R_I                0x0000000000000020ULL
> +#define HPTE_R_W                0x0000000000000040ULL
> +#define HPTE_R_WIMG             0x0000000000000078ULL
> +#define HPTE_R_C                0x0000000000000080ULL
> +#define HPTE_R_R                0x0000000000000100ULL
> +#define HPTE_R_KEY_LO           0x0000000000000e00ULL
> +
> +#define HPTE_V_1TB_SEG          0x4000000000000000ULL
> +#define HPTE_V_VRMA_MASK        0x4001ffffff000000ULL
> +
> +#define HPTE_V_HVLOCK           0x40ULL
> +
> +static inline int lock_hpte(void *hpte, target_ulong bits)
> +{
> +    uint64_t pteh;
> +
> +    pteh = ldq_p(hpte);
> +
> +    /* FIXME: probably need some sort of lockage for SMP */

Guest SMP doesn't get mapped to host SMP. So you're safe here.

> +    if (pteh&  bits) {
> +        return 0;
> +    }
> +    stq_p(hpte, pteh | HPTE_V_HVLOCK);
> +    return 1;
> +}
> +
> +static target_ulong compute_tlbie_rb(target_ulong v, target_ulong r,
> +                                     target_ulong pte_index)
> +{
> +    target_ulong rb, va_low;
> +
> +    rb = (v&  ~0x7fULL)<<  16; /* AVA field */
> +    va_low = pte_index>>  3;
> +    if (v&  HPTE_V_SECONDARY)

Braces

> +        va_low = ~va_low;
> +    /* xor vsid from AVA */
> +    if (!(v&  HPTE_V_1TB_SEG))

Braces

> +        va_low ^= v>>  12;
> +    else
> +        va_low ^= v>>  24;
> +    va_low&= 0x7ff;
> +    if (v&  HPTE_V_LARGE) {
> +        rb |= 1;                         /* L field */
> +#if 0 /* Disable that P7 specific bit for now */
> +        if (r&  0xff000) {
> +            /* non-16MB large page, must be 64k */
> +            /* (masks depend on page size) */
> +            rb |= 0x1000;                /* page encoding in LP field */
> +            rb |= (va_low&  0x7f)<<  16; /* 7b of VA in AVA/LP field */
> +            rb |= (va_low&  0xfe);       /* AVAL field */
> +        }
> +#endif
> +    } else {
> +        /* 4kB page */
> +        rb |= (va_low&  0x7ff)<<  12;   /* remaining 11b of AVA */
> +    }
> +    rb |= (v>>  54)&  0x300;            /* B field */
> +    return rb;
> +}
> +
> +static target_ulong h_enter(CPUState *env, sPAPREnvironment *spapr,
> +                            target_ulong opcode, target_ulong *args)
> +{
> +    target_ulong flags = args[0];
> +    target_ulong pte_index = args[1];
> +    target_ulong pteh = args[2];
> +    target_ulong ptel = args[3];
> +    target_ulong porder;
> +    target_ulong i, pa;
> +    uint8_t *hpte;
> +
> +    /* only handle 4k and 16M pages for now */
> +    porder = 12;
> +    if (pteh&  HPTE_V_LARGE) {
> +        if ((ptel&  0xf000) == 0x1000) {
> +            /* 64k page */

According to the comment above and the #if 0 in tlbie you don't support 
64k pages?

> +            porder = 16;
> +        } else if ((ptel&  0xff000) == 0) {
> +            /* 16M page */
> +            porder = 24;
> +            /* lowest AVA bit must be 0 for 16M pages */
> +            if (pteh&  0x80)

Braces

> +                return H_PARAMETER;
> +        } else {
> +            return H_PARAMETER;
> +        }
> +    }
> +
> +    pa = ptel&  HPTE_R_RPN;
> +    /* FIXME: bounds check the pa? */
> +
> +    /* Check WIMG */
> +    if ((ptel&  HPTE_R_WIMG) != HPTE_R_M)

Braces

> +        return H_PARAMETER;
> +    pteh&= ~0x60ULL;
> +
> +    if ((pte_index * HASH_PTE_SIZE_64)&  ~env->htab_mask)

Braces

> +        return H_PARAMETER;
> +    if (likely((flags&  H_EXACT) == 0)) {
> +        pte_index&= ~7ULL;
> +        hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
> +        for (i = 0; ; ++i) {
> +            if (i == 8)

Braces

> +                return H_PTEG_FULL;
> +            if (((ldq_p(hpte)&  HPTE_V_VALID) == 0)&&
> +                lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) {
> +                break;
> +            }
> +            hpte += HASH_PTE_SIZE_64;
> +        }
> +    } else {
> +        i = 0;
> +        hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
> +        if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) {
> +            return H_PTEG_FULL;
> +        }
> +    }
> +    stq_p(hpte + (HASH_PTE_SIZE_64/2), ptel);
> +    /* eieio();  FIXME: need some sort of barrier for smp? */

see above :)

> +    stq_p(hpte, pteh);
> +
> +    assert (!(ldq_p(hpte)&  HPTE_V_HVLOCK));
> +    args[0] = pte_index + i;
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_remove(CPUState *env, sPAPREnvironment *spapr,
> +                             target_ulong opcode, target_ulong *args)
> +{
> +    target_ulong flags = args[0];
> +    target_ulong pte_index = args[1];
> +    target_ulong avpn = args[2];
> +    uint8_t *hpte;
> +    target_ulong v, r, rb;
> +
> +    if ((pte_index * HASH_PTE_SIZE_64)&  ~env->htab_mask) {
> +        return H_PARAMETER;
> +    }
> +
> +    hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
> +    while (!lock_hpte(hpte, HPTE_V_HVLOCK)) {
> +        /* We have no real concurrency in qemu soft-emulation, so we
> +         * will never actually have a contested lock */
> +        assert(0);
> +    }
> +
> +    v = ldq_p(hpte);
> +    r = ldq_p(hpte + (HASH_PTE_SIZE_64/2));
> +
> +    if ((v&  HPTE_V_VALID) == 0 ||
> +        ((flags&  H_AVPN)&&  (v&  ~0x7fULL) != avpn) ||
> +        ((flags&  H_ANDCOND)&&  (v&  avpn) != 0)) {
> +        stq_p(hpte, v&  ~HPTE_V_HVLOCK);
> +        assert (!(ldq_p(hpte)&  HPTE_V_HVLOCK));
> +        return H_NOT_FOUND;
> +    }
> +    args[0] = v&  ~HPTE_V_HVLOCK;
> +    args[1] = r;
> +    stq_p(hpte, 0);
> +    rb = compute_tlbie_rb(v, r, pte_index);
> +//    ppc_tlb_invalidate_one(env, rb);

Huh?

> +    tlb_flush(env, 1);
> +    assert (!(ldq_p(hpte)&  HPTE_V_HVLOCK));
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_protect(CPUState *env, sPAPREnvironment *spapr,
> +                              target_ulong opcode, target_ulong *args)
> +{
> +    target_ulong flags = args[0];
> +    target_ulong pte_index = args[1];
> +    target_ulong avpn = args[2];
> +    uint8_t *hpte;
> +    target_ulong v, r, rb;
> +
> +    if ((pte_index * HASH_PTE_SIZE_64)&  ~env->htab_mask) {
> +        return H_PARAMETER;
> +    }
> +
> +    hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
> +    while (!lock_hpte(hpte, HPTE_V_HVLOCK)) {
> +        /* We have no real concurrency in qemu soft-emulation, so we
> +         * will never actually have a contested lock */
> +        assert(0);
> +    }
> +
> +    v = ldq_p(hpte);
> +    r = ldq_p(hpte + (HASH_PTE_SIZE_64/2));
> +
> +    if ((v&  HPTE_V_VALID) == 0 ||
> +        ((flags&  H_AVPN)&&  (v&  ~0x7fULL) != avpn)) {
> +        stq_p(hpte, v&  ~HPTE_V_HVLOCK);
> +        assert (!(ldq_p(hpte)&  HPTE_V_HVLOCK));
> +        return H_NOT_FOUND;
> +    }
> +
> +    r&= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
> +           HPTE_R_KEY_HI | HPTE_R_KEY_LO);
> +    r |= (flags<<  55)&  HPTE_R_PP0;
> +    r |= (flags<<  48)&  HPTE_R_KEY_HI;
> +    r |= flags&  (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
> +    rb = compute_tlbie_rb(v, r, pte_index);
> +    stq_p(hpte, v&  ~HPTE_V_VALID);
> +    //ppc_tlb_invalidate_one(env, rb);

Huh?

> +    tlb_flush(env, 1);

Wow, why do you need a full tlb flush here?



Alex
David Gibson March 17, 2011, 1:03 a.m. UTC | #2
On Wed, Mar 16, 2011 at 04:03:47PM +0100, Alexander Graf wrote:
> On 03/16/2011 05:56 AM, David Gibson wrote:
[snip]
> >@@ -248,6 +261,16 @@ static void ppc_spapr_init(ram_addr_t ram_size,
> >      ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", ram_size);
> >      cpu_register_physical_memory(0, ram_size, ram_offset);
> >
> >+    /* allocate hash page table */
> >+    htab_size = 1ULL<<  (pteg_shift + 7);
> 
> Linux makes the htab size depend on the provided amount of ram.
> Shouldn't we do the same?

Well... maybe.  In fact the guidelines for hash allocation tend to be
rather larger than really necessary for a Linux guest, so generally
16MB for the hash will be fine.  This does also correspond to the
allocation for the guest hash we use in our experimental kvm code
(making the hash exactly one hugepage makes the necessary contiguous
allocation easier).

[snip]
> >+    r&= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
> >+           HPTE_R_KEY_HI | HPTE_R_KEY_LO);
> >+    r |= (flags<<  55)&  HPTE_R_PP0;
> >+    r |= (flags<<  48)&  HPTE_R_KEY_HI;
> >+    r |= flags&  (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
> >+    rb = compute_tlbie_rb(v, r, pte_index);
> >+    stq_p(hpte, v&  ~HPTE_V_VALID);
> >+    //ppc_tlb_invalidate_one(env, rb);
> 
> Huh?
> 
> >+    tlb_flush(env, 1);
> 
> Wow, why do you need a full tlb flush here?

Ah, meant to revert that and forgot.  Originally I wasn't sure if
compute_tlbie_rb was deducing enough of the full virtual address to
make a targeted TLB invalidate safe.  I've since discovered it does,
but fixing this up to take advantage fell through the cracks.

Fixing that for the next version now.
Alexander Graf March 17, 2011, 7:35 a.m. UTC | #3
On 17.03.2011, at 02:03, David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, Mar 16, 2011 at 04:03:47PM +0100, Alexander Graf wrote:
>> On 03/16/2011 05:56 AM, David Gibson wrote:
> [snip]
>>> @@ -248,6 +261,16 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>>>     ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", ram_size);
>>>     cpu_register_physical_memory(0, ram_size, ram_offset);
>>> 
>>> +    /* allocate hash page table */
>>> +    htab_size = 1ULL<<  (pteg_shift + 7);
>> 
>> Linux makes the htab size depend on the provided amount of ram.
>> Shouldn't we do the same?
> 
> Well... maybe.  In fact the guidelines for hash allocation tend to be
> rather larger than really necessary for a Linux guest, so generally
> 16mb for the hash will be fine.  This does also correspond to the
> allocation for the guest hash we use in our experimental kvm code
> (making the hash exactly one hugepage makes the necessary contiguous
> allocation easier).

Hrm - ok :).

Alex
diff mbox

Patch

diff --git a/hw/spapr.c b/hw/spapr.c
index 25e4a9e..c3d9286 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -50,12 +50,15 @@  static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
                               sPAPREnvironment *spapr,
                               target_phys_addr_t initrd_base,
                               target_phys_addr_t initrd_size,
-                              const char *kernel_cmdline)
+                              const char *kernel_cmdline,
+                              long hash_shift)
 {
     void *fdt;
     uint64_t mem_reg_property[] = { 0, cpu_to_be64(ramsize) };
     uint32_t start_prop = cpu_to_be32(initrd_base);
     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
+    uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
+    char hypertas_prop[] = "hcall-pft\0hcall-term";
     int i;
     char *modelname;
     int ret;
@@ -138,6 +141,7 @@  static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
          * full emu, for kvm we should copy it from the host */
         _FDT((fdt_property_cell(fdt, "clock-frequency", 1000000000)));
         _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
+        _FDT((fdt_property(fdt, "ibm,pft-size", pft_size_prop, sizeof(pft_size_prop))));
         _FDT((fdt_property_string(fdt, "status", "okay")));
         _FDT((fdt_property(fdt, "64-bit", NULL, 0)));
 
@@ -153,6 +157,14 @@  static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
 
     _FDT((fdt_end_node(fdt)));
 
+    /* RTAS */
+    _FDT((fdt_begin_node(fdt, "rtas")));
+
+    _FDT((fdt_property(fdt, "ibm,hypertas-functions", hypertas_prop,
+                       sizeof(hypertas_prop))));
+    
+    _FDT((fdt_end_node(fdt)));
+
     /* vdevice */
     _FDT((fdt_begin_node(fdt, "vdevice")));
 
@@ -203,12 +215,13 @@  static void ppc_spapr_init(ram_addr_t ram_size,
                            const char *cpu_model)
 {
     CPUState *envs[MAX_CPUS];
-    void *fdt;
+    void *fdt, *htab;
     int i;
     ram_addr_t ram_offset;
     target_phys_addr_t fdt_addr;
     uint32_t kernel_base, initrd_base;
-    long kernel_size, initrd_size;
+    long kernel_size, initrd_size, htab_size;
+    long pteg_shift = 17;
     int fdt_size;
     sPAPREnvironment *spapr;
 
@@ -248,6 +261,16 @@  static void ppc_spapr_init(ram_addr_t ram_size,
     ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", ram_size);
     cpu_register_physical_memory(0, ram_size, ram_offset);
 
+    /* allocate hash page table */
+    htab_size = 1ULL << (pteg_shift + 7);
+    htab = qemu_mallocz(htab_size);
+
+    for (i = 0; i < smp_cpus; i++) {
+        envs[i]->external_htab = htab;
+        envs[i]->htab_base = -1;
+        envs[i]->htab_mask = htab_size - 1;
+    }
+
     spapr->vio_bus = spapr_vio_bus_init();
 
     for (i = 0; i < MAX_SERIAL_PORTS; i++) {
@@ -293,7 +316,8 @@  static void ppc_spapr_init(ram_addr_t ram_size,
 
     /* Prepare the device tree */
     fdt = spapr_create_fdt(&fdt_size, ram_size, cpu_model, envs, spapr,
-                           initrd_base, initrd_size, kernel_cmdline);
+                           initrd_base, initrd_size, kernel_cmdline,
+                           pteg_shift + 7);
     if (!fdt) {
         hw_error("Couldn't create pSeries device tree\n");
         exit(1);
diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c
index 6ddac00..2b14000 100644
--- a/hw/spapr_hcall.c
+++ b/hw/spapr_hcall.c
@@ -1,8 +1,246 @@ 
 #include "sysemu.h"
 #include "cpu.h"
 #include "qemu-char.h"
+#include "sysemu.h"
+#include "qemu-char.h"
+#include "exec-all.h"
 #include "hw/spapr.h"
 
+#define HPTES_PER_GROUP 8
+
+#define HPTE_V_SSIZE_SHIFT      62
+#define HPTE_V_AVPN_SHIFT       7
+#define HPTE_V_AVPN             0x3fffffffffffff80ULL
+#define HPTE_V_AVPN_VAL(x)      (((x) & HPTE_V_AVPN) >> HPTE_V_AVPN_SHIFT)
+#define HPTE_V_COMPARE(x,y)     (!(((x) ^ (y)) & 0xffffffffffffff80UL))
+#define HPTE_V_BOLTED           0x0000000000000010ULL
+#define HPTE_V_LOCK             0x0000000000000008ULL
+#define HPTE_V_LARGE            0x0000000000000004ULL
+#define HPTE_V_SECONDARY        0x0000000000000002ULL
+#define HPTE_V_VALID            0x0000000000000001ULL
+
+#define HPTE_R_PP0              0x8000000000000000ULL
+#define HPTE_R_TS               0x4000000000000000ULL
+#define HPTE_R_KEY_HI           0x3000000000000000ULL
+#define HPTE_R_RPN_SHIFT        12
+#define HPTE_R_RPN              0x3ffffffffffff000ULL
+#define HPTE_R_FLAGS            0x00000000000003ffULL
+#define HPTE_R_PP               0x0000000000000003ULL
+#define HPTE_R_N                0x0000000000000004ULL
+#define HPTE_R_G                0x0000000000000008ULL
+#define HPTE_R_M                0x0000000000000010ULL
+#define HPTE_R_I                0x0000000000000020ULL
+#define HPTE_R_W                0x0000000000000040ULL
+#define HPTE_R_WIMG             0x0000000000000078ULL
+#define HPTE_R_C                0x0000000000000080ULL
+#define HPTE_R_R                0x0000000000000100ULL
+#define HPTE_R_KEY_LO           0x0000000000000e00ULL
+
+#define HPTE_V_1TB_SEG          0x4000000000000000ULL
+#define HPTE_V_VRMA_MASK        0x4001ffffff000000ULL
+
+#define HPTE_V_HVLOCK           0x40ULL
+
+static inline int lock_hpte(void *hpte, target_ulong bits)
+{
+    uint64_t pteh;
+
+    pteh = ldq_p(hpte);
+
+    /* FIXME: probably need some sort of lockage for SMP */
+    if (pteh & bits) {
+        return 0;
+    }
+    stq_p(hpte, pteh | HPTE_V_HVLOCK);
+    return 1;
+}
+
+static target_ulong compute_tlbie_rb(target_ulong v, target_ulong r,
+                                     target_ulong pte_index)
+{
+    target_ulong rb, va_low;
+
+    rb = (v & ~0x7fULL) << 16; /* AVA field */
+    va_low = pte_index >> 3;
+    if (v & HPTE_V_SECONDARY)
+        va_low = ~va_low;
+    /* xor vsid from AVA */
+    if (!(v & HPTE_V_1TB_SEG))
+        va_low ^= v >> 12;
+    else
+        va_low ^= v >> 24;
+    va_low &= 0x7ff;
+    if (v & HPTE_V_LARGE) {
+        rb |= 1;                         /* L field */
+#if 0 /* Disable that P7 specific bit for now */
+        if (r & 0xff000) {
+            /* non-16MB large page, must be 64k */
+            /* (masks depend on page size) */
+            rb |= 0x1000;                /* page encoding in LP field */
+            rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
+            rb |= (va_low & 0xfe);       /* AVAL field */
+        }
+#endif
+    } else {
+        /* 4kB page */
+        rb |= (va_low & 0x7ff) << 12;   /* remaining 11b of AVA */
+    }
+    rb |= (v >> 54) & 0x300;            /* B field */
+    return rb;
+}
+
+static target_ulong h_enter(CPUState *env, sPAPREnvironment *spapr,
+                            target_ulong opcode, target_ulong *args)
+{
+    target_ulong flags = args[0];
+    target_ulong pte_index = args[1];
+    target_ulong pteh = args[2];
+    target_ulong ptel = args[3];
+    target_ulong porder;
+    target_ulong i, pa;
+    uint8_t *hpte;
+
+    /* only handle 4k and 16M pages for now */
+    porder = 12;
+    if (pteh & HPTE_V_LARGE) {
+        if ((ptel & 0xf000) == 0x1000) {
+            /* 64k page */
+            porder = 16;
+        } else if ((ptel & 0xff000) == 0) {
+            /* 16M page */
+            porder = 24;
+            /* lowest AVA bit must be 0 for 16M pages */
+            if (pteh & 0x80)
+                return H_PARAMETER;
+        } else {
+            return H_PARAMETER;
+        }
+    }
+
+    pa = ptel & HPTE_R_RPN;
+    /* FIXME: bounds check the pa? */
+
+    /* Check WIMG */
+    if ((ptel & HPTE_R_WIMG) != HPTE_R_M)
+        return H_PARAMETER;
+    pteh &= ~0x60ULL;
+
+    if ((pte_index * HASH_PTE_SIZE_64) & ~env->htab_mask)
+        return H_PARAMETER;
+    if (likely((flags & H_EXACT) == 0)) {
+        pte_index &= ~7ULL;
+        hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
+        for (i = 0; ; ++i) {
+            if (i == 8)
+                return H_PTEG_FULL;
+            if (((ldq_p(hpte) & HPTE_V_VALID) == 0) &&
+                lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) {
+                break;
+            }
+            hpte += HASH_PTE_SIZE_64;
+        }
+    } else {
+        i = 0;
+        hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
+        if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) {
+            return H_PTEG_FULL;
+        }
+    }
+    stq_p(hpte + (HASH_PTE_SIZE_64/2), ptel);
+    /* eieio();  FIXME: need some sort of barrier for smp? */
+    stq_p(hpte, pteh);
+
+    assert (!(ldq_p(hpte) & HPTE_V_HVLOCK));
+    args[0] = pte_index + i;
+    return H_SUCCESS;
+}
+
+static target_ulong h_remove(CPUState *env, sPAPREnvironment *spapr,
+                             target_ulong opcode, target_ulong *args)
+{
+    target_ulong flags = args[0];
+    target_ulong pte_index = args[1];
+    target_ulong avpn = args[2];
+    uint8_t *hpte;
+    target_ulong v, r, rb;
+
+    if ((pte_index * HASH_PTE_SIZE_64) & ~env->htab_mask) {
+        return H_PARAMETER;
+    }
+
+    hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
+    while (!lock_hpte(hpte, HPTE_V_HVLOCK)) {
+        /* We have no real concurrency in qemu soft-emulation, so we
+         * will never actually have a contested lock */
+        assert(0);
+    }
+
+    v = ldq_p(hpte);
+    r = ldq_p(hpte + (HASH_PTE_SIZE_64/2));
+
+    if ((v & HPTE_V_VALID) == 0 ||
+        ((flags & H_AVPN) && (v & ~0x7fULL) != avpn) ||
+        ((flags & H_ANDCOND) && (v & avpn) != 0)) {
+        stq_p(hpte, v & ~HPTE_V_HVLOCK);
+        assert (!(ldq_p(hpte) & HPTE_V_HVLOCK));
+        return H_NOT_FOUND;
+    }
+    args[0] = v & ~HPTE_V_HVLOCK;
+    args[1] = r;
+    stq_p(hpte, 0);
+    rb = compute_tlbie_rb(v, r, pte_index);
+//    ppc_tlb_invalidate_one(env, rb);
+    tlb_flush(env, 1);
+    assert (!(ldq_p(hpte) & HPTE_V_HVLOCK));
+    return H_SUCCESS;
+}
+
+static target_ulong h_protect(CPUState *env, sPAPREnvironment *spapr,
+                              target_ulong opcode, target_ulong *args)
+{
+    target_ulong flags = args[0];
+    target_ulong pte_index = args[1];
+    target_ulong avpn = args[2];
+    uint8_t *hpte;
+    target_ulong v, r, rb;
+
+    if ((pte_index * HASH_PTE_SIZE_64) & ~env->htab_mask) {
+        return H_PARAMETER;
+    }
+
+    hpte = env->external_htab + (pte_index * HASH_PTE_SIZE_64);
+    while (!lock_hpte(hpte, HPTE_V_HVLOCK)) {
+        /* We have no real concurrency in qemu soft-emulation, so we
+         * will never actually have a contested lock */
+        assert(0);
+    }
+
+    v = ldq_p(hpte);
+    r = ldq_p(hpte + (HASH_PTE_SIZE_64/2));
+
+    if ((v & HPTE_V_VALID) == 0 ||
+        ((flags & H_AVPN) && (v & ~0x7fULL) != avpn)) {
+        stq_p(hpte, v & ~HPTE_V_HVLOCK);
+        assert (!(ldq_p(hpte) & HPTE_V_HVLOCK));
+        return H_NOT_FOUND;
+    }
+
+    r &= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+           HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+    r |= (flags << 55) & HPTE_R_PP0;
+    r |= (flags << 48) & HPTE_R_KEY_HI;
+    r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+    rb = compute_tlbie_rb(v, r, pte_index);
+    stq_p(hpte, v & ~HPTE_V_VALID);
+    //ppc_tlb_invalidate_one(env, rb);
+    tlb_flush(env, 1);
+    stq_p(hpte + (HASH_PTE_SIZE_64/2), r);
+    /* eieio(); FIXME: need some sort of barrier on smp? */
+    stq_p(hpte, v & ~HPTE_V_HVLOCK);
+    assert (!(ldq_p(hpte) & HPTE_V_HVLOCK));
+    return H_SUCCESS;
+}
+
 struct hypercall {
     spapr_hcall_fn fn;
 } hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
@@ -41,3 +279,12 @@  target_ulong spapr_hypercall(CPUState *env, sPAPREnvironment *spapr,
     fprintf(stderr, "Unimplemented hcall 0x" TARGET_FMT_lx "\n", opcode);
     return H_FUNCTION;
 }
+
+static void hypercall_init(void)
+{
+    /* hcall-pft */
+    spapr_register_hypercall(H_ENTER, h_enter);
+    spapr_register_hypercall(H_REMOVE, h_remove);
+    spapr_register_hypercall(H_PROTECT, h_protect);
+}
+device_init(hypercall_init);
diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
index 3a47d11..29d6b49 100644
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -670,6 +670,8 @@  struct CPUPPCState {
     target_phys_addr_t htab_base;
     target_phys_addr_t htab_mask;
     target_ulong sr[32];
+    /* externally stored hash table */
+    uint8_t *external_htab;
     /* BATs */
     int nb_BATs;
     target_ulong DBAT[2][8];
diff --git a/target-ppc/helper.c b/target-ppc/helper.c
index 13a5ab1..5ead62f 100644
--- a/target-ppc/helper.c
+++ b/target-ppc/helper.c
@@ -585,8 +585,13 @@  static inline int _find_pte(CPUState *env, mmu_ctx_t *ctx, int is_64b, int h,
     for (i = 0; i < 8; i++) {
 #if defined(TARGET_PPC64)
         if (is_64b) {
-            pte0 = ldq_phys(env->htab_base + pteg_off + (i * 16));
-            pte1 = ldq_phys(env->htab_base + pteg_off + (i * 16) + 8);
+            if (env->external_htab) {
+                pte0 = ldq_p(env->external_htab + pteg_off + (i * 16));
+                pte1 = ldq_p(env->external_htab + pteg_off + (i * 16) + 8);
+            } else {
+                pte0 = ldq_phys(env->htab_base + pteg_off + (i * 16));
+                pte1 = ldq_phys(env->htab_base + pteg_off + (i * 16) + 8);
+            }
 
             /* We have a TLB that saves 4K pages, so let's
              * split a huge page to 4k chunks */
@@ -602,8 +607,13 @@  static inline int _find_pte(CPUState *env, mmu_ctx_t *ctx, int is_64b, int h,
         } else
 #endif
         {
-            pte0 = ldl_phys(env->htab_base + pteg_off + (i * 8));
-            pte1 =  ldl_phys(env->htab_base + pteg_off + (i * 8) + 4);
+            if (env->external_htab) {
+                pte0 = ldl_p(env->external_htab + pteg_off + (i * 8));
+                pte1 = ldl_p(env->external_htab + pteg_off + (i * 8) + 4);
+            } else {
+                pte0 = ldl_phys(env->htab_base + pteg_off + (i * 8));
+                pte1 = ldl_phys(env->htab_base + pteg_off + (i * 8) + 4);
+            }
             r = pte32_check(ctx, pte0, pte1, h, rw, type);
             LOG_MMU("Load pte from " TARGET_FMT_lx " => " TARGET_FMT_lx " "
                     TARGET_FMT_lx " %d %d %d " TARGET_FMT_lx "\n",
@@ -643,13 +653,23 @@  static inline int _find_pte(CPUState *env, mmu_ctx_t *ctx, int is_64b, int h,
         if (pte_update_flags(ctx, &pte1, ret, rw) == 1) {
 #if defined(TARGET_PPC64)
             if (is_64b) {
-                stq_phys_notdirty(env->htab_base + pteg_off + (good * 16) + 8,
-                                  pte1);
+                if (env->external_htab) {
+                    stq_p(env->external_htab + pteg_off + (good * 16) + 8,
+                          pte1);
+                } else {
+                    stq_phys_notdirty(env->htab_base + pteg_off +
+                                      (good * 16) + 8, pte1);
+                }
             } else
 #endif
             {
-                stl_phys_notdirty(env->htab_base + pteg_off + (good * 8) + 4,
-                                  pte1);
+                if (env->external_htab) {
+                    stl_p(env->external_htab + pteg_off + (good * 8) + 4,
+                          pte1);
+                } else {
+                    stl_phys_notdirty(env->htab_base + pteg_off +
+                                      (good * 8) + 4, pte1);
+                }
             }
         }
     }