Patchwork pseries: Add support for new KVM hash table control call

login
register
mail settings
Submitter Benjamin Herrenschmidt
Date June 27, 2012, 12:10 p.m.
Message ID <1340799048.3732.55.camel@pasglop>
Download mbox | patch
Permalink /patch/167631/
State New
Headers show

Comments

Benjamin Herrenschmidt - June 27, 2012, 12:10 p.m.
From: David Gibson <david@gibson.dropbear.id.au>

This adds support for the new "reset htab" ioctl which allows qemu
to properly clean up the MMU hash table when the guest is reset. With
the corresponding kernel support, reset of a guest now works properly.

This also paves the way for indicating a different size hash table
to the kernel and for the kernel to be able to impose limits on
the requested size.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 hw/spapr.c           |   88 ++++++++++++++++++++++++++++++++------------------
 hw/spapr.h           |    2 +-
 target-ppc/kvm.c     |   17 ++++++++++
 target-ppc/kvm_ppc.h |    7 ++++
 4 files changed, 82 insertions(+), 32 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt - June 27, 2012, 12:12 p.m.
On Wed, 2012-06-27 at 22:10 +1000, Benjamin Herrenschmidt wrote:
> From: David Gibson <david@gibson.dropbear.id.au>
> 
> This adds support for the new "reset htab" ioctl which allows qemu
> to properly clean up the MMU hash table when the guest is reset. With
> the corresponding kernel support, reset of a guest now works properly.

Forgot to mention ... this depends on a newer linux kvm.h from Avi's
-next branch, so don't apply this patch to qemu until kvm.h has the
update adding the definitions for KVM_CAP_PPC_ALLOC_HTAB and
KVM_PPC_ALLOCATE_HTAB.

Cheers,
Ben.

> This also paves the way for indicating a different size hash table
> to the kernel and for the kernel to be able to impose limits on
> the requested size.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
>  hw/spapr.c           |   88 ++++++++++++++++++++++++++++++++------------------
>  hw/spapr.h           |    2 +-
>  target-ppc/kvm.c     |   17 ++++++++++
>  target-ppc/kvm_ppc.h |    7 ++++
>  4 files changed, 82 insertions(+), 32 deletions(-)
> 
> diff --git a/hw/spapr.c b/hw/spapr.c
> index a6bc5e8..e19dbd8 100644
> --- a/hw/spapr.c
> +++ b/hw/spapr.c
> @@ -83,6 +83,8 @@
>  
>  #define PHANDLE_XICP            0x00001111
>  
> +#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
> +
>  sPAPREnvironment *spapr;
>  static int spapr_has_graphics;
>  
> @@ -111,12 +113,13 @@ qemu_irq spapr_allocate_irq(uint32_t hint, uint32_t *irq_num,
>      return qirq;
>  }
>  
> -static int spapr_set_associativity(void *fdt, sPAPREnvironment *spapr)
> +static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr)
>  {
>      int ret = 0, offset;
>      CPUPPCState *env;
>      char cpu_model[32];
>      int smt = kvmppc_smt_threads();
> +    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
>  
>      assert(spapr->cpu_model);
>  
> @@ -140,8 +143,16 @@ static int spapr_set_associativity(void *fdt, sPAPREnvironment *spapr)
>              return offset;
>          }
>  
> -        ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
> -                          sizeof(associativity));
> +        if (nb_numa_nodes > 1) {
> +            ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
> +                              sizeof(associativity));
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        }
> +
> +        ret = fdt_setprop(fdt, offset, "ibm,pft-size",
> +                          pft_size_prop, sizeof(pft_size_prop));
>          if (ret < 0) {
>              return ret;
>          }
> @@ -189,15 +200,13 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
>                                     target_phys_addr_t initrd_size,
>                                     target_phys_addr_t kernel_size,
>                                     const char *boot_device,
> -                                   const char *kernel_cmdline,
> -                                   long hash_shift)
> +                                   const char *kernel_cmdline)
>  {
>      void *fdt;
>      CPUPPCState *env;
>      uint64_t mem_reg_property[2];
>      uint32_t start_prop = cpu_to_be32(initrd_base);
>      uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
> -    uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
>      char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
>          "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk";
>      char qemu_hypertas_prop[] = "hcall-memop1";
> @@ -366,8 +375,6 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
>          _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq)));
>          _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq)));
>          _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
> -        _FDT((fdt_property(fdt, "ibm,pft-size",
> -                           pft_size_prop, sizeof(pft_size_prop))));
>          _FDT((fdt_property_string(fdt, "status", "okay")));
>          _FDT((fdt_property(fdt, "64-bit", NULL, 0)));
>  
> @@ -502,11 +509,9 @@ static void spapr_finalize_fdt(sPAPREnvironment *spapr,
>      }
>  
>      /* Advertise NUMA via ibm,associativity */
> -    if (nb_numa_nodes > 1) {
> -        ret = spapr_set_associativity(fdt, spapr);
> -        if (ret < 0) {
> -            fprintf(stderr, "Couldn't set up NUMA device tree properties\n");
> -        }
> +    ret = spapr_fixup_cpu_dt(fdt, spapr);
> +    if (ret < 0) {
> +        fprintf(stderr, "Couldn't finalize CPU device tree properties\n");
>      }
>  
>      if (!spapr_has_graphics) {
> @@ -536,12 +541,34 @@ static void emulate_spapr_hypercall(CPUPPCState *env)
>      env->gpr[3] = spapr_hypercall(env, env->gpr[3], &env->gpr[4]);
>  }
>  
> -static void spapr_reset(void *opaque)
> +static void spapr_reset_htab(void *opaque)
>  {
>      sPAPREnvironment *spapr = (sPAPREnvironment *)opaque;
> +    long shift;
> +
> +    /* allocate hash page table.  For now we always make this 16mb,
> +     * later we should probably make it scale to the size of guest
> +     * RAM */
> +
> +    shift = kvmppc_reset_htab(spapr->htab_shift);
> +
> +    if (shift > 0) {
> +        /* Kernel handles htab, we don't need to allocate one */
> +        spapr->htab_shift = shift;
> +    } else {
> +        if (!spapr->htab) {
> +            /* Allocate an htab if we don't yet have one */
> +            spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
> +        }
> +
> +        /* And clear it */
> +        memset(spapr->htab, 0, HTAB_SIZE(spapr));
> +    }
> +}
>  
> -    /* flush out the hash table */
> -    memset(spapr->htab, 0, spapr->htab_size);
> +static void spapr_reset(void *opaque)
> +{
> +    sPAPREnvironment *spapr = (sPAPREnvironment *)opaque;
>  
>      /* Load the fdt */
>      spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
> @@ -558,8 +585,16 @@ static void spapr_reset(void *opaque)
>  static void spapr_cpu_reset(void *opaque)
>  {
>      PowerPCCPU *cpu = opaque;
> +    CPUPPCState *env = &cpu->env;
>  
>      cpu_reset(CPU(cpu));
> +
> +    env->external_htab = spapr->htab;
> +    env->htab_base = -1;
> +    env->htab_mask = HTAB_SIZE(spapr) - 1;
> +
> +    env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
> +        (spapr->htab_shift - 18);
>  }
>  
>  static int spapr_vga_init(PCIBus *pci_bus)
> @@ -603,7 +638,6 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>      uint32_t initrd_base = 0;
>      long kernel_size = 0, initrd_size = 0;
>      long load_limit, rtas_limit, fw_size;
> -    long pteg_shift = 17;
>      char *filename;
>  
>      spapr = g_malloc0(sizeof(*spapr));
> @@ -632,6 +666,11 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>      spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
>      load_limit = spapr->fdt_addr - FW_OVERHEAD;
>  
> +    /* For now, always aim for a 16MB hash table */
> +    /* FIXME: we should change this default based on RAM size */
> +    spapr->htab_shift = 24;
> +    qemu_register_reset(spapr_reset_htab, spapr);
> +
>      /* init CPUs */
>      if (cpu_model == NULL) {
>          cpu_model = kvm_enabled() ? "host" : "POWER7";
> @@ -664,20 +703,8 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>          memory_region_add_subregion(sysmem, nonrma_base, ram);
>      }
>  
> -    /* allocate hash page table.  For now we always make this 16mb,
> -     * later we should probably make it scale to the size of guest
> -     * RAM */
> -    spapr->htab_size = 1ULL << (pteg_shift + 7);
> -    spapr->htab = qemu_memalign(spapr->htab_size, spapr->htab_size);
> -
>      for (env = first_cpu; env != NULL; env = env->next_cpu) {
> -        env->external_htab = spapr->htab;
> -        env->htab_base = -1;
> -        env->htab_mask = spapr->htab_size - 1;
> -
>          /* Tell KVM that we're in PAPR mode */
> -        env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
> -                             ((pteg_shift + 7) - 18);
>          env->spr[SPR_HIOR] = 0;
>  
>          if (kvm_enabled()) {
> @@ -816,8 +843,7 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>      spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, rma_size,
>                                              initrd_base, initrd_size,
>                                              kernel_size,
> -                                            boot_device, kernel_cmdline,
> -                                            pteg_shift + 7);
> +                                            boot_device, kernel_cmdline);
>      assert(spapr->fdt_skel != NULL);
>  
>      qemu_register_reset(spapr_reset, spapr);
> diff --git a/hw/spapr.h b/hw/spapr.h
> index 9153f29..7ec4d7c 100644
> --- a/hw/spapr.h
> +++ b/hw/spapr.h
> @@ -15,7 +15,7 @@ typedef struct sPAPREnvironment {
>  
>      target_phys_addr_t ram_limit;
>      void *htab;
> -    long htab_size;
> +    long htab_shift;
>      target_phys_addr_t fdt_addr, rtas_addr;
>      long rtas_size;
>      void *fdt_skel;
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 829e180..12ae0d7 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -1101,6 +1101,23 @@ int kvmppc_remove_spapr_tce(void *table, int fd, uint32_t window_size)
>      return 0;
>  }
>  
> +int kvmppc_reset_htab(int shift_hint)
> +{
> +    uint32_t shift = shift_hint;
> +
> +    if (kvm_check_extension(kvm_state, KVM_CAP_PPC_ALLOC_HTAB)) {
> +        int ret;
> +        ret = kvm_vm_ioctl(kvm_state, KVM_PPC_ALLOCATE_HTAB, &shift);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        return shift;
> +    }
> +
> +    /* For now.. */
> +    return 0;
> +}
> +
>  static inline uint32_t mfpvr(void)
>  {
>      uint32_t pvr;
> diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
> index e2f8703..b5be657 100644
> --- a/target-ppc/kvm_ppc.h
> +++ b/target-ppc/kvm_ppc.h
> @@ -27,6 +27,7 @@ int kvmppc_smt_threads(void);
>  off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem);
>  void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd);
>  int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
> +int kvmppc_reset_htab(int shift_hint);
>  #endif /* !CONFIG_USER_ONLY */
>  const ppc_def_t *kvmppc_host_cpu_def(void);
>  int kvmppc_fixup_cpu(CPUPPCState *env);
> @@ -94,6 +95,12 @@ static inline int kvmppc_remove_spapr_tce(void *table, int pfd,
>  {
>      return -1;
>  }
> +
> +static inline int kvmppc_reset_htab(int shift_hint)
> +{
> +    return -1;
> +}
> +
>  #endif /* !CONFIG_USER_ONLY */
>  
>  static inline const ppc_def_t *kvmppc_host_cpu_def(void)
> 


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf - June 27, 2012, 12:30 p.m.
On 27.06.2012, at 14:10, Benjamin Herrenschmidt wrote:

> From: David Gibson <david@gibson.dropbear.id.au>
> 
> This adds support for the new "reset htab" ioctl which allows qemu
> to properly clean up the MMU hash table when the guest is reset. With
> the corresponding kernel support, reset of a guest now works properly.
> 
> This also paves the way for indicating a different size hash table
> to the kernel and for the kernel to be able to impose limits on
> the requested size.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Thanks, applied to ppc-next. Next time, please base on top of a newer git base - I had to manually fix the patch to apply.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt - June 27, 2012, 9:30 p.m.
On Wed, 2012-06-27 at 14:30 +0200, Alexander Graf wrote:
> Thanks, applied to ppc-next. Next time, please base on top of a newer
> git base - I had to manually fix the patch to apply.

It was based on top of qemu master from yesterday. As you know that's
what I work on top of. Did you make sure you had the updated linux
headers btw ? :-)

Cheers,
Ben.

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf - June 27, 2012, 9:33 p.m.
On 27.06.2012, at 23:30, Benjamin Herrenschmidt wrote:

> On Wed, 2012-06-27 at 14:30 +0200, Alexander Graf wrote:
>> Thanks, applied to ppc-next. Next time, please base on top of a newer
>> git base - I had to manually fix the patch to apply.
> 
> It was based on top of qemu master from yesterday.

Yesterday? When yesterday? My queue got applied yesterday, dramatically changing the tree :).

> As you know that's what I work on top of. Did you make sure you had the updated linux
> headers btw ? :-)

Of course :)


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt - July 10, 2012, 7:25 a.m.
On Wed, 2012-06-27 at 22:10 +1000, Benjamin Herrenschmidt wrote:
> From: David Gibson <david@gibson.dropbear.id.au>
> 
> This adds support for the new "reset htab" ioctl which allows qemu
> to properly clean up the MMU hash table when the guest is reset. With
> the corresponding kernel support, reset of a guest now works properly.
> 
> This also paves the way for indicating a different size hash table
> to the kernel and for the kernel to be able to impose limits on
> the requested size.

Alex, this has a bug. If you already applied it, please sneak this fix in:

> +int kvmppc_reset_htab(int shift_hint)
> +{
> +    uint32_t shift = shift_hint;
> +
> +    if (kvm_check_extension(kvm_state, KVM_CAP_PPC_ALLOC_HTAB)) {

The above should be if (kvm_enabled() && ....

Otherwise it will segfault in kvm_check_extension when KVM is not enabled.

Or let me know if I should re-submit.

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt - July 10, 2012, 8:16 a.m.
On Tue, 2012-07-10 at 17:25 +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2012-06-27 at 22:10 +1000, Benjamin Herrenschmidt wrote:
> > From: David Gibson <david@gibson.dropbear.id.au>
> > 
> > This adds support for the new "reset htab" ioctl which allows qemu
> > to properly clean up the MMU hash table when the guest is reset. With
> > the corresponding kernel support, reset of a guest now works properly.
> > 
> > This also paves the way for indicating a different size hash table
> > to the kernel and for the kernel to be able to impose limits on
> > the requested size.
> 
> Alex, this has a bug, if you already applied it, please sneak:

Actually just drop the whole thing, it also breaks PR KVM, I need
to work a bit more on it.

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf - July 10, 2012, 10:34 a.m.
On 10.07.2012, at 10:16, Benjamin Herrenschmidt wrote:

> On Tue, 2012-07-10 at 17:25 +1000, Benjamin Herrenschmidt wrote:
>> On Wed, 2012-06-27 at 22:10 +1000, Benjamin Herrenschmidt wrote:
>>> From: David Gibson <david@gibson.dropbear.id.au>
>>> 
>>> This adds support for the new "reset htab" ioctl which allows qemu
>>> to properly clean up the MMU hash table when the guest is reset. With
>>> the corresponding kernel support, reset of a guest now works properly.
>>> 
>>> This also paves the way for indicating a different size hash table
>>> to the kernel and for the kernel to be able to impose limits on
>>> the requested size.
>> 
>> Alex, this has a bug, if you already applied it, please sneak:
> 
> Actually just drop the whole thing, it also breaks PR KVM, I need
> to work a bit more on it.

Alrighty. Dropped :).


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/hw/spapr.c b/hw/spapr.c
index a6bc5e8..e19dbd8 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -83,6 +83,8 @@ 
 
 #define PHANDLE_XICP            0x00001111
 
+#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
+
 sPAPREnvironment *spapr;
 static int spapr_has_graphics;
 
@@ -111,12 +113,13 @@  qemu_irq spapr_allocate_irq(uint32_t hint, uint32_t *irq_num,
     return qirq;
 }
 
-static int spapr_set_associativity(void *fdt, sPAPREnvironment *spapr)
+static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr)
 {
     int ret = 0, offset;
     CPUPPCState *env;
     char cpu_model[32];
     int smt = kvmppc_smt_threads();
+    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
 
     assert(spapr->cpu_model);
 
@@ -140,8 +143,16 @@  static int spapr_set_associativity(void *fdt, sPAPREnvironment *spapr)
             return offset;
         }
 
-        ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
-                          sizeof(associativity));
+        if (nb_numa_nodes > 1) {
+            ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
+                              sizeof(associativity));
+            if (ret < 0) {
+                return ret;
+            }
+        }
+
+        ret = fdt_setprop(fdt, offset, "ibm,pft-size",
+                          pft_size_prop, sizeof(pft_size_prop));
         if (ret < 0) {
             return ret;
         }
@@ -189,15 +200,13 @@  static void *spapr_create_fdt_skel(const char *cpu_model,
                                    target_phys_addr_t initrd_size,
                                    target_phys_addr_t kernel_size,
                                    const char *boot_device,
-                                   const char *kernel_cmdline,
-                                   long hash_shift)
+                                   const char *kernel_cmdline)
 {
     void *fdt;
     CPUPPCState *env;
     uint64_t mem_reg_property[2];
     uint32_t start_prop = cpu_to_be32(initrd_base);
     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
-    uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
     char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
         "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk";
     char qemu_hypertas_prop[] = "hcall-memop1";
@@ -366,8 +375,6 @@  static void *spapr_create_fdt_skel(const char *cpu_model,
         _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq)));
         _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq)));
         _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
-        _FDT((fdt_property(fdt, "ibm,pft-size",
-                           pft_size_prop, sizeof(pft_size_prop))));
         _FDT((fdt_property_string(fdt, "status", "okay")));
         _FDT((fdt_property(fdt, "64-bit", NULL, 0)));
 
@@ -502,11 +509,9 @@  static void spapr_finalize_fdt(sPAPREnvironment *spapr,
     }
 
     /* Advertise NUMA via ibm,associativity */
-    if (nb_numa_nodes > 1) {
-        ret = spapr_set_associativity(fdt, spapr);
-        if (ret < 0) {
-            fprintf(stderr, "Couldn't set up NUMA device tree properties\n");
-        }
+    ret = spapr_fixup_cpu_dt(fdt, spapr);
+    if (ret < 0) {
+        fprintf(stderr, "Couldn't finalize CPU device tree properties\n");
     }
 
     if (!spapr_has_graphics) {
@@ -536,12 +541,34 @@  static void emulate_spapr_hypercall(CPUPPCState *env)
     env->gpr[3] = spapr_hypercall(env, env->gpr[3], &env->gpr[4]);
 }
 
-static void spapr_reset(void *opaque)
+static void spapr_reset_htab(void *opaque)
 {
     sPAPREnvironment *spapr = (sPAPREnvironment *)opaque;
+    long shift;
+
+    /* allocate hash page table.  For now we always make this 16mb,
+     * later we should probably make it scale to the size of guest
+     * RAM */
+
+    shift = kvmppc_reset_htab(spapr->htab_shift);
+
+    if (shift > 0) {
+        /* Kernel handles htab, we don't need to allocate one */
+        spapr->htab_shift = shift;
+    } else {
+        if (!spapr->htab) {
+            /* Allocate an htab if we don't yet have one */
+            spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
+        }
+
+        /* And clear it */
+        memset(spapr->htab, 0, HTAB_SIZE(spapr));
+    }
+}
 
-    /* flush out the hash table */
-    memset(spapr->htab, 0, spapr->htab_size);
+static void spapr_reset(void *opaque)
+{
+    sPAPREnvironment *spapr = (sPAPREnvironment *)opaque;
 
     /* Load the fdt */
     spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
@@ -558,8 +585,16 @@  static void spapr_reset(void *opaque)
 static void spapr_cpu_reset(void *opaque)
 {
     PowerPCCPU *cpu = opaque;
+    CPUPPCState *env = &cpu->env;
 
     cpu_reset(CPU(cpu));
+
+    env->external_htab = spapr->htab;
+    env->htab_base = -1;
+    env->htab_mask = HTAB_SIZE(spapr) - 1;
+
+    env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
+        (spapr->htab_shift - 18);
 }
 
 static int spapr_vga_init(PCIBus *pci_bus)
@@ -603,7 +638,6 @@  static void ppc_spapr_init(ram_addr_t ram_size,
     uint32_t initrd_base = 0;
     long kernel_size = 0, initrd_size = 0;
     long load_limit, rtas_limit, fw_size;
-    long pteg_shift = 17;
     char *filename;
 
     spapr = g_malloc0(sizeof(*spapr));
@@ -632,6 +666,11 @@  static void ppc_spapr_init(ram_addr_t ram_size,
     spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
     load_limit = spapr->fdt_addr - FW_OVERHEAD;
 
+    /* For now, always aim for a 16MB hash table */
+    /* FIXME: we should change this default based on RAM size */
+    spapr->htab_shift = 24;
+    qemu_register_reset(spapr_reset_htab, spapr);
+
     /* init CPUs */
     if (cpu_model == NULL) {
         cpu_model = kvm_enabled() ? "host" : "POWER7";
@@ -664,20 +703,8 @@  static void ppc_spapr_init(ram_addr_t ram_size,
         memory_region_add_subregion(sysmem, nonrma_base, ram);
     }
 
-    /* allocate hash page table.  For now we always make this 16mb,
-     * later we should probably make it scale to the size of guest
-     * RAM */
-    spapr->htab_size = 1ULL << (pteg_shift + 7);
-    spapr->htab = qemu_memalign(spapr->htab_size, spapr->htab_size);
-
     for (env = first_cpu; env != NULL; env = env->next_cpu) {
-        env->external_htab = spapr->htab;
-        env->htab_base = -1;
-        env->htab_mask = spapr->htab_size - 1;
-
         /* Tell KVM that we're in PAPR mode */
-        env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
-                             ((pteg_shift + 7) - 18);
         env->spr[SPR_HIOR] = 0;
 
         if (kvm_enabled()) {
@@ -816,8 +843,7 @@  static void ppc_spapr_init(ram_addr_t ram_size,
     spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, rma_size,
                                             initrd_base, initrd_size,
                                             kernel_size,
-                                            boot_device, kernel_cmdline,
-                                            pteg_shift + 7);
+                                            boot_device, kernel_cmdline);
     assert(spapr->fdt_skel != NULL);
 
     qemu_register_reset(spapr_reset, spapr);
diff --git a/hw/spapr.h b/hw/spapr.h
index 9153f29..7ec4d7c 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -15,7 +15,7 @@  typedef struct sPAPREnvironment {
 
     target_phys_addr_t ram_limit;
     void *htab;
-    long htab_size;
+    long htab_shift;
     target_phys_addr_t fdt_addr, rtas_addr;
     long rtas_size;
     void *fdt_skel;
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 829e180..12ae0d7 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -1101,6 +1101,23 @@  int kvmppc_remove_spapr_tce(void *table, int fd, uint32_t window_size)
     return 0;
 }
 
+int kvmppc_reset_htab(int shift_hint)
+{
+    uint32_t shift = shift_hint;
+
+    if (kvm_check_extension(kvm_state, KVM_CAP_PPC_ALLOC_HTAB)) {
+        int ret;
+        ret = kvm_vm_ioctl(kvm_state, KVM_PPC_ALLOCATE_HTAB, &shift);
+        if (ret < 0) {
+            return ret;
+        }
+        return shift;
+    }
+
+    /* For now.. */
+    return 0;
+}
+
 static inline uint32_t mfpvr(void)
 {
     uint32_t pvr;
diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
index e2f8703..b5be657 100644
--- a/target-ppc/kvm_ppc.h
+++ b/target-ppc/kvm_ppc.h
@@ -27,6 +27,7 @@  int kvmppc_smt_threads(void);
 off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem);
 void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd);
 int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
+int kvmppc_reset_htab(int shift_hint);
 #endif /* !CONFIG_USER_ONLY */
 const ppc_def_t *kvmppc_host_cpu_def(void);
 int kvmppc_fixup_cpu(CPUPPCState *env);
@@ -94,6 +95,12 @@  static inline int kvmppc_remove_spapr_tce(void *table, int pfd,
 {
     return -1;
 }
+
+static inline int kvmppc_reset_htab(int shift_hint)
+{
+    return -1;
+}
+
 #endif /* !CONFIG_USER_ONLY */
 
 static inline const ppc_def_t *kvmppc_host_cpu_def(void)