| Submitter | David Gibson |
|---|---|
| Date | Sept. 29, 2011, 6:45 a.m. |
| Message ID | <1317278706-16105-3-git-send-email-david@gibson.dropbear.id.au> |
| Download | mbox | patch |
| Permalink | /patch/116907/ |
| State | New |
| Headers | show |
Comments
On 29.09.2011, at 08:45, David Gibson wrote: > At present, using the hypervisor aware Book3S-HV KVM will only work > with qemu on POWER7 CPUs. PPC970 CPUs also have hypervisor > capability, but they lack the VRMA feature which makes assigning guest > memory easier. > > In order to allow KVM Book3S-HV on PPC970, we need to specially > allocate the first chunk of guest memory (the "Real Mode Area" or > RMA), so that it is physically contiguous. > > Sufficiently recent host kernels allow such contiguous RMAs to be > allocated, with a kvm capability advertising whether the feature is > available and/or necessary on this hardware. This patch enables qemu > to use this support, thus allowing kvm acceleration of pseries qemu > machines on PPC970 hardware. > > Signed-off-by: Paul Mackerras <paulus@samba.org> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au> > --- > hw/spapr.c | 50 ++++++++++++++++++++++++++++++++++++++++-------- > target-ppc/kvm.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ > target-ppc/kvm_ppc.h | 6 +++++ > 3 files changed, 98 insertions(+), 9 deletions(-) > > diff --git a/hw/spapr.c b/hw/spapr.c > index ba9ae1c..d51425a 100644 > --- a/hw/spapr.c > +++ b/hw/spapr.c > @@ -89,6 +89,7 @@ qemu_irq spapr_allocate_irq(uint32_t hint, uint32_t *irq_num) > } > > static void *spapr_create_fdt_skel(const char *cpu_model, > + target_phys_addr_t rma_size, > target_phys_addr_t initrd_base, > target_phys_addr_t initrd_size, > const char *boot_device, > @@ -97,7 +98,9 @@ static void *spapr_create_fdt_skel(const char *cpu_model, > { > void *fdt; > CPUState *env; > - uint64_t mem_reg_property[] = { 0, cpu_to_be64(ram_size) }; > + uint64_t mem_reg_property_rma[] = { 0, cpu_to_be64(rma_size) }; > + uint64_t mem_reg_property_nonrma[] = { cpu_to_be64(rma_size), > + cpu_to_be64(ram_size - rma_size) }; > uint32_t start_prop = cpu_to_be32(initrd_base); > uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size); > uint32_t pft_size_prop[] = {0, 
cpu_to_be32(hash_shift)};
> @@ -143,15 +146,25 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
>
> _FDT((fdt_end_node(fdt)));
>
> - /* memory node */
> + /* memory node(s) */
> _FDT((fdt_begin_node(fdt, "memory@0")));
>
> _FDT((fdt_property_string(fdt, "device_type", "memory")));
> - _FDT((fdt_property(fdt, "reg",
> - mem_reg_property, sizeof(mem_reg_property))));
> -
> + _FDT((fdt_property(fdt, "reg", mem_reg_property_rma,
> + sizeof(mem_reg_property_rma))));
> _FDT((fdt_end_node(fdt)));
>
> + if (ram_size > rma_size) {
> + char mem_name[32];
> +
> + sprintf(mem_name, "memory@%" PRIx64, (uint64_t)rma_size);
> + _FDT((fdt_begin_node(fdt, mem_name)));
> + _FDT((fdt_property_string(fdt, "device_type", "memory")));
> + _FDT((fdt_property(fdt, "reg", mem_reg_property_nonrma,
> + sizeof(mem_reg_property_nonrma))));
> + _FDT((fdt_end_node(fdt)));
> + }
> +
> /* cpus */
> _FDT((fdt_begin_node(fdt, "cpus")));
>
> @@ -341,6 +354,7 @@ static void ppc_spapr_init(ram_addr_t ram_size,
> {
> CPUState *env;
> int i;
> + target_phys_addr_t rma_alloc_size, rma_size;
> ram_addr_t ram_offset;
> uint32_t initrd_base;
> long kernel_size, initrd_size, fw_size;
> @@ -350,10 +364,23 @@ static void ppc_spapr_init(ram_addr_t ram_size,
> spapr = g_malloc(sizeof(*spapr));
> cpu_ppc_hypercall = emulate_spapr_hypercall;
>
> + /* Allocate RMA if necessary */
> + rma_alloc_size = kvmppc_alloc_rma("ppc_spapr.rma");
> +
> + if (rma_alloc_size == -1) {
> + hw_error("qemu: Unable to create RMA\n");
> + exit(1);
> + }
> + if (rma_alloc_size && (rma_alloc_size < ram_size)) {
> + rma_size = rma_alloc_size;
> + } else {
> + rma_size = ram_size;
> + }
> +
> /* We place the device tree just below either the top of RAM, or
> * 2GB, so that it can be processed with 32-bit code if
> * necessary */
> - spapr->fdt_addr = MIN(ram_size, 0x80000000) - FDT_MAX_SIZE;
> + spapr->fdt_addr = MIN(rma_size, 0x80000000) - FDT_MAX_SIZE;

The change looks sane, so I'd assume the description above is now wrong :)

> spapr->rtas_addr = spapr->fdt_addr - RTAS_MAX_SIZE;
>
> /* init CPUs */
> @@ -378,8 +405,13 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>
> /* allocate RAM */
> spapr->ram_limit = ram_size;
> - ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", spapr->ram_limit);
> - cpu_register_physical_memory(0, ram_size, ram_offset);
> + if (spapr->ram_limit > rma_alloc_size) {
> + ram_addr_t nonrma_base = rma_alloc_size;
> + ram_addr_t nonrma_size = spapr->ram_limit - rma_alloc_size;
> +
> + ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", nonrma_size);
> + cpu_register_physical_memory(nonrma_base, nonrma_size, ram_offset);
> + }
>
> /* allocate hash page table. For now we always make this 16mb,
> * later we should probably make it scale to the size of guest
> @@ -503,7 +535,7 @@ static void ppc_spapr_init(ram_addr_t ram_size,
> }
>
> /* Prepare the device tree */
> - spapr->fdt_skel = spapr_create_fdt_skel(cpu_model,
> + spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, rma_size,
> initrd_base, initrd_size,
> boot_device, kernel_cmdline,
> pteg_shift + 7);
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 2c1bc7a..37ee902 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -55,6 +55,9 @@ static int cap_interrupt_level = false;
> static int cap_segstate;
> static int cap_booke_sregs;
> static int cap_ppc_smt = 0;
> +#ifdef KVM_CAP_PPC_RMA

No need for these ifdefs anymore thanks to qemu local kvm headers :)

Alex
Patch
diff --git a/hw/spapr.c b/hw/spapr.c index ba9ae1c..d51425a 100644 --- a/hw/spapr.c +++ b/hw/spapr.c @@ -89,6 +89,7 @@ qemu_irq spapr_allocate_irq(uint32_t hint, uint32_t *irq_num) } static void *spapr_create_fdt_skel(const char *cpu_model, + target_phys_addr_t rma_size, target_phys_addr_t initrd_base, target_phys_addr_t initrd_size, const char *boot_device, @@ -97,7 +98,9 @@ static void *spapr_create_fdt_skel(const char *cpu_model, { void *fdt; CPUState *env; - uint64_t mem_reg_property[] = { 0, cpu_to_be64(ram_size) }; + uint64_t mem_reg_property_rma[] = { 0, cpu_to_be64(rma_size) }; + uint64_t mem_reg_property_nonrma[] = { cpu_to_be64(rma_size), + cpu_to_be64(ram_size - rma_size) }; uint32_t start_prop = cpu_to_be32(initrd_base); uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size); uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)}; @@ -143,15 +146,25 @@ static void *spapr_create_fdt_skel(const char *cpu_model, _FDT((fdt_end_node(fdt))); - /* memory node */ + /* memory node(s) */ _FDT((fdt_begin_node(fdt, "memory@0"))); _FDT((fdt_property_string(fdt, "device_type", "memory"))); - _FDT((fdt_property(fdt, "reg", - mem_reg_property, sizeof(mem_reg_property)))); - + _FDT((fdt_property(fdt, "reg", mem_reg_property_rma, + sizeof(mem_reg_property_rma)))); _FDT((fdt_end_node(fdt))); + if (ram_size > rma_size) { + char mem_name[32]; + + sprintf(mem_name, "memory@%" PRIx64, (uint64_t)rma_size); + _FDT((fdt_begin_node(fdt, mem_name))); + _FDT((fdt_property_string(fdt, "device_type", "memory"))); + _FDT((fdt_property(fdt, "reg", mem_reg_property_nonrma, + sizeof(mem_reg_property_nonrma)))); + _FDT((fdt_end_node(fdt))); + } + /* cpus */ _FDT((fdt_begin_node(fdt, "cpus"))); @@ -341,6 +354,7 @@ static void ppc_spapr_init(ram_addr_t ram_size, { CPUState *env; int i; + target_phys_addr_t rma_alloc_size, rma_size; ram_addr_t ram_offset; uint32_t initrd_base; long kernel_size, initrd_size, fw_size; @@ -350,10 +364,23 @@ static void ppc_spapr_init(ram_addr_t ram_size, 
spapr = g_malloc(sizeof(*spapr)); cpu_ppc_hypercall = emulate_spapr_hypercall; + /* Allocate RMA if necessary */ + rma_alloc_size = kvmppc_alloc_rma("ppc_spapr.rma"); + + if (rma_alloc_size == -1) { + hw_error("qemu: Unable to create RMA\n"); + exit(1); + } + if (rma_alloc_size && (rma_alloc_size < ram_size)) { + rma_size = rma_alloc_size; + } else { + rma_size = ram_size; + } + /* We place the device tree just below either the top of RAM, or * 2GB, so that it can be processed with 32-bit code if * necessary */ - spapr->fdt_addr = MIN(ram_size, 0x80000000) - FDT_MAX_SIZE; + spapr->fdt_addr = MIN(rma_size, 0x80000000) - FDT_MAX_SIZE; spapr->rtas_addr = spapr->fdt_addr - RTAS_MAX_SIZE; /* init CPUs */ @@ -378,8 +405,13 @@ static void ppc_spapr_init(ram_addr_t ram_size, /* allocate RAM */ spapr->ram_limit = ram_size; - ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", spapr->ram_limit); - cpu_register_physical_memory(0, ram_size, ram_offset); + if (spapr->ram_limit > rma_alloc_size) { + ram_addr_t nonrma_base = rma_alloc_size; + ram_addr_t nonrma_size = spapr->ram_limit - rma_alloc_size; + + ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", nonrma_size); + cpu_register_physical_memory(nonrma_base, nonrma_size, ram_offset); + } /* allocate hash page table. 
For now we always make this 16mb, * later we should probably make it scale to the size of guest @@ -503,7 +535,7 @@ static void ppc_spapr_init(ram_addr_t ram_size, } /* Prepare the device tree */ - spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, + spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, rma_size, initrd_base, initrd_size, boot_device, kernel_cmdline, pteg_shift + 7); diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index 2c1bc7a..37ee902 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -55,6 +55,9 @@ static int cap_interrupt_level = false; static int cap_segstate; static int cap_booke_sregs; static int cap_ppc_smt = 0; +#ifdef KVM_CAP_PPC_RMA +static int cap_ppc_rma = 0; +#endif /* XXX We have a race condition where we actually have a level triggered * interrupt, but the infrastructure can't expose that yet, so the guest @@ -81,6 +84,9 @@ int kvm_arch_init(KVMState *s) #ifdef KVM_CAP_PPC_SMT cap_ppc_smt = kvm_check_extension(s, KVM_CAP_PPC_SMT); #endif +#ifdef KVM_CAP_PPC_RMA + cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA); +#endif if (!cap_interrupt_level) { fprintf(stderr, "KVM: Couldn't find level irq capability. Expect the " @@ -741,6 +747,51 @@ int kvmppc_smt_threads(void) return cap_ppc_smt ? cap_ppc_smt : 1; } +off_t kvmppc_alloc_rma(const char *name) +{ +#ifndef KVM_CAP_PPC_RMA + return 0; +#else + void *rma; + ram_addr_t rma_offset; + off_t size; + int fd; + struct kvm_allocate_rma ret; + + /* If cap_ppc_rma == 0, contiguous RMA allocation is not supported + * if cap_ppc_rma == 1, contiguous RMA allocation is supported, but + * not necessary on this hardware + * if cap_ppc_rma == 2, contiguous RMA allocation is needed on this hardware + * + * FIXME: We should allow the user to force contiguous RMA + * allocation in the cap_ppc_rma==1 case. 
+ */ + if (cap_ppc_rma < 2) { + return 0; + } + + fd = kvm_vm_ioctl(kvm_state, KVM_ALLOCATE_RMA, &ret); + if (fd < 0) { + fprintf(stderr, "KVM: Error on KVM_ALLOCATE_RMA: %s\n", + strerror(errno)); + return -1; + } + + size = MIN(ret.rma_size, 256ul << 20); + + rma = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (rma == MAP_FAILED) { + fprintf(stderr, "KVM: Error mapping RMA: %s\n", strerror(errno)); + return -1; + }; + + rma_offset = qemu_ram_alloc_from_ptr(NULL, name, size, rma); + cpu_register_physical_memory(0, size, rma_offset); + + return size; +#endif +} + bool kvm_arch_stop_on_emulation_error(CPUState *env) { return true; diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h index c298411..ad9903c 100644 --- a/target-ppc/kvm_ppc.h +++ b/target-ppc/kvm_ppc.h @@ -19,6 +19,7 @@ int kvmppc_get_hypercall(CPUState *env, uint8_t *buf, int buf_len); int kvmppc_set_interrupt(CPUState *env, int irq, int level); void kvmppc_set_papr(CPUState *env); int kvmppc_smt_threads(void); +off_t kvmppc_alloc_rma(const char *name); #else @@ -51,6 +52,11 @@ static inline int kvmppc_smt_threads(void) return 1; } +static inline off_t kvmppc_alloc_rma(const char *name) +{ + return 0; +} + #endif #ifndef CONFIG_KVM