Patchwork [24/58] PPC: E500: Add PV spinning code

login
register
mail settings
Submitter Alexander Graf
Date Sept. 14, 2011, 8:42 a.m.
Message ID <1315989802-18753-25-git-send-email-agraf@suse.de>
Download mbox | patch
Permalink /patch/114607/
State New
Headers show

Comments

Alexander Graf - Sept. 14, 2011, 8:42 a.m.
CPUs that are not the boot CPU need to run in spinning code to check whether they
should start executing and, if so, where to jump to. This usually happens
by leaving secondary CPUs looping and checking if some variable in memory
changed.

In an environment like QEMU, however, we can be more clever. We can just export
the spin table the primary CPU modifies as an MMIO region that wakes up the
respective secondary CPUs in an event-based fashion. That saves us quite some
cycles while the secondary CPUs are not up yet.

So this patch adds a PV device that simply exports the spinning table into the
guest and thus allows the primary CPU to wake up secondary ones.

Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - change into MMIO scheme
  - map the secondary NIP instead of 0 1:1
  - only map 64MB for TLB, same as u-boot
  - prepare code for 64-bit spinnings

v2 -> v3:

  - remove r6
  - set MAS2_M
  - map EA 0
  - use second TLB1 entry

v3 -> v4:

  - change to memoryops

v4 -> v5:

  - fix endianness bugs
---
 Makefile.target        |    2 +-
 hw/ppce500_mpc8544ds.c |   33 ++++++++-
 hw/ppce500_spin.c      |  186 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 216 insertions(+), 5 deletions(-)
 create mode 100644 hw/ppce500_spin.c
Blue Swirl - Sept. 17, 2011, 4:58 p.m.
On Wed, Sep 14, 2011 at 8:42 AM, Alexander Graf <agraf@suse.de> wrote:
> CPUs that are not the boot CPU need to run in spinning code to check if they
> should run off to execute and if so where to jump to. This usually happens
> by leaving secondary CPUs looping and checking if some variable in memory
> changed.
>
> In an environment like Qemu however we can be more clever. We can just export
> the spin table the primary CPU modifies as MMIO region that would event based
> wake up the respective secondary CPUs. That saves us quite some cycles while
> the secondary CPUs are not up yet.
>
> So this patch adds a PV device that simply exports the spinning table into the
> guest and thus allows the primary CPU to wake up secondary ones.

On Sparc32, there is no need for a PV device. The CPU is woken up from
halted state with an IPI. Maybe you could use this approach?

> Signed-off-by: Alexander Graf <agraf@suse.de>
>
> ---
>
> v1 -> v2:
>
>  - change into MMIO scheme
>  - map the secondary NIP instead of 0 1:1
>  - only map 64MB for TLB, same as u-boot
>  - prepare code for 64-bit spinnings
>
> v2 -> v3:
>
>  - remove r6
>  - set MAS2_M
>  - map EA 0
>  - use second TLB1 entry
>
> v3 -> v4:
>
>  - change to memoryops
>
> v4 -> v5:
>
>  - fix endianness bugs
> ---
>  Makefile.target        |    2 +-
>  hw/ppce500_mpc8544ds.c |   33 ++++++++-
>  hw/ppce500_spin.c      |  186 ++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 216 insertions(+), 5 deletions(-)
>  create mode 100644 hw/ppce500_spin.c
>
> diff --git a/Makefile.target b/Makefile.target
> index 2ed9099..3f689ce 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -247,7 +247,7 @@ endif
>  obj-ppc-y += ppc4xx_devs.o ppc4xx_pci.o ppc405_uc.o ppc405_boards.o
>  obj-ppc-y += ppc440.o ppc440_bamboo.o
>  # PowerPC E500 boards
> -obj-ppc-y += ppce500_mpc8544ds.o mpc8544_guts.o
> +obj-ppc-y += ppce500_mpc8544ds.o mpc8544_guts.o ppce500_spin.o
>  # PowerPC 440 Xilinx ML507 reference board.
>  obj-ppc-y += virtex_ml507.o
>  obj-ppc-$(CONFIG_KVM) += kvm_ppc.o
> diff --git a/hw/ppce500_mpc8544ds.c b/hw/ppce500_mpc8544ds.c
> index 9379624..3b8b449 100644
> --- a/hw/ppce500_mpc8544ds.c
> +++ b/hw/ppce500_mpc8544ds.c
> @@ -49,6 +49,7 @@
>  #define MPC8544_PCI_IO             0xE1000000
>  #define MPC8544_PCI_IOLEN          0x10000
>  #define MPC8544_UTIL_BASE          (MPC8544_CCSRBAR_BASE + 0xe0000)
> +#define MPC8544_SPIN_BASE          0xEF000000
>
>  struct boot_info
>  {
> @@ -164,6 +165,18 @@ static void mmubooke_create_initial_mapping(CPUState *env,
>     tlb->mas7_3 |= MAS3_UR | MAS3_UW | MAS3_UX | MAS3_SR | MAS3_SW | MAS3_SX;
>  }
>
> +static void mpc8544ds_cpu_reset_sec(void *opaque)
> +{
> +    CPUState *env = opaque;
> +
> +    cpu_reset(env);
> +
> +    /* Secondary CPU starts in halted state for now. Needs to change when
> +       implementing non-kernel boot. */
> +    env->halted = 1;
> +    env->exception_index = EXCP_HLT;
> +}
> +
>  static void mpc8544ds_cpu_reset(void *opaque)
>  {
>     CPUState *env = opaque;
> @@ -172,6 +185,7 @@ static void mpc8544ds_cpu_reset(void *opaque)
>     cpu_reset(env);
>
>     /* Set initial guest state. */
> +    env->halted = 0;
>     env->gpr[1] = (16<<20) - 8;
>     env->gpr[3] = bi->dt_base;
>     env->nip = bi->entry;
> @@ -199,7 +213,6 @@ static void mpc8544ds_init(ram_addr_t ram_size,
>     unsigned int pci_irq_nrs[4] = {1, 2, 3, 4};
>     qemu_irq **irqs, *mpic;
>     DeviceState *dev;
> -    struct boot_info *boot_info;
>     CPUState *firstenv = NULL;
>
>     /* Setup CPUs */
> @@ -234,9 +247,16 @@ static void mpc8544ds_init(ram_addr_t ram_size,
>         env->spr[SPR_40x_TCR] = 1 << 26;
>
>         /* Register reset handler */
> -        boot_info = g_malloc0(sizeof(struct boot_info));
> -        qemu_register_reset(mpc8544ds_cpu_reset, env);
> -        env->load_info = boot_info;
> +        if (!i) {
> +            /* Primary CPU */
> +            struct boot_info *boot_info;
> +            boot_info = g_malloc0(sizeof(struct boot_info));
> +            qemu_register_reset(mpc8544ds_cpu_reset, env);
> +            env->load_info = boot_info;
> +        } else {
> +            /* Secondary CPUs */
> +            qemu_register_reset(mpc8544ds_cpu_reset_sec, env);
> +        }
>     }
>
>     env = firstenv;
> @@ -289,6 +309,9 @@ static void mpc8544ds_init(ram_addr_t ram_size,
>         }
>     }
>
> +    /* Register spinning region */
> +    sysbus_create_simple("e500-spin", MPC8544_SPIN_BASE, NULL);
> +
>     /* Load kernel. */
>     if (kernel_filename) {
>         kernel_size = load_uimage(kernel_filename, &entry, &loadaddr, NULL);
> @@ -321,6 +344,8 @@ static void mpc8544ds_init(ram_addr_t ram_size,
>
>     /* If we're loading a kernel directly, we must load the device tree too. */
>     if (kernel_filename) {
> +        struct boot_info *boot_info;
> +
>  #ifndef CONFIG_FDT
>         cpu_abort(env, "Compiled without FDT support - can't load kernel\n");
>  #endif
> diff --git a/hw/ppce500_spin.c b/hw/ppce500_spin.c
> new file mode 100644
> index 0000000..38451ac
> --- /dev/null
> +++ b/hw/ppce500_spin.c
> @@ -0,0 +1,186 @@
> +#include "hw.h"
> +#include "sysemu.h"
> +#include "sysbus.h"
> +#include "kvm.h"
> +
> +#define MAX_CPUS 32
> +
> +typedef struct spin_info {
> +    uint64_t addr;
> +    uint64_t r3;
> +    uint32_t resv;
> +    uint32_t pir;
> +    uint64_t reserved;
> +} __attribute__ ((packed)) SpinInfo;

This attribute isn't needed, the fields are aligned and also the
structure is internal to QEMU so misalignment wouldn't matter.

In the future, please use QEMU_PACKED.

> +
> +typedef struct spin_state {
> +    SysBusDevice busdev;
> +    MemoryRegion iomem;
> +    SpinInfo spin[MAX_CPUS];
> +} SpinState;
> +
> +typedef struct spin_kick {
> +    CPUState *env;
> +    SpinInfo *spin;
> +} SpinKick;
> +
> +static void spin_reset(void *opaque)
> +{
> +    SpinState *s = opaque;
> +    int i;
> +
> +    for (i = 0; i < MAX_CPUS; i++) {
> +        SpinInfo *info = &s->spin[i];
> +
> +        info->pir = i;
> +        info->r3 = i;
> +        info->addr = 1;
> +    }
> +}
> +
> +/* Create -kernel TLB entries for BookE, linearly spanning 256MB.  */
> +static inline target_phys_addr_t booke206_page_size_to_tlb(uint64_t size)
> +{
> +    return (ffs(size >> 10) - 1) >> 1;
> +}
> +
> +static void mmubooke_create_initial_mapping(CPUState *env,
> +                                     target_ulong va,
> +                                     target_phys_addr_t pa,
> +                                     target_phys_addr_t len)
> +{
> +    ppcmas_tlb_t *tlb = booke206_get_tlbm(env, 1, 0, 1);
> +    target_phys_addr_t size;
> +
> +    size = (booke206_page_size_to_tlb(len) << MAS1_TSIZE_SHIFT);
> +    tlb->mas1 = MAS1_VALID | size;
> +    tlb->mas2 = (va & TARGET_PAGE_MASK) | MAS2_M;
> +    tlb->mas7_3 = pa & TARGET_PAGE_MASK;
> +    tlb->mas7_3 |= MAS3_UR | MAS3_UW | MAS3_UX | MAS3_SR | MAS3_SW | MAS3_SX;
> +}
> +
> +static void spin_kick(void *data)
> +{
> +    SpinKick *kick = data;
> +    CPUState *env = kick->env;
> +    SpinInfo *curspin = kick->spin;
> +    target_phys_addr_t map_size = 64 * 1024 * 1024;
> +    target_phys_addr_t map_start;
> +
> +    cpu_synchronize_state(env);
> +    stl_p(&curspin->pir, env->spr[SPR_PIR]);
> +    env->nip = ldq_p(&curspin->addr) & (map_size - 1);

ldq_be_p() for non-PV emulation.

> +    env->gpr[3] = ldq_p(&curspin->r3);
> +    env->gpr[4] = 0;
> +    env->gpr[5] = 0;
> +    env->gpr[6] = 0;
> +    env->gpr[7] = map_size;
> +    env->gpr[8] = 0;
> +    env->gpr[9] = 0;
> +
> +    map_start = ldq_p(&curspin->addr) & ~(map_size - 1);
> +    mmubooke_create_initial_mapping(env, 0, map_start, map_size);
> +
> +    env->halted = 0;
> +    env->exception_index = -1;
> +    qemu_cpu_kick(env);
> +}
> +
> +static void spin_write(void *opaque, target_phys_addr_t addr, uint64_t value,
> +                       unsigned len)
> +{
> +    SpinState *s = opaque;
> +    int env_idx = addr / sizeof(SpinInfo);
> +    CPUState *env;
> +    SpinInfo *curspin = &s->spin[env_idx];
> +    uint8_t *curspin_p = (uint8_t*)curspin;
> +
> +    for (env = first_cpu; env != NULL; env = env->next_cpu) {
> +        if (env->cpu_index == env_idx) {
> +            break;
> +        }
> +    }
> +
> +    if (!env) {
> +        /* Unknown CPU */
> +        return;
> +    }
> +
> +    if (!env->cpu_index) {
> +        /* primary CPU doesn't spin */
> +        return;
> +    }
> +
> +    curspin_p = &curspin_p[addr % sizeof(SpinInfo)];
> +    switch (len) {
> +    case 1:
> +        stb_p(curspin_p, value);
> +        break;
> +    case 2:
> +        stw_p(curspin_p, value);
> +        break;
> +    case 4:
> +        stl_p(curspin_p, value);
> +        break;
> +    }
> +
> +    if (!(ldq_p(&curspin->addr) & 1)) {
> +        /* run CPU */
> +        SpinKick kick = {
> +            .env = env,
> +            .spin = curspin,
> +        };
> +
> +        run_on_cpu(env, spin_kick, &kick);
> +    }
> +}
> +
> +static uint64_t spin_read(void *opaque, target_phys_addr_t addr, unsigned len)
> +{
> +    SpinState *s = opaque;
> +    uint8_t *spin_p = &((uint8_t*)s->spin)[addr];
> +
> +    switch (len) {
> +    case 1:
> +        return ldub_p(spin_p);
> +    case 2:
> +        return lduw_p(spin_p);
> +    case 4:
> +        return ldl_p(spin_p);
> +    default:
> +        assert(0);

abort()

> +    }
> +}
> +
> +const MemoryRegionOps spin_rw_ops = {
> +    .read = spin_read,
> +    .write = spin_write,
> +    .endianness = DEVICE_BIG_ENDIAN,
> +};
> +
> +static int ppce500_spin_initfn(SysBusDevice *dev)
> +{
> +    SpinState *s;
> +
> +    s = FROM_SYSBUS(SpinState, sysbus_from_qdev(dev));
> +
> +    memory_region_init_io(&s->iomem, &spin_rw_ops, s, "e500 spin pv device",
> +                          sizeof(SpinInfo) * MAX_CPUS);
> +    sysbus_init_mmio_region(dev, &s->iomem);
> +
> +    qemu_register_reset(spin_reset, s);
> +
> +    return 0;
> +}
> +
> +static SysBusDeviceInfo ppce500_spin_info = {
> +    .init         = ppce500_spin_initfn,
> +    .qdev.name    = "e500-spin",
> +    .qdev.size    = sizeof(SpinState),
> +};
> +
> +static void ppce500_spin_register(void)
> +{
> +    sysbus_register_withprop(&ppce500_spin_info);
> +}
> +device_init(ppce500_spin_register);
> --
> 1.6.0.2
>
>
Alexander Graf - Sept. 17, 2011, 5:15 p.m.
Am 17.09.2011 um 18:58 schrieb Blue Swirl <blauwirbel@gmail.com>:

> On Wed, Sep 14, 2011 at 8:42 AM, Alexander Graf <agraf@suse.de> wrote:
>> CPUs that are not the boot CPU need to run in spinning code to check if they
>> should run off to execute and if so where to jump to. This usually happens
>> by leaving secondary CPUs looping and checking if some variable in memory
>> changed.
>> 
>> In an environment like Qemu however we can be more clever. We can just export
>> the spin table the primary CPU modifies as MMIO region that would event based
>> wake up the respective secondary CPUs. That saves us quite some cycles while
>> the secondary CPUs are not up yet.
>> 
>> So this patch adds a PV device that simply exports the spinning table into the
>> guest and thus allows the primary CPU to wake up secondary ones.
> 
> On Sparc32, there is no need for a PV device. The CPU is woken up from
> halted state with an IPI. Maybe you could use this approach?

The way it's done here is defined by u-boot and now also nailed down in the ePAPR architecture spec. While alternatives might be more appealing, this is how guests work today :).

Alex

>
Blue Swirl - Sept. 17, 2011, 5:40 p.m.
On Sat, Sep 17, 2011 at 5:15 PM, Alexander Graf <agraf@suse.de> wrote:
>
> Am 17.09.2011 um 18:58 schrieb Blue Swirl <blauwirbel@gmail.com>:
>
>> On Wed, Sep 14, 2011 at 8:42 AM, Alexander Graf <agraf@suse.de> wrote:
>>> CPUs that are not the boot CPU need to run in spinning code to check if they
>>> should run off to execute and if so where to jump to. This usually happens
>>> by leaving secondary CPUs looping and checking if some variable in memory
>>> changed.
>>>
>>> In an environment like Qemu however we can be more clever. We can just export
>>> the spin table the primary CPU modifies as MMIO region that would event based
>>> wake up the respective secondary CPUs. That saves us quite some cycles while
>>> the secondary CPUs are not up yet.
>>>
>>> So this patch adds a PV device that simply exports the spinning table into the
>>> guest and thus allows the primary CPU to wake up secondary ones.
>>
>> On Sparc32, there is no need for a PV device. The CPU is woken up from
>> halted state with an IPI. Maybe you could use this approach?
>
> The way it's done here is defined by u-boot and now also nailed down in the ePAPR architecture spec. While alternatives might be more appealing, this is how guests work today :).

OK. I hoped that there were no implementations yet. The header (btw
missing) should point to the spec.
Alexander Graf - Sept. 19, 2011, 11:35 a.m.
On 17.09.2011, at 19:40, Blue Swirl wrote:

> On Sat, Sep 17, 2011 at 5:15 PM, Alexander Graf <agraf@suse.de> wrote:
>> 
>> Am 17.09.2011 um 18:58 schrieb Blue Swirl <blauwirbel@gmail.com>:
>> 
>>> On Wed, Sep 14, 2011 at 8:42 AM, Alexander Graf <agraf@suse.de> wrote:
>>>> CPUs that are not the boot CPU need to run in spinning code to check if they
>>>> should run off to execute and if so where to jump to. This usually happens
>>>> by leaving secondary CPUs looping and checking if some variable in memory
>>>> changed.
>>>> 
>>>> In an environment like Qemu however we can be more clever. We can just export
>>>> the spin table the primary CPU modifies as MMIO region that would event based
>>>> wake up the respective secondary CPUs. That saves us quite some cycles while
>>>> the secondary CPUs are not up yet.
>>>> 
>>>> So this patch adds a PV device that simply exports the spinning table into the
>>>> guest and thus allows the primary CPU to wake up secondary ones.
>>> 
>>> On Sparc32, there is no need for a PV device. The CPU is woken up from
>>> halted state with an IPI. Maybe you could use this approach?
>> 
>> The way it's done here is defined by u-boot and now also nailed down in the ePAPR architecture spec. While alternatives might be more appealing, this is how guests work today :).
> 
> OK. I hoped that there were no implementations yet. The header (btw
> missing) should point to the spec.

IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.

Stuart / Scott, do you have any pointers to documentation where the spinning is explained?


Alex
Scott Wood - Sept. 19, 2011, 4:12 p.m.
On 09/19/2011 06:35 AM, Alexander Graf wrote:
> 
> On 17.09.2011, at 19:40, Blue Swirl wrote:
> 
>> On Sat, Sep 17, 2011 at 5:15 PM, Alexander Graf <agraf@suse.de> wrote:
>>>
>>> Am 17.09.2011 um 18:58 schrieb Blue Swirl <blauwirbel@gmail.com>:
>>>
>>>> On Sparc32, there is no need for a PV device. The CPU is woken up from
>>>> halted state with an IPI. Maybe you could use this approach?
>>>
>>> The way it's done here is defined by u-boot and now also nailed down in the ePAPR architecture spec. While alternatives might be more appealing, this is how guests work today :).
>>
>> OK. I hoped that there were no implementations yet. The header (btw
>> missing) should point to the spec.

The goal with the spin table stuff, suboptimal as it is, was something
that would work on any powerpc implementation.  Other
implementation-specific release mechanisms are allowed, and are
indicated by a property in the cpu node, but only if the loader knows
that the OS supports it.

> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.

It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
just released which clarifies some things such as WIMG.

> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?

https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf

-Scott
Blue Swirl - Sept. 24, 2011, 7:41 a.m.
On Mon, Sep 19, 2011 at 4:12 PM, Scott Wood <scottwood@freescale.com> wrote:
> On 09/19/2011 06:35 AM, Alexander Graf wrote:
>>
>> On 17.09.2011, at 19:40, Blue Swirl wrote:
>>
>>> On Sat, Sep 17, 2011 at 5:15 PM, Alexander Graf <agraf@suse.de> wrote:
>>>>
>>>> Am 17.09.2011 um 18:58 schrieb Blue Swirl <blauwirbel@gmail.com>:
>>>>
>>>>> On Sparc32, there is no need for a PV device. The CPU is woken up from
>>>>> halted state with an IPI. Maybe you could use this approach?
>>>>
>>>> The way it's done here is defined by u-boot and now also nailed down in the ePAPR architecture spec. While alternatives might be more appealing, this is how guests work today :).
>>>
>>> OK. I hoped that there were no implementations yet. The header (btw
>>> missing) should point to the spec.
>
> The goal with the spin table stuff, suboptimal as it is, was something
> that would work on any powerpc implementation.  Other
> implementation-specific release mechanisms are allowed, and are
> indicated by a property in the cpu node, but only if the loader knows
> that the OS supports it.
>
>> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.
>
> It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
> just released which clarifies some things such as WIMG.
>
>> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?
>
> https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf

Chapter 5.5.2 describes the table. This is actually an interface
between OS and Open Firmware, obviously there can't be a real hardware
device that magically loads r3 etc.

The device method would break abstraction layers, it's much like
vmport stuff in x86. Using a hypercall would be a small improvement.
Instead it should be possible to implement a small boot ROM which puts
the secondary CPUs into managed halt state without spinning, then the
boot CPU could send an IPI to a halted CPU to wake them up based on
the spin table, just like real HW would do. On Sparc32 OpenBIOS this
is something like a few lines of ASM on both sides.
Alexander Graf - Sept. 24, 2011, 8:03 a.m.
On 24.09.2011, at 09:41, Blue Swirl wrote:

> On Mon, Sep 19, 2011 at 4:12 PM, Scott Wood <scottwood@freescale.com> wrote:
>> On 09/19/2011 06:35 AM, Alexander Graf wrote:
>>> 
>>> On 17.09.2011, at 19:40, Blue Swirl wrote:
>>> 
>>>> On Sat, Sep 17, 2011 at 5:15 PM, Alexander Graf <agraf@suse.de> wrote:
>>>>> 
>>>>> Am 17.09.2011 um 18:58 schrieb Blue Swirl <blauwirbel@gmail.com>:
>>>>> 
>>>>>> On Sparc32, there is no need for a PV device. The CPU is woken up from
>>>>>> halted state with an IPI. Maybe you could use this approach?
>>>>> 
>>>>> The way it's done here is defined by u-boot and now also nailed down in the ePAPR architecture spec. While alternatives might be more appealing, this is how guests work today :).
>>>> 
>>>> OK. I hoped that there were no implementations yet. The header (btw
>>>> missing) should point to the spec.
>> 
>> The goal with the spin table stuff, suboptimal as it is, was something
>> that would work on any powerpc implementation.  Other
>> implementation-specific release mechanisms are allowed, and are
>> indicated by a property in the cpu node, but only if the loader knows
>> that the OS supports it.
>> 
>>> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.
>> 
>> It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
>> just released which clarifies some things such as WIMG.
>> 
>>> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?
>> 
>> https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf
> 
> Chapter 5.5.2 describes the table. This is actually an interface
> between OS and Open Firmware, obviously there can't be a real hardware
> device that magically loads r3 etc.
> 
> The device method would break abstraction layers, it's much like
> vmport stuff in x86. Using a hypercall would be a small improvement.
> Instead it should be possible to implement a small boot ROM which puts
> the secondary CPUs into managed halt state without spinning, then the
> boot CPU could send an IPI to a halted CPU to wake them up based on
> the spin table, just like real HW would do. On Sparc32 OpenBIOS this
> is something like a few lines of ASM on both sides.

That sounds pretty close to what I had implemented in v1. Back then the only comment was to do it using this method from Scott. Maybe one day we will get u-boot support. Then u-boot will spin on the CPU itself and when that time comes, we can check if we can implement a prettier version.

Btw, we can't do the IPI method without exposing something to the guest that u-boot would usually not expose. There simply is no event. All that happens is a write to memory to tell the other CPU that it should wake up. So while sending an IPI to the other CPU is the "clean" way to go, I agree, we can either be compatible or "clean". And if I get the choice I'm rather compatible.

So we have the choice between having code inside the guest that spins, maybe even only checks every x ms, by programming a timer, or we can try to make an event out of the memory write. V1 was the former, v2 (this one) is the latter. This version performs a lot better and is easier to understand.


Alex
Blue Swirl - Sept. 24, 2011, 8:44 a.m.
On Sat, Sep 24, 2011 at 8:03 AM, Alexander Graf <agraf@suse.de> wrote:
>
> On 24.09.2011, at 09:41, Blue Swirl wrote:
>
>> On Mon, Sep 19, 2011 at 4:12 PM, Scott Wood <scottwood@freescale.com> wrote:
>>> On 09/19/2011 06:35 AM, Alexander Graf wrote:
>>>>
>>>> On 17.09.2011, at 19:40, Blue Swirl wrote:
>>>>
>>>>> On Sat, Sep 17, 2011 at 5:15 PM, Alexander Graf <agraf@suse.de> wrote:
>>>>>>
>>>>>> Am 17.09.2011 um 18:58 schrieb Blue Swirl <blauwirbel@gmail.com>:
>>>>>>
>>>>>>> On Sparc32, there is no need for a PV device. The CPU is woken up from
>>>>>>> halted state with an IPI. Maybe you could use this approach?
>>>>>>
>>>>>> The way it's done here is defined by u-boot and now also nailed down in the ePAPR architecture spec. While alternatives might be more appealing, this is how guests work today :).
>>>>>
>>>>> OK. I hoped that there were no implementations yet. The header (btw
>>>>> missing) should point to the spec.
>>>
>>> The goal with the spin table stuff, suboptimal as it is, was something
>>> that would work on any powerpc implementation.  Other
>>> implementation-specific release mechanisms are allowed, and are
>>> indicated by a property in the cpu node, but only if the loader knows
>>> that the OS supports it.
>>>
>>>> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.
>>>
>>> It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
>>> just released which clarifies some things such as WIMG.
>>>
>>>> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?
>>>
>>> https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf
>>
>> Chapter 5.5.2 describes the table. This is actually an interface
>> between OS and Open Firmware, obviously there can't be a real hardware
>> device that magically loads r3 etc.
>>
>> The device method would break abstraction layers, it's much like
>> vmport stuff in x86. Using a hypercall would be a small improvement.
>> Instead it should be possible to implement a small boot ROM which puts
>> the secondary CPUs into managed halt state without spinning, then the
>> boot CPU could send an IPI to a halted CPU to wake them up based on
>> the spin table, just like real HW would do. On Sparc32 OpenBIOS this
>> is something like a few lines of ASM on both sides.
>
> That sounds pretty close to what I had implemented in v1. Back then the only comment was to do it using this method from Scott. Maybe one day we will get u-boot support. Then u-boot will spin on the CPU itself and when that time comes, we can check if we can implement a prettier version.
>
> Btw, we can't do the IPI method without exposing something to the guest that u-boot would usually not expose. There simply is no event. All that happens is a write to memory to tell the other CPU that it should wake up. So while sending an IPI to the other CPU is the "clean" way to go, I agree, we can either be compatible or "clean". And if I get the choice I'm rather compatible.

There are also warts in Sparc32 design, for example there is no
instruction to halt the CPU, instead a device (only available on some
models) can do it.

> So we have the choice between having code inside the guest that spins, maybe even only checks every x ms, by programming a timer, or we can try to make an event out of the memory write. V1 was the former, v2 (this one) is the latter. This version performs a lot better and is easier to understand.

The abstraction layers should not be broken lightly, I suppose some
performance or laziness^Wlocal optimization reasons were behind vmport
design too. The ideal way to solve this could be to detect a spinning
CPU and optimize that for all architectures, that could be tricky
though (if a CPU remains in the same TB for extended periods, inspect
the TB: if it performs a loop with a single load instruction, replace
the load by a special wait operation for any memory stores to that
page).
Alexander Graf - Sept. 24, 2011, 10 a.m.
On 24.09.2011, at 10:44, Blue Swirl wrote:

> On Sat, Sep 24, 2011 at 8:03 AM, Alexander Graf <agraf@suse.de> wrote:
>> 
>> On 24.09.2011, at 09:41, Blue Swirl wrote:
>> 
>>> On Mon, Sep 19, 2011 at 4:12 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>> On 09/19/2011 06:35 AM, Alexander Graf wrote:
>>>>> 
>>>>> On 17.09.2011, at 19:40, Blue Swirl wrote:
>>>>> 
>>>>>> On Sat, Sep 17, 2011 at 5:15 PM, Alexander Graf <agraf@suse.de> wrote:
>>>>>>> 
>>>>>>> Am 17.09.2011 um 18:58 schrieb Blue Swirl <blauwirbel@gmail.com>:
>>>>>>> 
>>>>>>>> On Sparc32, there is no need for a PV device. The CPU is woken up from
>>>>>>>> halted state with an IPI. Maybe you could use this approach?
>>>>>>> 
>>>>>>> The way it's done here is defined by u-boot and now also nailed down in the ePAPR architecture spec. While alternatives might be more appealing, this is how guests work today :).
>>>>>> 
>>>>>> OK. I hoped that there were no implementations yet. The header (btw
>>>>>> missing) should point to the spec.
>>>> 
>>>> The goal with the spin table stuff, suboptimal as it is, was something
>>>> that would work on any powerpc implementation.  Other
>>>> implementation-specific release mechanisms are allowed, and are
>>>> indicated by a property in the cpu node, but only if the loader knows
>>>> that the OS supports it.
>>>> 
>>>>> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.
>>>> 
>>>> It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
>>>> just released which clarifies some things such as WIMG.
>>>> 
>>>>> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?
>>>> 
>>>> https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf
>>> 
>>> Chapter 5.5.2 describes the table. This is actually an interface
>>> between OS and Open Firmware, obviously there can't be a real hardware
>>> device that magically loads r3 etc.
>>> 
>>> The device method would break abstraction layers, it's much like
>>> vmport stuff in x86. Using a hypercall would be a small improvement.
>>> Instead it should be possible to implement a small boot ROM which puts
>>> the secondary CPUs into managed halt state without spinning, then the
>>> boot CPU could send an IPI to a halted CPU to wake them up based on
>>> the spin table, just like real HW would do. On Sparc32 OpenBIOS this
>>> is something like a few lines of ASM on both sides.
>> 
>> That sounds pretty close to what I had implemented in v1. Back then the only comment was to do it using this method from Scott. Maybe one day we will get u-boot support. Then u-boot will spin on the CPU itself and when that time comes, we can check if we can implement a prettier version.
>> 
>> Btw, we can't do the IPI method without exposing something to the guest that u-boot would usually not expose. There simply is no event. All that happens is a write to memory to tell the other CPU that it should wake up. So while sending an IPI to the other CPU is the "clean" way to go, I agree, we can either be compatible or "clean". And if I get the choice I'm rather compatible.
> 
> There are also warts in Sparc32 design, for example there is no
> instruction to halt the CPU, instead a device (only available on some
> models) can do it.

Ugh, nice :)

> 
>> So we have the choice between having code inside the guest that spins, maybe even only checks every x ms, by programming a timer, or we can try to make an event out of the memory write. V1 was the former, v2 (this one) is the latter. This version performs a lot better and is easier to understand.
> 
> The abstraction layers should not be broken lightly, I suppose some
> performance or laziness^Wlocal optimization reasons were behind vmport
> design too. The ideal way to solve this could be to detect a spinning
> CPU and optimize that for all architectures, that could be tricky
> though (if a CPU remains in the same TB for extended periods, inspect
> the TB: if it performs a loop with a single load instruction, replace
> the load by a special wait operation for any memory stores to that
> page).

I agree.

However, for now I'd like to have _something_ that we can easily replace later on. We don't do savevm or migration yet, so the danger of changing the device model from one version to the next is minimal. To the guest kernel, this is seamless, as the interface stays exactly the same.

In fact, the whole kernel loading way we go today is pretty much wrong. We should rather do it similar to OpenBIOS where firmware always loads and then pulls the kernel from QEMU using a PV interface. At that point, we would have to implement such an optimization as you suggest. Or implement a hypercall :). But at least we'd always be running the same guest software stack.

So what I'm suggesting is that for now, we're making progress and then scrap the device we're introducing here later on, when we move towards different models on how to initialize the machine. As it stands however, I much rather have working code here and concentrate on the 50 other places that are broken than optimize a case that already works well enough because it could be done prettier. Let's rather iterate over this interface again when we hit another road block. At that point in time, we'll have more experience with the shortcomings too.


Alex
Blue Swirl - Sept. 24, 2011, 10:18 a.m.
On Sat, Sep 24, 2011 at 10:00 AM, Alexander Graf <agraf@suse.de> wrote:
>
> On 24.09.2011, at 10:44, Blue Swirl wrote:
>
>> On Sat, Sep 24, 2011 at 8:03 AM, Alexander Graf <agraf@suse.de> wrote:
>>>
>>> On 24.09.2011, at 09:41, Blue Swirl wrote:
>>>
>>>> On Mon, Sep 19, 2011 at 4:12 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>>> On 09/19/2011 06:35 AM, Alexander Graf wrote:
>>>>>>
>>>>>> On 17.09.2011, at 19:40, Blue Swirl wrote:
>>>>>>
>>>>>>> On Sat, Sep 17, 2011 at 5:15 PM, Alexander Graf <agraf@suse.de> wrote:
>>>>>>>>
>>>>>>>> Am 17.09.2011 um 18:58 schrieb Blue Swirl <blauwirbel@gmail.com>:
>>>>>>>>
>>>>>>>>> On Sparc32, there is no need for a PV device. The CPU is woken up from
>>>>>>>>> halted state with an IPI. Maybe you could use this approach?
>>>>>>>>
>>>>>>>> The way it's done here is defined by u-boot and now also nailed down in the ePAPR architecture spec. While alternatives might be more appealing, this is how guests work today :).
>>>>>>>
>>>>>>> OK. I hoped that there were no implementations yet. The header (btw
>>>>>>> missing) should point to the spec.
>>>>>
>>>>> The goal with the spin table stuff, suboptimal as it is, was something
>>>>> that would work on any powerpc implementation.  Other
>>>>> implementation-specific release mechanisms are allowed, and are
>>>>> indicated by a property in the cpu node, but only if the loader knows
>>>>> that the OS supports it.
>>>>>
>>>>>> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.
>>>>>
>>>>> It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
>>>>> just released which clarifies some things such as WIMG.
>>>>>
>>>>>> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?
>>>>>
>>>>> https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf
>>>>
>>>> Chapter 5.5.2 describes the table. This is actually an interface
>>>> between OS and Open Firmware, obviously there can't be a real hardware
>>>> device that magically loads r3 etc.
>>>>
>>>> The device method would break abstraction layers, it's much like
>>>> vmport stuff in x86. Using a hypercall would be a small improvement.
>>>> Instead it should be possible to implement a small boot ROM which puts
>>>> the secondary CPUs into managed halt state without spinning, then the
>>>> boot CPU could send an IPI to a halted CPU to wake them up based on
>>>> the spin table, just like real HW would do. On Sparc32 OpenBIOS this
>>>> is something like a few lines of ASM on both sides.
>>>
>>> That sounds pretty close to what I had implemented in v1. Back then the only comment was to do it using this method from Scott. Maybe one day we will get u-boot support. Then u-boot will spin on the CPU itself and when that time comes, we can check if we can implement a prettier version.
>>>
>>> Btw, we can't do the IPI method without exposing something to the guest that u-boot would usually not expose. There simply is no event. All that happens is a write to memory to tell the other CPU that it should wake up. So while sending an IPI to the other CPU is the "clean" way to go, I agree, we can either be compatible or "clean". And if I get the choice I'm rather compatible.
>>
>> There are also warts in Sparc32 design, for example there is no
>> instruction to halt the CPU, instead a device (only available on some
>> models) can do it.
>
> Ugh, nice :)
>
>>
>>> So we have the choice between having code inside the guest that spins, maybe even only checks every x ms, by programming a timer, or we can try to make an event out of the memory write. V1 was the former, v2 (this one) is the latter. This version performs a lot better and is easier to understand.
>>
>> The abstraction layers should not be broken lightly, I suppose some
>> performance or laziness^Wlocal optimization reasons were behind vmport
>> design too. The ideal way to solve this could be to detect a spinning
>> CPU and optimize that for all architectures, that could be tricky
>> though (if a CPU remains in the same TB for extended periods, inspect
>> the TB: if it performs a loop with a single load instruction, replace
>> the load by a special wait operation for any memory stores to that
>> page).
>
> I agree.
>
> However, for now I'd like to have _something_ that we can easily replace later on. We don't do savevm or migration yet, so the danger of changing the device model from one version to the next is minimal. To the guest kernel, this is seamless, as the interface stays exactly the same.
>
> In fact, the whole kernel loading way we go today is pretty much wrong. We should rather do it similar to OpenBIOS where firmware always loads and then pulls the kernel from QEMU using a PV interface. At that point, we would have to implement such an optimization as you suggest. Or implement a hypercall :). But at least we'd always be running the same guest software stack.

Fully agree, also the hypercall stuff (especially OF tree handling)
could be pushed to OpenBIOS and make it the hypervisor one day.

> So what I'm suggesting is that for now, we're making progress and then scrap the device we're introducing here later on, when we move towards different models on how to initialize the machine. As it stands however, I much rather have working code here and concentrate on the 50 other places that are broken than optimize a case that already works well enough because it could be done prettier. Let's rather iterate over this interface again when we hit another road block. At that point in time, we'll have more experience with the shortcomings too.

OK, if we all agree that the interface is temporary. Maybe the device
file should include warnings about that.
Scott Wood - Sept. 26, 2011, 11:19 p.m.
On 09/24/2011 05:00 AM, Alexander Graf wrote:
> On 24.09.2011, at 10:44, Blue Swirl wrote:
>> On Sat, Sep 24, 2011 at 8:03 AM, Alexander Graf <agraf@suse.de> wrote:
>>> On 24.09.2011, at 09:41, Blue Swirl wrote:
>>>> On Mon, Sep 19, 2011 at 4:12 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>>> The goal with the spin table stuff, suboptimal as it is, was something
>>>>> that would work on any powerpc implementation.  Other
>>>>> implementation-specific release mechanisms are allowed, and are
>>>>> indicated by a property in the cpu node, but only if the loader knows
>>>>> that the OS supports it.
>>>>>
>>>>>> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.
>>>>>
>>>>> It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
>>>>> just released which clarifies some things such as WIMG.
>>>>>
>>>>>> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?
>>>>>
>>>>> https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf
>>>>
>>>> Chapter 5.5.2 describes the table. This is actually an interface
>>>> between OS and Open Firmware, obviously there can't be a real hardware
>>>> device that magically loads r3 etc.

Not Open Firmware, but rather an ePAPR-compliant loader.

>>>> The device method would break abstraction layers, 

Which abstraction layers?

>>>> it's much like
>>>> vmport stuff in x86. Using a hypercall would be a small improvement.
>>>> Instead it should be possible to implement a small boot ROM which puts
>>>> the secondary CPUs into managed halt state without spinning, then the
>>>> boot CPU could send an IPI to a halted CPU to wake them up based on
>>>> the spin table, just like real HW would do.

The spin table, with no IPI or halt state, is what real HW does (or
rather, what software does on real HW) today.  It's ugly and inefficient
but it should work everywhere.  Anything else would be dependent on a
specific HW implementation.

>>>> On Sparc32 OpenBIOS this
>>>> is something like a few lines of ASM on both sides.
>>>
>>> That sounds pretty close to what I had implemented in v1. Back then the only comment was to do it using this method from Scott.

I had some comments on the actual v1 implementation as well. :-)

>>> So we have the choice between having code inside the guest that
>>> spins, maybe even only checks every x ms, by programming a timer,
>>> or we can try to make an event out of the memory write. V1 was
>>> the former, v2 (this one) is the latter. This version performs a
>>> lot better and is easier to understand.
>>
>> The abstraction layers should not be broken lightly, I suppose some
>> performance or laziness^Wlocal optimization reasons were behind vmport
>> design too. The ideal way to solve this could be to detect a spinning
>> CPU and optimize that for all architectures, that could be tricky
>> though (if a CPU remains in the same TB for extended periods, inspect
>> the TB: if it performs a loop with a single load instruction, replace
>> the load by a special wait operation for any memory stores to that
>> page).

How's that going to work with KVM?

> In fact, the whole kernel loading way we go today is pretty much
> wrong. We should rather do it similar to OpenBIOS where firmware
> always loads and then pulls the kernel from QEMU using a PV
> interface. At that point, we would have to implement such an
> optimization as you suggest. Or implement a hypercall :). 

I think the current approach is more usable for most purposes.  If you
start U-Boot instead of a kernel, how do pass information on from the
user (kernel, rfs, etc)?  Require the user to create flash images[1]?
Maybe that's a useful mode of operation in some cases, but I don't think
we should be slavishly bound to it.  Think of the current approach as
something between whole-system and userspace emulation.

Where does the device tree come from?  How do you tell the guest about
what devices it has, especially in virtualization scenarios with non-PCI
passthrough devices, or custom qdev instantiations?

> But at least we'd always be running the same guest software stack.

No we wouldn't.  Any U-Boot that runs under QEMU would have to be
heavily modified, unless we want to implement a ton of random device
emulation, at least one extra memory translation layer (LAWs, localbus
windows, CCSRBAR, and such), hacks to allow locked cache lines to
operate despite a lack of backing store, etc.

-Scott

[1] Keep in mind that a major use case for e500 KVM is on host systems
that don't have a hard drive.  I want to *reduce* the amount of memory
we waste to store this stuff, not increase it.
Blue Swirl - Sept. 27, 2011, 3:50 p.m.
On Mon, Sep 26, 2011 at 11:19 PM, Scott Wood <scottwood@freescale.com> wrote:
> On 09/24/2011 05:00 AM, Alexander Graf wrote:
>> On 24.09.2011, at 10:44, Blue Swirl wrote:
>>> On Sat, Sep 24, 2011 at 8:03 AM, Alexander Graf <agraf@suse.de> wrote:
>>>> On 24.09.2011, at 09:41, Blue Swirl wrote:
>>>>> On Mon, Sep 19, 2011 at 4:12 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>>>> The goal with the spin table stuff, suboptimal as it is, was something
>>>>>> that would work on any powerpc implementation.  Other
>>>>>> implementation-specific release mechanisms are allowed, and are
>>>>>> indicated by a property in the cpu node, but only if the loader knows
>>>>>> that the OS supports it.
>>>>>>
>>>>>>> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.
>>>>>>
>>>>>> It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
>>>>>> just released which clarifies some things such as WIMG.
>>>>>>
>>>>>>> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?
>>>>>>
>>>>>> https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf
>>>>>
>>>>> Chapter 5.5.2 describes the table. This is actually an interface
>>>>> between OS and Open Firmware, obviously there can't be a real hardware
>>>>> device that magically loads r3 etc.
>
> Not Open Firmware, but rather an ePAPR-compliant loader.

'boot program to client program interface definition'.

>>>>> The device method would break abstraction layers,
>
> Which abstraction layers?

QEMU system emulation emulates hardware, not software. Hardware
devices don't touch CPU registers.

>>>>> it's much like
>>>>> vmport stuff in x86. Using a hypercall would be a small improvement.
>>>>> Instead it should be possible to implement a small boot ROM which puts
>>>>> the secondary CPUs into managed halt state without spinning, then the
>>>>> boot CPU could send an IPI to a halted CPU to wake them up based on
>>>>> the spin table, just like real HW would do.
>
> The spin table, with no IPI or halt state, is what real HW does (or
> rather, what software does on real HW) today.  It's ugly and inefficient
> but it should work everywhere.  Anything else would be dependent on a
> specific HW implementation.

Yes. Hardware doesn't ever implement the spin table.

>>>>> On Sparc32 OpenBIOS this
>>>>> is something like a few lines of ASM on both sides.
>>>>
>>>> That sounds pretty close to what I had implemented in v1. Back then the only comment was to do it using this method from Scott.
>
> I had some comments on the actual v1 implementation as well. :-)
>
>>>> So we have the choice between having code inside the guest that
>>>> spins, maybe even only checks every x ms, by programming a timer,
>>>> or we can try to make an event out of the memory write. V1 was
>>>> the former, v2 (this one) is the latter. This version performs a
>>>> lot better and is easier to understand.
>>>
>>> The abstraction layers should not be broken lightly, I suppose some
>>> performance or laziness^Wlocal optimization reasons were behind vmport
>>> design too. The ideal way to solve this could be to detect a spinning
>>> CPU and optimize that for all architectures, that could be tricky
>>> though (if a CPU remains in the same TB for extended periods, inspect
>>> the TB: if it performs a loop with a single load instruction, replace
>>> the load by a special wait operation for any memory stores to that
>>> page).
>
> How's that going to work with KVM?
>
>> In fact, the whole kernel loading way we go today is pretty much
>> wrong. We should rather do it similar to OpenBIOS where firmware
>> always loads and then pulls the kernel from QEMU using a PV
>> interface. At that point, we would have to implement such an
>> optimization as you suggest. Or implement a hypercall :).
>
> I think the current approach is more usable for most purposes.  If you
> start U-Boot instead of a kernel, how do pass information on from the
> user (kernel, rfs, etc)?  Require the user to create flash images[1]?

No, for example OpenBIOS gets the kernel command line from fw_cfg device.

> Maybe that's a useful mode of operation in some cases, but I don't think
> we should be slavishly bound to it.  Think of the current approach as
> something between whole-system and userspace emulation.

This is similar to ARM, M68k and Xtensa semi-hosting mode, but not at
kernel level but lower. Perhaps this mode should be enabled with
-semihosting flag or a new flag. Then the bare metal version could be
run without the flag.

> Where does the device tree come from?  How do you tell the guest about
> what devices it has, especially in virtualization scenarios with non-PCI
> passthrough devices, or custom qdev instantiations?
>
>> But at least we'd always be running the same guest software stack.
>
> No we wouldn't.  Any U-Boot that runs under QEMU would have to be
> heavily modified, unless we want to implement a ton of random device
> emulation, at least one extra memory translation layer (LAWs, localbus
> windows, CCSRBAR, and such), hacks to allow locked cache lines to
> operate despite a lack of backing store, etc.

I'd say HW emulation business as usual. Now with the new memory API,
it should be possible to emulate the caches with line locking and TLBs
etc., this was not previously possible. IIRC implementing locked cache
lines would allow x86 to boot unmodified coreboot.

> -Scott
>
> [1] Keep in mind that a major use case for e500 KVM is on host systems
> that don't have a hard drive.  I want to *reduce* the amount of memory
> we waste to store this stuff, not increase it.

Interesting use case. Is there a display device?
Alexander Graf - Sept. 27, 2011, 3:59 p.m.
On 27.09.2011, at 17:50, Blue Swirl wrote:

> On Mon, Sep 26, 2011 at 11:19 PM, Scott Wood <scottwood@freescale.com> wrote:
>> On 09/24/2011 05:00 AM, Alexander Graf wrote:
>>> On 24.09.2011, at 10:44, Blue Swirl wrote:
>>>> On Sat, Sep 24, 2011 at 8:03 AM, Alexander Graf <agraf@suse.de> wrote:
>>>>> On 24.09.2011, at 09:41, Blue Swirl wrote:
>>>>>> On Mon, Sep 19, 2011 at 4:12 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>>>>> The goal with the spin table stuff, suboptimal as it is, was something
>>>>>>> that would work on any powerpc implementation.  Other
>>>>>>> implementation-specific release mechanisms are allowed, and are
>>>>>>> indicated by a property in the cpu node, but only if the loader knows
>>>>>>> that the OS supports it.
>>>>>>> 
>>>>>>>> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.
>>>>>>> 
>>>>>>> It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
>>>>>>> just released which clarifies some things such as WIMG.
>>>>>>> 
>>>>>>>> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?
>>>>>>> 
>>>>>>> https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf
>>>>>> 
>>>>>> Chapter 5.5.2 describes the table. This is actually an interface
>>>>>> between OS and Open Firmware, obviously there can't be a real hardware
>>>>>> device that magically loads r3 etc.
>> 
>> Not Open Firmware, but rather an ePAPR-compliant loader.
> 
> 'boot program to client program interface definition'.
> 
>>>>>> The device method would break abstraction layers,
>> 
>> Which abstraction layers?
> 
> QEMU system emulation emulates hardware, not software. Hardware
> devices don't touch CPU registers.

The great part about this emulated device is that it's basically guest software running in host context. To the guest, it's not a device in the ordinary sense, such as vmport, but rather the same as software running on another core, just that the other core isn't running any software.

Sure, if you consider this a device, it does break abstraction layers. Just consider it as host running guest code, then it makes sense :).

> 
>>>>>> it's much like
>>>>>> vmport stuff in x86. Using a hypercall would be a small improvement.
>>>>>> Instead it should be possible to implement a small boot ROM which puts
>>>>>> the secondary CPUs into managed halt state without spinning, then the
>>>>>> boot CPU could send an IPI to a halted CPU to wake them up based on
>>>>>> the spin table, just like real HW would do.
>> 
>> The spin table, with no IPI or halt state, is what real HW does (or
>> rather, what software does on real HW) today.  It's ugly and inefficient
>> but it should work everywhere.  Anything else would be dependent on a
>> specific HW implementation.
> 
> Yes. Hardware doesn't ever implement the spin table.
> 
>>>>>> On Sparc32 OpenBIOS this
>>>>>> is something like a few lines of ASM on both sides.
>>>>> 
>>>>> That sounds pretty close to what I had implemented in v1. Back then the only comment was to do it using this method from Scott.
>> 
>> I had some comments on the actual v1 implementation as well. :-)
>> 
>>>>> So we have the choice between having code inside the guest that
>>>>> spins, maybe even only checks every x ms, by programming a timer,
>>>>> or we can try to make an event out of the memory write. V1 was
>>>>> the former, v2 (this one) is the latter. This version performs a
>>>>> lot better and is easier to understand.
>>>> 
>>>> The abstraction layers should not be broken lightly, I suppose some
>>>> performance or laziness^Wlocal optimization reasons were behind vmport
>>>> design too. The ideal way to solve this could be to detect a spinning
>>>> CPU and optimize that for all architectures, that could be tricky
>>>> though (if a CPU remains in the same TB for extended periods, inspect
>>>> the TB: if it performs a loop with a single load instruction, replace
>>>> the load by a special wait operation for any memory stores to that
>>>> page).
>> 
>> How's that going to work with KVM?
>> 
>>> In fact, the whole kernel loading way we go today is pretty much
>>> wrong. We should rather do it similar to OpenBIOS where firmware
>>> always loads and then pulls the kernel from QEMU using a PV
>>> interface. At that point, we would have to implement such an
>>> optimization as you suggest. Or implement a hypercall :).
>> 
>> I think the current approach is more usable for most purposes.  If you
>> start U-Boot instead of a kernel, how do pass information on from the
>> user (kernel, rfs, etc)?  Require the user to create flash images[1]?
> 
> No, for example OpenBIOS gets the kernel command line from fw_cfg device.
> 
>> Maybe that's a useful mode of operation in some cases, but I don't think
>> we should be slavishly bound to it.  Think of the current approach as
>> something between whole-system and userspace emulation.
> 
> This is similar to ARM, M68k and Xtensa semi-hosting mode, but not at
> kernel level but lower. Perhaps this mode should be enabled with
> -semihosting flag or a new flag. Then the bare metal version could be
> run without the flag.

and then we'd have 2 implementations for running in system emulation mode and need to maintain both. I don't think that scales very well.

> 
>> Where does the device tree come from?  How do you tell the guest about
>> what devices it has, especially in virtualization scenarios with non-PCI
>> passthrough devices, or custom qdev instantiations?
>> 
>>> But at least we'd always be running the same guest software stack.
>> 
>> No we wouldn't.  Any U-Boot that runs under QEMU would have to be
>> heavily modified, unless we want to implement a ton of random device
>> emulation, at least one extra memory translation layer (LAWs, localbus
>> windows, CCSRBAR, and such), hacks to allow locked cache lines to
>> operate despite a lack of backing store, etc.
> 
> I'd say HW emulation business as usual. Now with the new memory API,
> it should be possible to emulate the caches with line locking and TLBs
> etc., this was not previously possible. IIRC implementing locked cache
> lines would allow x86 to boot unmodified coreboot.

So how would you emulate cache lines with line locking on KVM?

However, we already have a number of hacks in SeaBIOS to run in QEMU, so I don't see an issue in adding a few here and there in u-boot. The memory pressure is a real issue though. I'm not sure how we'd manage that one. Maybe we could try to reuse the host u-boot binary? heh

> 
>> -Scott
>> 
>> [1] Keep in mind that a major use case for e500 KVM is on host systems
>> that don't have a hard drive.  I want to *reduce* the amount of memory
>> we waste to store this stuff, not increase it.
> 
> Interesting use case. Is there a display device?

There are some boards with display and/or PCI(e), yes.


Alex
Blue Swirl - Sept. 27, 2011, 4:53 p.m.
On Tue, Sep 27, 2011 at 3:59 PM, Alexander Graf <agraf@suse.de> wrote:
>
> On 27.09.2011, at 17:50, Blue Swirl wrote:
>
>> On Mon, Sep 26, 2011 at 11:19 PM, Scott Wood <scottwood@freescale.com> wrote:
>>> On 09/24/2011 05:00 AM, Alexander Graf wrote:
>>>> On 24.09.2011, at 10:44, Blue Swirl wrote:
>>>>> On Sat, Sep 24, 2011 at 8:03 AM, Alexander Graf <agraf@suse.de> wrote:
>>>>>> On 24.09.2011, at 09:41, Blue Swirl wrote:
>>>>>>> On Mon, Sep 19, 2011 at 4:12 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>>>>>> The goal with the spin table stuff, suboptimal as it is, was something
>>>>>>>> that would work on any powerpc implementation.  Other
>>>>>>>> implementation-specific release mechanisms are allowed, and are
>>>>>>>> indicated by a property in the cpu node, but only if the loader knows
>>>>>>>> that the OS supports it.
>>>>>>>>
>>>>>>>>> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.
>>>>>>>>
>>>>>>>> It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
>>>>>>>> just released which clarifies some things such as WIMG.
>>>>>>>>
>>>>>>>>> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?
>>>>>>>>
>>>>>>>> https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf
>>>>>>>
>>>>>>> Chapter 5.5.2 describes the table. This is actually an interface
>>>>>>> between OS and Open Firmware, obviously there can't be a real hardware
>>>>>>> device that magically loads r3 etc.
>>>
>>> Not Open Firmware, but rather an ePAPR-compliant loader.
>>
>> 'boot program to client program interface definition'.
>>
>>>>>>> The device method would break abstraction layers,
>>>
>>> Which abstraction layers?
>>
>> QEMU system emulation emulates hardware, not software. Hardware
>> devices don't touch CPU registers.
>
> The great part about this emulated device is that it's basically guest software running in host context. To the guest, it's not a device in the ordinary sense, such as vmport, but rather the same as software running on another core, just that the other core isn't running any software.
>
> Sure, if you consider this a device, it does break abstraction layers. Just consider it as host running guest code, then it makes sense :).
>
>>
>>>>>>> it's much like
>>>>>>> vmport stuff in x86. Using a hypercall would be a small improvement.
>>>>>>> Instead it should be possible to implement a small boot ROM which puts
>>>>>>> the secondary CPUs into managed halt state without spinning, then the
>>>>>>> boot CPU could send an IPI to a halted CPU to wake them up based on
>>>>>>> the spin table, just like real HW would do.
>>>
>>> The spin table, with no IPI or halt state, is what real HW does (or
>>> rather, what software does on real HW) today.  It's ugly and inefficient
>>> but it should work everywhere.  Anything else would be dependent on a
>>> specific HW implementation.
>>
>> Yes. Hardware doesn't ever implement the spin table.
>>
>>>>>>> On Sparc32 OpenBIOS this
>>>>>>> is something like a few lines of ASM on both sides.
>>>>>>
>>>>>> That sounds pretty close to what I had implemented in v1. Back then the only comment was to do it using this method from Scott.
>>>
>>> I had some comments on the actual v1 implementation as well. :-)
>>>
>>>>>> So we have the choice between having code inside the guest that
>>>>>> spins, maybe even only checks every x ms, by programming a timer,
>>>>>> or we can try to make an event out of the memory write. V1 was
>>>>>> the former, v2 (this one) is the latter. This version performs a
>>>>>> lot better and is easier to understand.
>>>>>
>>>>> The abstraction layers should not be broken lightly, I suppose some
>>>>> performance or laziness^Wlocal optimization reasons were behind vmport
>>>>> design too. The ideal way to solve this could be to detect a spinning
>>>>> CPU and optimize that for all architectures, that could be tricky
>>>>> though (if a CPU remains in the same TB for extended periods, inspect
>>>>> the TB: if it performs a loop with a single load instruction, replace
>>>>> the load by a special wait operation for any memory stores to that
>>>>> page).
>>>
>>> How's that going to work with KVM?
>>>
>>>> In fact, the whole kernel loading way we go today is pretty much
>>>> wrong. We should rather do it similar to OpenBIOS where firmware
>>>> always loads and then pulls the kernel from QEMU using a PV
>>>> interface. At that point, we would have to implement such an
>>>> optimization as you suggest. Or implement a hypercall :).
>>>
>>> I think the current approach is more usable for most purposes.  If you
>>> start U-Boot instead of a kernel, how do pass information on from the
>>> user (kernel, rfs, etc)?  Require the user to create flash images[1]?
>>
>> No, for example OpenBIOS gets the kernel command line from fw_cfg device.
>>
>>> Maybe that's a useful mode of operation in some cases, but I don't think
>>> we should be slavishly bound to it.  Think of the current approach as
>>> something between whole-system and userspace emulation.
>>
>> This is similar to ARM, M68k and Xtensa semi-hosting mode, but not at
>> kernel level but lower. Perhaps this mode should be enabled with
>> -semihosting flag or a new flag. Then the bare metal version could be
>> run without the flag.
>
> and then we'd have 2 implementations for running in system emulation mode and need to maintain both. I don't think that scales very well.

No, but such hacks are not common.

>>
>>> Where does the device tree come from?  How do you tell the guest about
>>> what devices it has, especially in virtualization scenarios with non-PCI
>>> passthrough devices, or custom qdev instantiations?
>>>
>>>> But at least we'd always be running the same guest software stack.
>>>
>>> No we wouldn't.  Any U-Boot that runs under QEMU would have to be
>>> heavily modified, unless we want to implement a ton of random device
>>> emulation, at least one extra memory translation layer (LAWs, localbus
>>> windows, CCSRBAR, and such), hacks to allow locked cache lines to
>>> operate despite a lack of backing store, etc.
>>
>> I'd say HW emulation business as usual. Now with the new memory API,
>> it should be possible to emulate the caches with line locking and TLBs
>> etc., this was not previously possible. IIRC implementing locked cache
>> lines would allow x86 to boot unmodified coreboot.
>
> So how would you emulate cache lines with line locking on KVM?

The cache would be a MMIO device which registers to handle all memory
space. Configuring the cache controller changes how the device
operates. Put this device between CPU and memory and other devices.
Performance would probably be horrible, so CPU should disable the
device automatically after some time.

> However, we already have a number of hacks in SeaBIOS to run in QEMU, so I don't see an issue in adding a few here and there in u-boot. The memory pressure is a real issue though. I'm not sure how we'd manage that one. Maybe we could try and reuse the host u-boot binary? heh

I don't think SeaBIOS breaks layering except for fw_cfg. For extremely
memory limited situation, perhaps QEMU (or Native KVM Tool for lean
and mean version) could be run without glibc, inside kernel or even
interfacing directly with the hypervisor. I'd also continue making it
possible to disable building unused devices and features.

>>
>>> -Scott
>>>
>>> [1] Keep in mind that a major use case for e500 KVM is on host systems
>>> that don't have a hard drive.  I want to *reduce* the amount of memory
>>> we waste to store this stuff, not increase it.
>>
>> Interesting use case. Is there a display device?
>
> There are some boards with display and/or PCI(e), yes.
>
>
> Alex
>
>
Richard Henderson - Sept. 27, 2011, 5:01 p.m.
On 09/27/2011 09:53 AM, Blue Swirl wrote:
>> > So how would you emulate cache lines with line locking on KVM?
> The cache would be a MMIO device which registers to handle all memory
> space. Configuring the cache controller changes how the device
> operates. Put this device between CPU and memory and other devices.
> Performance would probably be horrible, so CPU should disable the
> device automatically after some time.
> 

Seems like a better alternative would be to add an mmio device when
a line is actually locked.  And the device would cover *only* the
locked line.  I assume that following the boot process these lines
are unlocked, and the normal running state of the system would have
none of these mmio devices active.


r~
Alexander Graf - Sept. 27, 2011, 5:03 p.m.
On 27.09.2011, at 18:53, Blue Swirl wrote:

> On Tue, Sep 27, 2011 at 3:59 PM, Alexander Graf <agraf@suse.de> wrote:
>> 
>> On 27.09.2011, at 17:50, Blue Swirl wrote:
>> 
>>> On Mon, Sep 26, 2011 at 11:19 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>> On 09/24/2011 05:00 AM, Alexander Graf wrote:
>>>>> On 24.09.2011, at 10:44, Blue Swirl wrote:
>>>>>> On Sat, Sep 24, 2011 at 8:03 AM, Alexander Graf <agraf@suse.de> wrote:
>>>>>>> On 24.09.2011, at 09:41, Blue Swirl wrote:
>>>>>>>> On Mon, Sep 19, 2011 at 4:12 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>>>>>>> The goal with the spin table stuff, suboptimal as it is, was something
>>>>>>>>> that would work on any powerpc implementation.  Other
>>>>>>>>> implementation-specific release mechanisms are allowed, and are
>>>>>>>>> indicated by a property in the cpu node, but only if the loader knows
>>>>>>>>> that the OS supports it.
>>>>>>>>> 
>>>>>>>>>> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.
>>>>>>>>> 
>>>>>>>>> It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
>>>>>>>>> just released which clarifies some things such as WIMG.
>>>>>>>>> 
>>>>>>>>>> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?
>>>>>>>>> 
>>>>>>>>> https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf
>>>>>>>> 
>>>>>>>> Chapter 5.5.2 describes the table. This is actually an interface
>>>>>>>> between OS and Open Firmware, obviously there can't be a real hardware
>>>>>>>> device that magically loads r3 etc.
>>>> 
>>>> Not Open Firmware, but rather an ePAPR-compliant loader.
>>> 
>>> 'boot program to client program interface definition'.
>>> 
>>>>>>>> The device method would break abstraction layers,
>>>> 
>>>> Which abstraction layers?
>>> 
>>> QEMU system emulation emulates hardware, not software. Hardware
>>> devices don't touch CPU registers.
>> 
>> The great part about this emulated device is that it's basically guest software running in host context. To the guest, it's not a device in the ordinary sense, such as vmport, but rather the same as software running on another core, just that the other core isn't running any software.
>> 
>> Sure, if you consider this a device, it does break abstraction layers. Just consider it as host running guest code, then it makes sense :).
>> 
>>> 
>>>>>>>> it's much like
>>>>>>>> vmport stuff in x86. Using a hypercall would be a small improvement.
>>>>>>>> Instead it should be possible to implement a small boot ROM which puts
>>>>>>>> the secondary CPUs into managed halt state without spinning, then the
>>>>>>>> boot CPU could send an IPI to a halted CPU to wake them up based on
>>>>>>>> the spin table, just like real HW would do.
>>>> 
>>>> The spin table, with no IPI or halt state, is what real HW does (or
>>>> rather, what software does on real HW) today.  It's ugly and inefficient
>>>> but it should work everywhere.  Anything else would be dependent on a
>>>> specific HW implementation.
>>> 
>>> Yes. Hardware doesn't ever implement the spin table.
>>> 
>>>>>>>> On Sparc32 OpenBIOS this
>>>>>>>> is something like a few lines of ASM on both sides.
>>>>>>> 
>>>>>>> That sounds pretty close to what I had implemented in v1. Back then the only comment was to do it using this method from Scott.
>>>> 
>>>> I had some comments on the actual v1 implementation as well. :-)
>>>> 
>>>>>>> So we have the choice between having code inside the guest that
>>>>>>> spins, maybe even only checks every x ms, by programming a timer,
>>>>>>> or we can try to make an event out of the memory write. V1 was
>>>>>>> the former, v2 (this one) is the latter. This version performs a
>>>>>>> lot better and is easier to understand.
>>>>>> 
>>>>>> The abstraction layers should not be broken lightly, I suppose some
>>>>>> performance or laziness^Wlocal optimization reasons were behind vmport
>>>>>> design too. The ideal way to solve this could be to detect a spinning
>>>>>> CPU and optimize that for all architectures, that could be tricky
>>>>>> though (if a CPU remains in the same TB for extended periods, inspect
>>>>>> the TB: if it performs a loop with a single load instruction, replace
>>>>>> the load by a special wait operation for any memory stores to that
>>>>>> page).
>>>> 
>>>> How's that going to work with KVM?
>>>> 
>>>>> In fact, the whole kernel loading way we go today is pretty much
>>>>> wrong. We should rather do it similar to OpenBIOS where firmware
>>>>> always loads and then pulls the kernel from QEMU using a PV
>>>>> interface. At that point, we would have to implement such an
>>>>> optimization as you suggest. Or implement a hypercall :).
>>>> 
>>>> I think the current approach is more usable for most purposes.  If you
>>>>> start U-Boot instead of a kernel, how do you pass information on from the
>>>> user (kernel, rfs, etc)?  Require the user to create flash images[1]?
>>> 
>>> No, for example OpenBIOS gets the kernel command line from fw_cfg device.
>>> 
>>>> Maybe that's a useful mode of operation in some cases, but I don't think
>>>> we should be slavishly bound to it.  Think of the current approach as
>>>> something between whole-system and userspace emulation.
>>> 
>>> This is similar to ARM, M68k and Xtensa semi-hosting mode, but not at
>>> kernel level but lower. Perhaps this mode should be enabled with
>>> -semihosting flag or a new flag. Then the bare metal version could be
>>> run without the flag.
>> 
>> and then we'd have 2 implementations for running in system emulation mode and need to maintain both. I don't think that scales very well.
> 
> No, but such hacks are not common.
> 
>>> 
>>>> Where does the device tree come from?  How do you tell the guest about
>>>> what devices it has, especially in virtualization scenarios with non-PCI
>>>> passthrough devices, or custom qdev instantiations?
>>>> 
>>>>> But at least we'd always be running the same guest software stack.
>>>> 
>>>> No we wouldn't.  Any U-Boot that runs under QEMU would have to be
>>>> heavily modified, unless we want to implement a ton of random device
>>>> emulation, at least one extra memory translation layer (LAWs, localbus
>>>> windows, CCSRBAR, and such), hacks to allow locked cache lines to
>>>> operate despite a lack of backing store, etc.
>>> 
>>> I'd say HW emulation business as usual. Now with the new memory API,
>>> it should be possible to emulate the caches with line locking and TLBs
>>> etc., this was not previously possible. IIRC implementing locked cache
>>> lines would allow x86 to boot unmodified coreboot.
>> 
>> So how would you emulate cache lines with line locking on KVM?
> 
> The cache would be a MMIO device which registers to handle all memory
> space. Configuring the cache controller changes how the device
> operates. Put this device between CPU and memory and other devices.
> Performance would probably be horrible, so CPU should disable the
> device automatically after some time.

So how would you execute code on this region then? :)

> 
>> However, we already have a number of hacks in SeaBIOS to run in QEMU, so I don't see an issue in adding a few here and there in u-boot. The memory pressure is a real issue though. I'm not sure how we'd manage that one. Maybe we could try and reuse the host u-boot binary? heh
> 
> I don't think SeaBIOS breaks layering except for fw_cfg.

I'm not saying we're breaking layering there. I'm saying that changing u-boot is not so bad, since it's the same as we do with SeaBIOS. It was an argument in favor of your position.

> For extremely
> memory limited situation, perhaps QEMU (or Native KVM Tool for lean
> and mean version) could be run without glibc, inside kernel or even
> interfacing directly with the hypervisor. I'd also continue making it
> possible to disable building unused devices and features.

I'm pretty sure you're not the only one with that goal ;).


Alex
Blue Swirl - Sept. 27, 2011, 5:17 p.m.
On Tue, Sep 27, 2011 at 5:01 PM, Richard Henderson <rth@twiddle.net> wrote:
> On 09/27/2011 09:53 AM, Blue Swirl wrote:
>>> > So how would you emulate cache lines with line locking on KVM?
>> The cache would be a MMIO device which registers to handle all memory
>> space. Configuring the cache controller changes how the device
>> operates. Put this device between CPU and memory and other devices.
>> Performance would probably be horrible, so CPU should disable the
>> device automatically after some time.
>>
>
> Seems like a better alternative would be to add an mmio device when
> a line is actually locked.  And the device would cover *only* the
> locked line.  I assume that following the boot process these lines
> are unlocked, and the normal running state of the system would have
> none of these mmio devices active.

The BIOS may also attempt to perform tests with the cache device,
probe for cache sizes or read back I/D TLB lines via diagnostic modes.
That wouldn't work in your approach.
Richard Henderson - Sept. 27, 2011, 5:19 p.m.
On 09/27/2011 10:17 AM, Blue Swirl wrote:
> On Tue, Sep 27, 2011 at 5:01 PM, Richard Henderson <rth@twiddle.net> wrote:
>> On 09/27/2011 09:53 AM, Blue Swirl wrote:
>>>>> So how would you emulate cache lines with line locking on KVM?
>>> The cache would be a MMIO device which registers to handle all memory
>>> space. Configuring the cache controller changes how the device
>>> operates. Put this device between CPU and memory and other devices.
>>> Performance would probably be horrible, so CPU should disable the
>>> device automatically after some time.
>>>
>>
>> Seems like a better alternative would be to add an mmio device when
>> a line is actually locked.  And the device would cover *only* the
>> locked line.  I assume that following the boot process these lines
>> are unlocked, and the normal running state of the system would have
>> none of these mmio devices active.
> 
> The BIOS may also attempt to perform tests with the cache device,
> probe for cache sizes or read back I/D TLB lines via diagnostic modes.
> That wouldn't work in your approach.

Err... why not?


r~
Blue Swirl - Sept. 27, 2011, 5:20 p.m.
On Tue, Sep 27, 2011 at 5:03 PM, Alexander Graf <agraf@suse.de> wrote:
>
> On 27.09.2011, at 18:53, Blue Swirl wrote:
>
>> On Tue, Sep 27, 2011 at 3:59 PM, Alexander Graf <agraf@suse.de> wrote:
>>>
>>> On 27.09.2011, at 17:50, Blue Swirl wrote:
>>>
>>>> On Mon, Sep 26, 2011 at 11:19 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>>> On 09/24/2011 05:00 AM, Alexander Graf wrote:
>>>>>> On 24.09.2011, at 10:44, Blue Swirl wrote:
>>>>>>> On Sat, Sep 24, 2011 at 8:03 AM, Alexander Graf <agraf@suse.de> wrote:
>>>>>>>> On 24.09.2011, at 09:41, Blue Swirl wrote:
>>>>>>>>> On Mon, Sep 19, 2011 at 4:12 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>>>>>>>> The goal with the spin table stuff, suboptimal as it is, was something
>>>>>>>>>> that would work on any powerpc implementation.  Other
>>>>>>>>>> implementation-specific release mechanisms are allowed, and are
>>>>>>>>>> indicated by a property in the cpu node, but only if the loader knows
>>>>>>>>>> that the OS supports it.
>>>>>>>>>>
>>>>>>>>>>> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.
>>>>>>>>>>
>>>>>>>>>> It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
>>>>>>>>>> just released which clarifies some things such as WIMG.
>>>>>>>>>>
>>>>>>>>>>> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?
>>>>>>>>>>
>>>>>>>>>> https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf
>>>>>>>>>
>>>>>>>>> Chapter 5.5.2 describes the table. This is actually an interface
>>>>>>>>> between OS and Open Firmware, obviously there can't be a real hardware
>>>>>>>>> device that magically loads r3 etc.
>>>>>
>>>>> Not Open Firmware, but rather an ePAPR-compliant loader.
>>>>
>>>> 'boot program to client program interface definition'.
>>>>
>>>>>>>>> The device method would break abstraction layers,
>>>>>
>>>>> Which abstraction layers?
>>>>
>>>> QEMU system emulation emulates hardware, not software. Hardware
>>>> devices don't touch CPU registers.
>>>
>>> The great part about this emulated device is that it's basically guest software running in host context. To the guest, it's not a device in the ordinary sense, such as vmport, but rather the same as software running on another core, just that the other core isn't running any software.
>>>
>>> Sure, if you consider this a device, it does break abstraction layers. Just consider it as host running guest code, then it makes sense :).
>>>
>>>>
>>>>>>>>> it's much like
>>>>>>>>> vmport stuff in x86. Using a hypercall would be a small improvement.
>>>>>>>>> Instead it should be possible to implement a small boot ROM which puts
>>>>>>>>> the secondary CPUs into managed halt state without spinning, then the
>>>>>>>>> boot CPU could send an IPI to a halted CPU to wake them up based on
>>>>>>>>> the spin table, just like real HW would do.
>>>>>
>>>>> The spin table, with no IPI or halt state, is what real HW does (or
>>>>> rather, what software does on real HW) today.  It's ugly and inefficient
>>>>> but it should work everywhere.  Anything else would be dependent on a
>>>>> specific HW implementation.
>>>>
>>>> Yes. Hardware doesn't ever implement the spin table.
>>>>
>>>>>>>>> On Sparc32 OpenBIOS this
>>>>>>>>> is something like a few lines of ASM on both sides.
>>>>>>>>
>>>>>>>> That sounds pretty close to what I had implemented in v1. Back then the only comment was to do it using this method from Scott.
>>>>>
>>>>> I had some comments on the actual v1 implementation as well. :-)
>>>>>
>>>>>>>> So we have the choice between having code inside the guest that
>>>>>>>> spins, maybe even only checks every x ms, by programming a timer,
>>>>>>>> or we can try to make an event out of the memory write. V1 was
>>>>>>>> the former, v2 (this one) is the latter. This version performs a
>>>>>>>> lot better and is easier to understand.
>>>>>>>
>>>>>>> The abstraction layers should not be broken lightly, I suppose some
>>>>>>> performance or laziness^Wlocal optimization reasons were behind vmport
>>>>>>> design too. The ideal way to solve this could be to detect a spinning
>>>>>>> CPU and optimize that for all architectures, that could be tricky
>>>>>>> though (if a CPU remains in the same TB for extended periods, inspect
>>>>>>> the TB: if it performs a loop with a single load instruction, replace
>>>>>>> the load by a special wait operation for any memory stores to that
>>>>>>> page).
>>>>>
>>>>> How's that going to work with KVM?
>>>>>
>>>>>> In fact, the whole kernel loading way we go today is pretty much
>>>>>> wrong. We should rather do it similar to OpenBIOS where firmware
>>>>>> always loads and then pulls the kernel from QEMU using a PV
>>>>>> interface. At that point, we would have to implement such an
>>>>>> optimization as you suggest. Or implement a hypercall :).
>>>>>
>>>>> I think the current approach is more usable for most purposes.  If you
>>>>> start U-Boot instead of a kernel, how do you pass information on from the
>>>>> user (kernel, rfs, etc)?  Require the user to create flash images[1]?
>>>>
>>>> No, for example OpenBIOS gets the kernel command line from fw_cfg device.
>>>>
>>>>> Maybe that's a useful mode of operation in some cases, but I don't think
>>>>> we should be slavishly bound to it.  Think of the current approach as
>>>>> something between whole-system and userspace emulation.
>>>>
>>>> This is similar to ARM, M68k and Xtensa semi-hosting mode, but not at
>>>> kernel level but lower. Perhaps this mode should be enabled with
>>>> -semihosting flag or a new flag. Then the bare metal version could be
>>>> run without the flag.
>>>
>>> and then we'd have 2 implementations for running in system emulation mode and need to maintain both. I don't think that scales very well.
>>
>> No, but such hacks are not common.
>>
>>>>
>>>>> Where does the device tree come from?  How do you tell the guest about
>>>>> what devices it has, especially in virtualization scenarios with non-PCI
>>>>> passthrough devices, or custom qdev instantiations?
>>>>>
>>>>>> But at least we'd always be running the same guest software stack.
>>>>>
>>>>> No we wouldn't.  Any U-Boot that runs under QEMU would have to be
>>>>> heavily modified, unless we want to implement a ton of random device
>>>>> emulation, at least one extra memory translation layer (LAWs, localbus
>>>>> windows, CCSRBAR, and such), hacks to allow locked cache lines to
>>>>> operate despite a lack of backing store, etc.
>>>>
>>>> I'd say HW emulation business as usual. Now with the new memory API,
>>>> it should be possible to emulate the caches with line locking and TLBs
>>>> etc., this was not previously possible. IIRC implementing locked cache
>>>> lines would allow x86 to boot unmodified coreboot.
>>>
>>> So how would you emulate cache lines with line locking on KVM?
>>
>> The cache would be a MMIO device which registers to handle all memory
>> space. Configuring the cache controller changes how the device
>> operates. Put this device between CPU and memory and other devices.
>> Performance would probably be horrible, so CPU should disable the
>> device automatically after some time.
>
> So how would you execute code on this region then? :)

Easy, fix QEMU to allow executing from MMIO. (Yeah, I forgot about that).

>>
>>> However, we already have a number of hacks in SeaBIOS to run in QEMU, so I don't see an issue in adding a few here and there in u-boot. The memory pressure is a real issue though. I'm not sure how we'd manage that one. Maybe we could try and reuse the host u-boot binary? heh
>>
>> I don't think SeaBIOS breaks layering except for fw_cfg.
>
> I'm not saying we're breaking layering there. I'm saying that changing u-boot is not so bad, since it's the same as we do with SeaBIOS. It was an argument in favor of your position.

Never mind then ;-)

>> For extremely
>> memory limited situation, perhaps QEMU (or Native KVM Tool for lean
>> and mean version) could be run without glibc, inside kernel or even
>> interfacing directly with the hypervisor. I'd also continue making it
>> possible to disable building unused devices and features.
>
> I'm pretty sure you're not the only one with that goal ;).

Great, let's do it.
Alexander Graf - Sept. 27, 2011, 5:23 p.m.
On 27.09.2011, at 19:20, Blue Swirl wrote:

> On Tue, Sep 27, 2011 at 5:03 PM, Alexander Graf <agraf@suse.de> wrote:
>> 
>> On 27.09.2011, at 18:53, Blue Swirl wrote:
>> 
>>> On Tue, Sep 27, 2011 at 3:59 PM, Alexander Graf <agraf@suse.de> wrote:
>>>> 
>>>> On 27.09.2011, at 17:50, Blue Swirl wrote:
>>>> 
>>>>> On Mon, Sep 26, 2011 at 11:19 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>>>> On 09/24/2011 05:00 AM, Alexander Graf wrote:
>>>>>>> On 24.09.2011, at 10:44, Blue Swirl wrote:
>>>>>>>> On Sat, Sep 24, 2011 at 8:03 AM, Alexander Graf <agraf@suse.de> wrote:
>>>>>>>>> On 24.09.2011, at 09:41, Blue Swirl wrote:
>>>>>>>>>> On Mon, Sep 19, 2011 at 4:12 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>>>>>>>>> The goal with the spin table stuff, suboptimal as it is, was something
>>>>>>>>>>> that would work on any powerpc implementation.  Other
>>>>>>>>>>> implementation-specific release mechanisms are allowed, and are
>>>>>>>>>>> indicated by a property in the cpu node, but only if the loader knows
>>>>>>>>>>> that the OS supports it.
>>>>>>>>>>> 
>>>>>>>>>>>> IIUC the spec that includes these bits is not finalized yet. It is however in use on all u-boot versions for e500 that I'm aware of and the method Linux uses to bring up secondary CPUs.
>>>>>>>>>>> 
>>>>>>>>>>> It's in ePAPR 1.0, which has been out for a while now.  ePAPR 1.1 was
>>>>>>>>>>> just released which clarifies some things such as WIMG.
>>>>>>>>>>> 
>>>>>>>>>>>> Stuart / Scott, do you have any pointers to documentation where the spinning is explained?
>>>>>>>>>>> 
>>>>>>>>>>> https://www.power.org/resources/downloads/Power_ePAPR_APPROVED_v1.1.pdf
>>>>>>>>>> 
>>>>>>>>>> Chapter 5.5.2 describes the table. This is actually an interface
>>>>>>>>>> between OS and Open Firmware, obviously there can't be a real hardware
>>>>>>>>>> device that magically loads r3 etc.
>>>>>> 
>>>>>> Not Open Firmware, but rather an ePAPR-compliant loader.
>>>>> 
>>>>> 'boot program to client program interface definition'.
>>>>> 
>>>>>>>>>> The device method would break abstraction layers,
>>>>>> 
>>>>>> Which abstraction layers?
>>>>> 
>>>>> QEMU system emulation emulates hardware, not software. Hardware
>>>>> devices don't touch CPU registers.
>>>> 
>>>> The great part about this emulated device is that it's basically guest software running in host context. To the guest, it's not a device in the ordinary sense, such as vmport, but rather the same as software running on another core, just that the other core isn't running any software.
>>>> 
>>>> Sure, if you consider this a device, it does break abstraction layers. Just consider it as host running guest code, then it makes sense :).
>>>> 
>>>>> 
>>>>>>>>>> it's much like
>>>>>>>>>> vmport stuff in x86. Using a hypercall would be a small improvement.
>>>>>>>>>> Instead it should be possible to implement a small boot ROM which puts
>>>>>>>>>> the secondary CPUs into managed halt state without spinning, then the
>>>>>>>>>> boot CPU could send an IPI to a halted CPU to wake them up based on
>>>>>>>>>> the spin table, just like real HW would do.
>>>>>> 
>>>>>> The spin table, with no IPI or halt state, is what real HW does (or
>>>>>> rather, what software does on real HW) today.  It's ugly and inefficient
>>>>>> but it should work everywhere.  Anything else would be dependent on a
>>>>>> specific HW implementation.
>>>>> 
>>>>> Yes. Hardware doesn't ever implement the spin table.
>>>>> 
>>>>>>>>>> On Sparc32 OpenBIOS this
>>>>>>>>>> is something like a few lines of ASM on both sides.
>>>>>>>>> 
>>>>>>>>> That sounds pretty close to what I had implemented in v1. Back then the only comment was to do it using this method from Scott.
>>>>>> 
>>>>>> I had some comments on the actual v1 implementation as well. :-)
>>>>>> 
>>>>>>>>> So we have the choice between having code inside the guest that
>>>>>>>>> spins, maybe even only checks every x ms, by programming a timer,
>>>>>>>>> or we can try to make an event out of the memory write. V1 was
>>>>>>>>> the former, v2 (this one) is the latter. This version performs a
>>>>>>>>> lot better and is easier to understand.
>>>>>>>> 
>>>>>>>> The abstraction layers should not be broken lightly, I suppose some
>>>>>>>> performance or laziness^Wlocal optimization reasons were behind vmport
>>>>>>>> design too. The ideal way to solve this could be to detect a spinning
>>>>>>>> CPU and optimize that for all architectures, that could be tricky
>>>>>>>> though (if a CPU remains in the same TB for extended periods, inspect
>>>>>>>> the TB: if it performs a loop with a single load instruction, replace
>>>>>>>> the load by a special wait operation for any memory stores to that
>>>>>>>> page).
>>>>>> 
>>>>>> How's that going to work with KVM?
>>>>>> 
>>>>>>> In fact, the whole kernel loading way we go today is pretty much
>>>>>>> wrong. We should rather do it similar to OpenBIOS where firmware
>>>>>>> always loads and then pulls the kernel from QEMU using a PV
>>>>>>> interface. At that point, we would have to implement such an
>>>>>>> optimization as you suggest. Or implement a hypercall :).
>>>>>> 
>>>>>> I think the current approach is more usable for most purposes.  If you
>>>>>> start U-Boot instead of a kernel, how do pass information on from the
>>>>>> user (kernel, rfs, etc)?  Require the user to create flash images[1]?
>>>>> 
>>>>> No, for example OpenBIOS gets the kernel command line from fw_cfg device.
>>>>> 
>>>>>> Maybe that's a useful mode of operation in some cases, but I don't think
>>>>>> we should be slavishly bound to it.  Think of the current approach as
>>>>>> something between whole-system and userspace emulation.
>>>>> 
>>>>> This is similar to ARM, M68k and Xtensa semi-hosting mode, but not at
>>>>> kernel level but lower. Perhaps this mode should be enabled with
>>>>> -semihosting flag or a new flag. Then the bare metal version could be
>>>>> run without the flag.
>>>> 
>>>> and then we'd have 2 implementations for running in system emulation mode and need to maintain both. I don't think that scales very well.
>>> 
>>> No, but such hacks are not common.
>>> 
>>>>> 
>>>>>> Where does the device tree come from?  How do you tell the guest about
>>>>>> what devices it has, especially in virtualization scenarios with non-PCI
>>>>>> passthrough devices, or custom qdev instantiations?
>>>>>> 
>>>>>>> But at least we'd always be running the same guest software stack.
>>>>>> 
>>>>>> No we wouldn't.  Any U-Boot that runs under QEMU would have to be
>>>>>> heavily modified, unless we want to implement a ton of random device
>>>>>> emulation, at least one extra memory translation layer (LAWs, localbus
>>>>>> windows, CCSRBAR, and such), hacks to allow locked cache lines to
>>>>>> operate despite a lack of backing store, etc.
>>>>> 
>>>>> I'd say HW emulation business as usual. Now with the new memory API,
>>>>> it should be possible to emulate the caches with line locking and TLBs
>>>>> etc., this was not previously possible. IIRC implementing locked cache
>>>>> lines would allow x86 to boot unmodified coreboot.
>>>> 
>>>> So how would you emulate cache lines with line locking on KVM?
>>> 
>>> The cache would be a MMIO device which registers to handle all memory
>>> space. Configuring the cache controller changes how the device
>>> operates. Put this device between CPU and memory and other devices.
>>> Performance would probably be horrible, so CPU should disable the
>>> device automatically after some time.
>> 
>> So how would you execute code on this region then? :)
> 
> Easy, fix QEMU to allow executing from MMIO. (Yeah, I forgot about that).

It's not quite as easy to fix KVM to do the same though unfortunately. We'd have to either implement a full instruction emulator in the kernel (x86 style) or transfer all state from KVM into QEMU to execute it there (hell breaks loose). Both alternatives are not exactly appealing.

> 
>>> 
>>>> However, we already have a number of hacks in SeaBIOS to run in QEMU, so I don't see an issue in adding a few here and there in u-boot. The memory pressure is a real issue though. I'm not sure how we'd manage that one. Maybe we could try and reuse the host u-boot binary? heh
>>> 
>>> I don't think SeaBIOS breaks layering except for fw_cfg.
>> 
>> I'm not saying we're breaking layering there. I'm saying that changing u-boot is not so bad, since it's the same as we do with SeaBIOS. It was an argument in favor of your position.
> 
> Never mind then ;-)
> 
>>> For extremely
>>> memory limited situation, perhaps QEMU (or Native KVM Tool for lean
>>> and mean version) could be run without glibc, inside kernel or even
>>> interfacing directly with the hypervisor. I'd also continue making it
>>> possible to disable building unused devices and features.
>> 
>> I'm pretty sure you're not the only one with that goal ;).
> 
> Great, let's do it.

VGA comes first :)


Alex
Blue Swirl - Sept. 27, 2011, 5:23 p.m.
On Tue, Sep 27, 2011 at 5:19 PM, Richard Henderson <rth@twiddle.net> wrote:
> On 09/27/2011 10:17 AM, Blue Swirl wrote:
>> On Tue, Sep 27, 2011 at 5:01 PM, Richard Henderson <rth@twiddle.net> wrote:
>>> On 09/27/2011 09:53 AM, Blue Swirl wrote:
>>>>>> So how would you emulate cache lines with line locking on KVM?
>>>> The cache would be a MMIO device which registers to handle all memory
>>>> space. Configuring the cache controller changes how the device
>>>> operates. Put this device between CPU and memory and other devices.
>>>> Performance would probably be horrible, so CPU should disable the
>>>> device automatically after some time.
>>>>
>>>
>>> Seems like a better alternative would be to add an mmio device when
>>> a line is actually locked.  And the device would cover *only* the
>>> locked line.  I assume that following the boot process these lines
>>> are unlocked, and the normal running state of the system would have
>>> none of these mmio devices active.
>>
>> The BIOS may also attempt to perform tests with the cache device,
>> probe for cache sizes or read back I/D TLB lines via diagnostic modes.
>> That wouldn't work in your approach.
>
> Err... why not?

This is not related to the locked cache line mode. The BIOS could just
perform ordinary writes and reads from random memory addresses and
expect that the cache diagnostics registers change accordingly. The
cache device would have to cover all of memory to catch the accesses
and then update the registers.
Scott Wood - Sept. 27, 2011, 5:58 p.m.
On 09/27/2011 12:03 PM, Alexander Graf wrote:
> On 27.09.2011, at 18:53, Blue Swirl wrote:
>> On Tue, Sep 27, 2011 at 3:59 PM, Alexander Graf <agraf@suse.de> wrote:
>>> On 27.09.2011, at 17:50, Blue Swirl wrote:
>>>> On Mon, Sep 26, 2011 at 11:19 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>>> I think the current approach is more usable for most purposes.  If you
>>>>> start U-Boot instead of a kernel, how do you pass information on from the
>>>>> user (kernel, rfs, etc)?  Require the user to create flash images[1]?
>>>>
>>>> No, for example OpenBIOS gets the kernel command line from fw_cfg device.

Is that really so different from making QEMU act as an ePAPR-compliant
loader?

A big difference here is that on x86 you have a large quantity of
runtime services provided by the BIOS, which I can certainly understand
not wanting to shove into QEMU (though I'd still think you'd want to
hack out the initialization parts of the BIOS that depend on special
cache behavior, RAM controllers, and such, if at all practical).  This
spin table stuff is the only runtime service provided by an ePAPR loader.

>>>>> Maybe that's a useful mode of operation in some cases, but I don't think
>>>>> we should be slavishly bound to it.  Think of the current approach as
>>>>> something between whole-system and userspace emulation.
>>>>
>>>> This is similar to ARM, M68k and Xtensa semi-hosting mode, but not at
>>>> kernel level but lower. Perhaps this mode should be enabled with
>>>> -semihosting flag or a new flag. Then the bare metal version could be
>>>> run without the flag.
>>>
>>> and then we'd have 2 implementations for running in system emulation mode and need to maintain both. I don't think that scales very well.
>>
>> No, but such hacks are not common.

How much would actually need to be duplicated?  Seems like other than
turning some features off (a smaller set of devices, and not needing
extra address translation/cache layers), you'd just have a different way
of loading/starting the guest.  Most of that code would be common to all
ePAPR targets (with a few HW-specific variants for the MMU setup portion).

>>> So how would you emulate cache lines with line locking on KVM?
>>
>> The cache would be a MMIO device which registers to handle all memory
>> space. Configuring the cache controller changes how the device
>> operates. Put this device between CPU and memory and other devices.
>> Performance would probably be horrible, so CPU should disable the
>> device automatically after some time.
> 
> So how would you execute code on this region then? :)

I think U-Boot only does this for data, not code.  But U-Boot will want
to execute out of flash, and it will want to be able to move/resize the
flash window using the localbus controller (can't treat it as a
statically located rom image).

Easier would be to run a U-Boot that assumes RAM is already configured,
which we support for booting from sources other than NOR flash (would be
some minor tweakage to decouple it from those boot scenarios).  There'd
still be a lot of random I/O that needs to be emulated, or avoided with
changes to U-Boot.  And there'd still be the question of where the
device tree comes from, how information gets passed on from qemu, etc.

Full system emulation at that level would be an interesting project and
have its uses, but it would be a lot of work and should not get in the
way of workloads that don't need/want it.  The requirements and
usability considerations for virtualization are not always the same as
for traditional emulation use cases.

-Scott
Blue Swirl - Sept. 27, 2011, 6:47 p.m.
On Tue, Sep 27, 2011 at 5:58 PM, Scott Wood <scottwood@freescale.com> wrote:
> On 09/27/2011 12:03 PM, Alexander Graf wrote:
>> On 27.09.2011, at 18:53, Blue Swirl wrote:
>>> On Tue, Sep 27, 2011 at 3:59 PM, Alexander Graf <agraf@suse.de> wrote:
>>>> On 27.09.2011, at 17:50, Blue Swirl wrote:
>>>>> On Mon, Sep 26, 2011 at 11:19 PM, Scott Wood <scottwood@freescale.com> wrote:
>>>>>> I think the current approach is more usable for most purposes.  If you
>>>>>> start U-Boot instead of a kernel, how do you pass information on from the
>>>>>> user (kernel, rfs, etc)?  Require the user to create flash images[1]?
>>>>>
>>>>> No, for example OpenBIOS gets the kernel command line from fw_cfg device.
>
> Is that really so different from making QEMU act as an ePAPR-compliant
> loader?

The difference is that the fw_cfg interface is pretty simple and
private between QEMU and OpenBIOS, whereas magical HW to support the
ePAPR interface is public and exposed directly to the kernel. With the
ROM approach, the interface would exist between the ROM and the kernel
as intended.

> A big difference here is that on x86 you have a large quantity of
> runtime services provided by the BIOS, which I can certainly understand
> not wanting to shove into QEMU (though I'd still think you'd want to
> hack out the initialization parts of the BIOS that depend on special
> cache behavior, RAM controllers, and such, if at all practical).  This
> spin table stuff is the only runtime service provided by an ePAPR loader.

I think I've confused sPAPR (which has a lot of messy OF tree stuff) and ePAPR.

>>>>>> Maybe that's a useful mode of operation in some cases, but I don't think
>>>>>> we should be slavishly bound to it.  Think of the current approach as
>>>>>> something between whole-system and userspace emulation.
>>>>>
>>>>> This is similar to ARM, M68k and Xtensa semi-hosting mode, but not at
>>>>> kernel level but lower. Perhaps this mode should be enabled with
>>>>> -semihosting flag or a new flag. Then the bare metal version could be
>>>>> run without the flag.
>>>>
>>>> and then we'd have 2 implementations for running in system emulation mode and need to maintain both. I don't think that scales very well.
>>>
>>> No, but such hacks are not common.
>
> How much would actually need to be duplicated?  Seems like other than
> turning some features off (a smaller set of devices, and not needing
> extra address translation/cache layers), you'd just have a different way
> of loading/starting the guest.  Most of that code would be common to all
> ePAPR targets (with a few HW-specific variants for the MMU setup portion).

That looks OK.

>>>> So how would you emulate cache lines with line locking on KVM?
>>>
>>> The cache would be a MMIO device which registers to handle all memory
>>> space. Configuring the cache controller changes how the device
>>> operates. Put this device between CPU and memory and other devices.
>>> Performance would probably be horrible, so CPU should disable the
>>> device automatically after some time.
>>
>> So how would you execute code on this region then? :)
>
> I think U-Boot only does this for data, not code.  But U-Boot will want
> to execute out of flash, and it will want to be able to move/resize the
> flash window using the localbus controller (can't treat it as a
> statically located rom image).

This is a different issue: QEMU has a limitation that it can't execute
code from an MMIO region. But flash is a bit different: on write it's
MMIO, but on read it behaves like RAM, so it should work.

> Easier would be to run a U-Boot that assumes RAM is already configured,
> which we support for booting from sources other than NOR flash (would be
> some minor tweakage to decouple it from those boot scenarios).  There'd
> still be a lot of random I/O that needs to be emulated, or avoided with
> changes to U-Boot.  And there'd still be the question of where the
> device tree comes from, how information gets passed on from qemu, etc.

I'd use fw_cfg for passing the device tree to the ROM.

> Full system emulation at that level would be an interesting project and
> have its uses, but it would be a lot of work and should not get in the
> way of workloads that don't need/want it.  The requirements and
> usability considerations for virtualization are not always the same as
> for traditional emulation use cases.

Yes and it's also possible to emulate different interfaces of a
system. But from QEMU code maintenance point of view, semi-hosting
approaches should not get in the way of system emulation. If they are
optional and do not affect the rest of the system, they are OK.

On the other hand, maybe it's a problem with QEMU architecture, the
design is very much tuned to support HW emulation and not various
software interfaces.

Patch

diff --git a/Makefile.target b/Makefile.target
index 2ed9099..3f689ce 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -247,7 +247,7 @@  endif
 obj-ppc-y += ppc4xx_devs.o ppc4xx_pci.o ppc405_uc.o ppc405_boards.o
 obj-ppc-y += ppc440.o ppc440_bamboo.o
 # PowerPC E500 boards
-obj-ppc-y += ppce500_mpc8544ds.o mpc8544_guts.o
+obj-ppc-y += ppce500_mpc8544ds.o mpc8544_guts.o ppce500_spin.o
 # PowerPC 440 Xilinx ML507 reference board.
 obj-ppc-y += virtex_ml507.o
 obj-ppc-$(CONFIG_KVM) += kvm_ppc.o
diff --git a/hw/ppce500_mpc8544ds.c b/hw/ppce500_mpc8544ds.c
index 9379624..3b8b449 100644
--- a/hw/ppce500_mpc8544ds.c
+++ b/hw/ppce500_mpc8544ds.c
@@ -49,6 +49,7 @@ 
 #define MPC8544_PCI_IO             0xE1000000
 #define MPC8544_PCI_IOLEN          0x10000
 #define MPC8544_UTIL_BASE          (MPC8544_CCSRBAR_BASE + 0xe0000)
+#define MPC8544_SPIN_BASE          0xEF000000
 
 struct boot_info
 {
@@ -164,6 +165,18 @@  static void mmubooke_create_initial_mapping(CPUState *env,
     tlb->mas7_3 |= MAS3_UR | MAS3_UW | MAS3_UX | MAS3_SR | MAS3_SW | MAS3_SX;
 }
 
+/* Reset handler for secondary (non-boot) CPUs: reset the core, then park
+   it halted until the spin-table device (e500-spin) wakes it up. */
+static void mpc8544ds_cpu_reset_sec(void *opaque)
+{
+    CPUState *env = opaque;
+
+    cpu_reset(env);
+
+    /* Secondary CPU starts in halted state for now. Needs to change when
+       implementing non-kernel boot. */
+    env->halted = 1;
+    env->exception_index = EXCP_HLT;
+}
+
 static void mpc8544ds_cpu_reset(void *opaque)
 {
     CPUState *env = opaque;
@@ -172,6 +185,7 @@  static void mpc8544ds_cpu_reset(void *opaque)
     cpu_reset(env);
 
     /* Set initial guest state. */
+    env->halted = 0;
     env->gpr[1] = (16<<20) - 8;
     env->gpr[3] = bi->dt_base;
     env->nip = bi->entry;
@@ -199,7 +213,6 @@  static void mpc8544ds_init(ram_addr_t ram_size,
     unsigned int pci_irq_nrs[4] = {1, 2, 3, 4};
     qemu_irq **irqs, *mpic;
     DeviceState *dev;
-    struct boot_info *boot_info;
     CPUState *firstenv = NULL;
 
     /* Setup CPUs */
@@ -234,9 +247,16 @@  static void mpc8544ds_init(ram_addr_t ram_size,
         env->spr[SPR_40x_TCR] = 1 << 26;
 
         /* Register reset handler */
-        boot_info = g_malloc0(sizeof(struct boot_info));
-        qemu_register_reset(mpc8544ds_cpu_reset, env);
-        env->load_info = boot_info;
+        if (!i) {
+            /* Primary CPU */
+            struct boot_info *boot_info;
+            boot_info = g_malloc0(sizeof(struct boot_info));
+            qemu_register_reset(mpc8544ds_cpu_reset, env);
+            env->load_info = boot_info;
+        } else {
+            /* Secondary CPUs */
+            qemu_register_reset(mpc8544ds_cpu_reset_sec, env);
+        }
     }
 
     env = firstenv;
@@ -289,6 +309,9 @@  static void mpc8544ds_init(ram_addr_t ram_size,
         }
     }
 
+    /* Register spinning region */
+    sysbus_create_simple("e500-spin", MPC8544_SPIN_BASE, NULL);
+
     /* Load kernel. */
     if (kernel_filename) {
         kernel_size = load_uimage(kernel_filename, &entry, &loadaddr, NULL);
@@ -321,6 +344,8 @@  static void mpc8544ds_init(ram_addr_t ram_size,
 
     /* If we're loading a kernel directly, we must load the device tree too. */
     if (kernel_filename) {
+        struct boot_info *boot_info;
+
 #ifndef CONFIG_FDT
         cpu_abort(env, "Compiled without FDT support - can't load kernel\n");
 #endif
diff --git a/hw/ppce500_spin.c b/hw/ppce500_spin.c
new file mode 100644
index 0000000..38451ac
--- /dev/null
+++ b/hw/ppce500_spin.c
@@ -0,0 +1,186 @@ 
+#include "hw.h"
+#include "sysemu.h"
+#include "sysbus.h"
+#include "kvm.h"
+
+#define MAX_CPUS 32
+
+/* One guest-visible spin-table entry per CPU.  Packed with fixed-width
+   fields because the layout is shared with the guest; NOTE(review):
+   presumably this matches the ePAPR spin-table layout -- confirm. */
+typedef struct spin_info {
+    uint64_t addr;     /* entry point; low bit set means "keep spinning" */
+    uint64_t r3;       /* value delivered to the woken CPU in GPR r3 */
+    uint32_t resv;
+    uint32_t pir;      /* processor ID (SPR_PIR) of the waiting CPU */
+    uint64_t reserved;
+} __attribute__ ((packed)) SpinInfo;
+
+/* Device state: the MMIO-exported table of spin entries. */
+typedef struct spin_state {
+    SysBusDevice busdev;
+    MemoryRegion iomem;
+    SpinInfo spin[MAX_CPUS];
+} SpinState;
+
+/* Argument bundle for the run_on_cpu() kick callback. */
+typedef struct spin_kick {
+    CPUState *env;
+    SpinInfo *spin;
+} SpinKick;
+
+/* System reset: mark every slot as "still spinning" (addr low bit set)
+   and preload pir/r3 with the slot's CPU index. */
+static void spin_reset(void *opaque)
+{
+    SpinState *s = opaque;
+    int i;
+
+    for (i = 0; i < MAX_CPUS; i++) {
+        SpinInfo *info = &s->spin[i];
+
+        info->pir = i;
+        info->r3 = i;
+        info->addr = 1;
+    }
+}
+
+/* Convert a power-of-two mapping size into the BookE206 MAS1 TSIZE
+   encoding (log4 of the size in KiB).  NOTE(review): ffs() takes an int,
+   so sizes >= 4 GiB would be mis-encoded; the only caller here passes
+   64 MiB. */
+static inline target_phys_addr_t booke206_page_size_to_tlb(uint64_t size)
+{
+    return (ffs(size >> 10) - 1) >> 1;
+}
+
+/* Install one valid mapping in the second TLB1 entry: EA [va, va+len)
+   translated to PA [pa, pa+len), memory-coherent (MAS2_M), with full
+   user and supervisor RWX permissions.  len must be a power of two. */
+static void mmubooke_create_initial_mapping(CPUState *env,
+                                     target_ulong va,
+                                     target_phys_addr_t pa,
+                                     target_phys_addr_t len)
+{
+    ppcmas_tlb_t *tlb = booke206_get_tlbm(env, 1, 0, 1);
+    target_phys_addr_t size;
+
+    size = (booke206_page_size_to_tlb(len) << MAS1_TSIZE_SHIFT);
+    tlb->mas1 = MAS1_VALID | size;
+    tlb->mas2 = (va & TARGET_PAGE_MASK) | MAS2_M;
+    tlb->mas7_3 = pa & TARGET_PAGE_MASK;
+    tlb->mas7_3 |= MAS3_UR | MAS3_UW | MAS3_UX | MAS3_SR | MAS3_SW | MAS3_SX;
+}
+
+/* run_on_cpu() callback: release one secondary CPU from its spin loop.
+   Publishes the CPU's PIR back into the table, loads nip/r3 from the
+   guest-written entry, clears the other argument registers (r7 carries
+   the size of the initial mapping), installs a 64 MiB TLB mapping at
+   EA 0 covering the entry point, then unhalts and kicks the CPU.
+   NOTE(review): register/mapping setup presumably follows the ePAPR
+   CPU release protocol -- confirm against the spec. */
+static void spin_kick(void *data)
+{
+    SpinKick *kick = data;
+    CPUState *env = kick->env;
+    SpinInfo *curspin = kick->spin;
+    target_phys_addr_t map_size = 64 * 1024 * 1024;
+    target_phys_addr_t map_start;
+
+    cpu_synchronize_state(env);
+    stl_p(&curspin->pir, env->spr[SPR_PIR]);
+    /* nip is the entry point's offset within the 64 MiB window mapped
+       below (the window base goes into the TLB, EA 0). */
+    env->nip = ldq_p(&curspin->addr) & (map_size - 1);
+    env->gpr[3] = ldq_p(&curspin->r3);
+    env->gpr[4] = 0;
+    env->gpr[5] = 0;
+    env->gpr[6] = 0;
+    env->gpr[7] = map_size;
+    env->gpr[8] = 0;
+    env->gpr[9] = 0;
+
+    map_start = ldq_p(&curspin->addr) & ~(map_size - 1);
+    mmubooke_create_initial_mapping(env, 0, map_start, map_size);
+
+    env->halted = 0;
+    env->exception_index = -1;
+    qemu_cpu_kick(env);
+}
+
+/* MMIO write handler for the spin table.  The write is stored into the
+   addressed CPU's SpinInfo slot (byte offsets map 1:1 onto the packed
+   struct); once the guest clears the low bit of that slot's 'addr'
+   field, the CPU is released via spin_kick().  Writes to slots of
+   unknown CPUs or of the primary CPU are silently ignored. */
+static void spin_write(void *opaque, target_phys_addr_t addr, uint64_t value,
+                       unsigned len)
+{
+    SpinState *s = opaque;
+    int env_idx = addr / sizeof(SpinInfo);
+    CPUState *env;
+    SpinInfo *curspin = &s->spin[env_idx];
+    uint8_t *curspin_p = (uint8_t*)curspin;
+
+    /* Find the CPU whose index matches the addressed table slot. */
+    for (env = first_cpu; env != NULL; env = env->next_cpu) {
+        if (env->cpu_index == env_idx) {
+            break;
+        }
+    }
+
+    if (!env) {
+        /* Unknown CPU */
+        return;
+    }
+
+    if (!env->cpu_index) {
+        /* primary CPU doesn't spin */
+        return;
+    }
+
+    curspin_p = &curspin_p[addr % sizeof(SpinInfo)];
+    switch (len) {
+    case 1:
+        stb_p(curspin_p, value);
+        break;
+    case 2:
+        stw_p(curspin_p, value);
+        break;
+    case 4:
+        stl_p(curspin_p, value);
+        break;
+    }
+
+    if (!(ldq_p(&curspin->addr) & 1)) {
+        /* run CPU */
+        /* NOTE(review): 'kick' is a stack local handed to run_on_cpu();
+           this assumes the callback completes before this frame returns
+           -- confirm run_on_cpu() semantics. */
+        SpinKick kick = {
+            .env = env,
+            .spin = curspin,
+        };
+
+        run_on_cpu(env, spin_kick, &kick);
+    }
+}
+
+/* MMIO read handler: return the requested bytes straight out of the
+   table backing store. */
+static uint64_t spin_read(void *opaque, target_phys_addr_t addr, unsigned len)
+{
+    SpinState *s = opaque;
+    uint8_t *spin_p = &((uint8_t*)s->spin)[addr];
+
+    switch (len) {
+    case 1:
+        return ldub_p(spin_p);
+    case 2:
+        return lduw_p(spin_p);
+    case 4:
+        return ldl_p(spin_p);
+    /* NOTE(review): no 8-byte case, and with NDEBUG the assert below
+       compiles out, falling off the end of a non-void function (UB). */
+    default:
+        assert(0);
+    }
+}
+
+/* Guest accesses are big-endian, matching the e500 core. */
+static const MemoryRegionOps spin_rw_ops = {
+    .read = spin_read,
+    .write = spin_write,
+    .endianness = DEVICE_BIG_ENDIAN,
+};
+
+/* qdev init: expose the spin table (one SpinInfo per possible CPU) as a
+   sysbus MMIO region and register the reset handler that re-arms it. */
+static int ppce500_spin_initfn(SysBusDevice *dev)
+{
+    SpinState *s;
+
+    s = FROM_SYSBUS(SpinState, sysbus_from_qdev(dev));
+
+    memory_region_init_io(&s->iomem, &spin_rw_ops, s, "e500 spin pv device",
+                          sizeof(SpinInfo) * MAX_CPUS);
+    sysbus_init_mmio_region(dev, &s->iomem);
+
+    qemu_register_reset(spin_reset, s);
+
+    return 0;
+}
+
+/* Registration boilerplate for the "e500-spin" sysbus device
+   (instantiated by the board at MPC8544_SPIN_BASE). */
+static SysBusDeviceInfo ppce500_spin_info = {
+    .init         = ppce500_spin_initfn,
+    .qdev.name    = "e500-spin",
+    .qdev.size    = sizeof(SpinState),
+};
+
+static void ppce500_spin_register(void)
+{
+    sysbus_register_withprop(&ppce500_spin_info);
+}
+device_init(ppce500_spin_register);