diff mbox

[15/15] Implement the bus structure for PAPR virtual IO

Message ID 1297522467-5975-16-git-send-email-david@gibson.dropbear.id.au
State New
Headers show

Commit Message

David Gibson Feb. 12, 2011, 2:54 p.m. UTC
This extends the "pseries" (PAPR) machine to include a virtual IO bus
supporting the PAPR defined hypercall based virtual IO mechanisms.

So far only one VIO device is provided, the vty / vterm, providing
a full console (polled only, for now).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
---
 Makefile.target  |    3 +-
 hw/spapr.c       |   31 +++++++++-
 hw/spapr.h       |   10 +++
 hw/spapr_hcall.c |   19 ++----
 hw/spapr_vio.c   |  191 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/spapr_vio.h   |   49 ++++++++++++++
 hw/spapr_vty.c   |  132 +++++++++++++++++++++++++++++++++++++
 7 files changed, 419 insertions(+), 16 deletions(-)
 create mode 100644 hw/spapr_vio.c
 create mode 100644 hw/spapr_vio.h
 create mode 100644 hw/spapr_vty.c

Comments

Alexander Graf Feb. 12, 2011, 4:47 p.m. UTC | #1
On 12.02.2011, at 15:54, David Gibson wrote:

> This extends the "pseries" (PAPR) machine to include a virtual IO bus
> supporting the PAPR defined hypercall based virtual IO mechanisms.
> 
> So far only one VIO device is provided, the vty / vterm, providing
> a full console (polled only, for now).
> 
> Signed-off-by: David Gibson <dwg@au1.ibm.com>
> ---
> Makefile.target  |    3 +-
> hw/spapr.c       |   31 +++++++++-
> hw/spapr.h       |   10 +++
> hw/spapr_hcall.c |   19 ++----
> hw/spapr_vio.c   |  191 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> hw/spapr_vio.h   |   49 ++++++++++++++
> hw/spapr_vty.c   |  132 +++++++++++++++++++++++++++++++++++++
> 7 files changed, 419 insertions(+), 16 deletions(-)
> create mode 100644 hw/spapr_vio.c
> create mode 100644 hw/spapr_vio.h
> create mode 100644 hw/spapr_vty.c
> 
> diff --git a/Makefile.target b/Makefile.target
> index e0796ba..fe232da 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -232,7 +232,8 @@ obj-ppc-y += ppc_oldworld.o
> # NewWorld PowerMac
> obj-ppc-y += ppc_newworld.o
> # IBM pSeries (sPAPR)
> -obj-ppc-y += spapr.o spapr_hcall.o
> +obj-ppc-y += spapr.o spapr_hcall.o spapr_vio.o
> +obj-ppc-y += spapr_vty.o
> # PowerPC 4xx boards
> obj-ppc-y += ppc4xx_devs.o ppc4xx_pci.o ppc405_uc.o ppc405_boards.o
> obj-ppc-y += ppc440.o ppc440_bamboo.o
> diff --git a/hw/spapr.c b/hw/spapr.c
> index 8aca4e0..da61061 100644
> --- a/hw/spapr.c
> +++ b/hw/spapr.c
> @@ -37,6 +37,7 @@
> #include "net.h"
> #include "blockdev.h"
> #include "hw/spapr.h"
> +#include "hw/spapr_vio.h"
> 
> #include <libfdt.h>
> 
> @@ -49,6 +50,7 @@
> 
> static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>                               const char *cpu_model, CPUState *envs[],
> +                              sPAPREnvironment *spapr,
>                               target_phys_addr_t initrd_base,
>                               target_phys_addr_t initrd_size,
>                               const char *kernel_cmdline)
> @@ -59,6 +61,7 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
>     int i;
>     char *modelname;
> +    int ret;
> 
> #define _FDT(exp) \
>     do { \
> @@ -151,9 +154,28 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
> 
>     _FDT((fdt_end_node(fdt)));
> 
> +    /* vdevice */
> +    _FDT((fdt_begin_node(fdt, "vdevice")));
> +
> +    _FDT((fdt_property_string(fdt, "device_type", "vdevice")));
> +    _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
> +    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
> +    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
> +    
> +    _FDT((fdt_end_node(fdt)));
> +
>     _FDT((fdt_end_node(fdt))); /* close root node */
>     _FDT((fdt_finish(fdt)));
> 
> +    /* re-expand to allow for further tweaks */
> +    _FDT((fdt_open_into(fdt, fdt, FDT_MAX_SIZE)));
> +
> +    ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
> +    if (ret < 0)

Braces..

> +        fprintf(stderr, "couldn't setup vio devices in fdt\n");
> +
> +    _FDT((fdt_pack(fdt)));
> +
>     if (fdt_size)
>         *fdt_size = fdt_totalsize(fdt);
> 
> @@ -211,6 +233,12 @@ static void ppc_spapr_init (ram_addr_t ram_size,
>     ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", ram_size);
>     cpu_register_physical_memory(0, ram_size, ram_offset);
> 
> +    spapr->vio_bus = spapr_vio_bus_init();
> +
> +    for (i = 0; i < MAX_SERIAL_PORTS; i++)

Braces..

> +        if (serial_hds[i])

Braces..

> +            spapr_vty_create(spapr->vio_bus, i, serial_hds[i]);

There might be a qdev way to do this. Blue?

> +
>     if (kernel_filename) {
>         uint64_t lowaddr = 0;
> 
> @@ -242,7 +270,7 @@ static void ppc_spapr_init (ram_addr_t ram_size,
>         }
> 
>         /* load fdt */
> -        fdt = spapr_create_fdt(&fdt_size, ram_size, cpu_model, &env,
> +        fdt = spapr_create_fdt(&fdt_size, ram_size, cpu_model, &env, spapr,
>                                initrd_base, initrd_size,
>                                kernel_cmdline);
>         if (!fdt) {
> @@ -267,6 +295,7 @@ static QEMUMachine spapr_machine = {
>     .desc = "pSeries Logical Partition (PAPR compliant)",
>     .init = ppc_spapr_init,
>     .max_cpus = 1,
> +    .no_parallel = 1,

duplicate?

>     .no_vga = 1,
>     .no_parallel = 1,
> };
> diff --git a/hw/spapr.h b/hw/spapr.h
> index dae9617..168511f 100644
> --- a/hw/spapr.h
> +++ b/hw/spapr.h
> @@ -1,7 +1,10 @@
> #if !defined (__HW_SPAPR_H__)
> #define __HW_SPAPR_H__
> 
> +struct VIOsPAPRBus;
> +
> typedef struct sPAPREnvironment {
> +    struct VIOsPAPRBus *vio_bus;
> } sPAPREnvironment;
> 
> #define H_SUCCESS         0
> @@ -237,4 +240,11 @@ typedef struct sPAPREnvironment {
> target_ulong spapr_hypercall(CPUState *env, sPAPREnvironment *spapr,
>                              target_ulong token, target_ulong *args);
> 
> +target_ulong h_put_term_char(sPAPREnvironment *spapr,
> +                             target_ulong termno, target_ulong len,
> +                             target_ulong char0_7, target_ulong char8_15);
> +target_ulong h_get_term_char(sPAPREnvironment *spapr,
> +                             target_ulong termno, target_ulong *len,
> +                             target_ulong *char0_7, target_ulong *char8_15);
> +
> #endif /* !defined (__HW_SPAPR_H__) */
> diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c
> index c99c345..e2ed9cf 100644
> --- a/hw/spapr_hcall.c
> +++ b/hw/spapr_hcall.c
> @@ -3,19 +3,6 @@
> #include "qemu-char.h"
> #include "hw/spapr.h"
> 
> -static target_ulong h_put_term_char(target_ulong termno, target_ulong len,
> -                                    target_ulong char0_7, target_ulong char8_15)
> -{
> -    uint8_t buf[16];
> -
> -    *((uint64_t *)buf) = cpu_to_be64(char0_7);
> -    *((uint64_t *)buf + 1) = cpu_to_be64(char8_15);
> -
> -    qemu_chr_write(serial_hds[0], buf, len);
> -
> -    return 0;
> -}
> -
> target_ulong spapr_hypercall(CPUState *env, sPAPREnvironment *spapr,
>                              target_ulong token, target_ulong *args)
> {
> @@ -29,7 +16,11 @@ target_ulong spapr_hypercall(CPUState *env, sPAPREnvironment *spapr,
> 
>     switch (token) {
>     case H_PUT_TERM_CHAR:
> -        r = h_put_term_char(args[0], args[1], args[2], args[3]);
> +        r = h_put_term_char(spapr, args[0], args[1], args[2], args[3]);
> +        break;
> +
> +    case H_GET_TERM_CHAR:
> +        r = h_get_term_char(spapr, args[0], &args[0], &args[1], &args[2]);

Slick and simple. Blue, do you think there's some random abstraction layer necessary?

>         break;
> 
>     default:
> diff --git a/hw/spapr_vio.c b/hw/spapr_vio.c
> new file mode 100644
> index 0000000..d9c7292
> --- /dev/null
> +++ b/hw/spapr_vio.c
> @@ -0,0 +1,191 @@
> +/*
> + * QEMU sPAPR VIO code
> + *
> + * Copyright (c) 2010 David Gibson, IBM Corporation <david@gibson.dropbear.id.au>
> + * Based on the s390 virtio bus code:
> + * Copyright (c) 2009 Alexander Graf <agraf@suse.de>
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "hw.h"
> +#include "sysemu.h"
> +#include "boards.h"
> +#include "monitor.h"
> +#include "loader.h"
> +#include "elf.h"
> +#include "hw/sysbus.h"
> +#include "kvm.h"
> +#include "device_tree.h"
> +
> +#include "hw/spapr.h"
> +#include "hw/spapr_vio.h"
> +
> +#ifdef CONFIG_FDT
> +#include <libfdt.h>
> +#endif /* CONFIG_FDT */
> +
> +/* #define DEBUG_SPAPR */
> +
> +#ifdef DEBUG_SPAPR
> +#define dprintf(fmt, ...) \
> +    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define dprintf(fmt, ...) \
> +    do { } while (0)
> +#endif
> +
> +struct BusInfo spapr_vio_bus_info = {
> +    .name       = "spapr-vio",
> +    .size       = sizeof(VIOsPAPRBus),
> +};
> +
> +VIOsPAPRDevice *spapr_vio_find_by_reg(VIOsPAPRBus *bus, uint32_t reg)
> +{
> +    DeviceState *qdev;
> +    VIOsPAPRDevice *dev = NULL;
> +
> +    QLIST_FOREACH(qdev, &bus->bus.children, sibling) {
> +        dev = (VIOsPAPRDevice *)qdev;
> +        if (dev->reg == reg)

Braces

> +            break;
> +    }
> +
> +    return dev;

What if the device doesn't exist?

> +}
> +
> +VIOsPAPRBus *spapr_vio_bus_init(void)
> +{
> +    VIOsPAPRBus *bus;
> +    BusState *_bus;
> +    DeviceState *dev;
> +
> +    /* Create bridge device */
> +    dev = qdev_create(NULL, "spapr-vio-bridge");
> +    qdev_init_nofail(dev);
> +
> +    /* Create bus on bridge device */
> +
> +    _bus = qbus_create(&spapr_vio_bus_info, dev, "spapr-vio");
> +    bus = DO_UPCAST(VIOsPAPRBus, bus, _bus);
> +
> +    return bus;
> +}
> +
> +#ifdef CONFIG_FDT
> +static int vio_make_devnode(VIOsPAPRDevice *dev,
> +                            void *fdt)
> +{
> +    VIOsPAPRDeviceInfo *info = (VIOsPAPRDeviceInfo *)dev->qdev.info;
> +    int vdevice_off, node_off;
> +    int ret;
> +
> +    vdevice_off = fdt_path_offset(fdt, "/vdevice");
> +    if (vdevice_off < 0)

Braces

> +        return vdevice_off;
> +
> +    node_off = fdt_add_subnode(fdt, vdevice_off, dev->qdev.id);
> +    if (node_off < 0)

Braces

> +        return node_off;
> +
> +    ret = fdt_setprop_cell(fdt, node_off, "reg", dev->reg);
> +    if (ret < 0)

Braces

> +        return ret;
> +
> +    if (info->dt_type) {
> +        ret = fdt_setprop_string(fdt, node_off, "device_type",
> +                                 info->dt_type);
> +        if (ret < 0)

Braces

I'll stop complaining about braces now. Please go through the patch yourself and just fix them up :)

> +            return ret;
> +    }
> +
> +    if (info->dt_compatible) {
> +        ret = fdt_setprop_string(fdt, node_off, "compatible",
> +                                 info->dt_compatible);
> +        if (ret < 0)
> +            return ret;
> +    }
> +
> +    if (info->devnode) {
> +        ret = (info->devnode)(dev, fdt, node_off);
> +        if (ret < 0)
> +            return ret;
> +    }
> +
> +    return node_off;
> +}
> +#endif /* CONFIG_FDT */
> +
> +static int spapr_vio_busdev_init(DeviceState *dev, DeviceInfo *info)
> +{
> +    VIOsPAPRDeviceInfo *_info = (VIOsPAPRDeviceInfo *)info;
> +    VIOsPAPRDevice *_dev = (VIOsPAPRDevice *)dev;
> +    char *id;
> +
> +    if (asprintf(&id, "%s@%x", _info->dt_name, _dev->reg) < 0)
> +        return -1;
> +
> +    _dev->qdev.id = id;
> +
> +    return _info->init(_dev);
> +}
> +
> +void spapr_vio_bus_register_withprop(VIOsPAPRDeviceInfo *info)
> +{
> +    info->qdev.init = spapr_vio_busdev_init;
> +    info->qdev.bus_info = &spapr_vio_bus_info;
> +
> +    assert(info->qdev.size >= sizeof(VIOsPAPRDevice));
> +    qdev_register(&info->qdev);
> +}
> +
> +static int spapr_vio_bridge_init(SysBusDevice *dev)
> +{
> +    /* nothing */
> +    return 0;
> +}
> +
> +static SysBusDeviceInfo spapr_vio_bridge_info = {
> +    .init = spapr_vio_bridge_init,
> +    .qdev.name  = "spapr-vio-bridge",
> +    .qdev.size  = sizeof(SysBusDevice),
> +    .qdev.no_user = 1,
> +};
> +
> +static void spapr_vio_register_devices(void)
> +{
> +    sysbus_register_withprop(&spapr_vio_bridge_info);
> +}
> +
> +device_init(spapr_vio_register_devices)
> +
> +#ifdef CONFIG_FDT
> +
> +int spapr_populate_vdevice(VIOsPAPRBus *bus, void *fdt)
> +{
> +    DeviceState *qdev;
> +    int ret = 0;
> +
> +    QLIST_FOREACH(qdev, &bus->bus.children, sibling) {
> +        VIOsPAPRDevice *dev = (VIOsPAPRDevice *)qdev;
> +
> +        ret = vio_make_devnode(dev, fdt);
> +
> +        if (ret < 0)
> +            return ret;
> +    }
> +    
> +    return 0;
> +}
> +#endif /* CONFIG_FDT */
> diff --git a/hw/spapr_vio.h b/hw/spapr_vio.h
> new file mode 100644
> index 0000000..fb5e301
> --- /dev/null
> +++ b/hw/spapr_vio.h
> @@ -0,0 +1,49 @@
> +#ifndef _HW_SPAPR_VIO_H
> +#define _HW_SPAPR_VIO_H
> +/*
> + * QEMU sPAPR VIO bus definitions
> + *
> + * Copyright (c) 2010 David Gibson, IBM Corporation <david@gibson.dropbear.id.au>
> + * Based on the s390 virtio bus definitions:
> + * Copyright (c) 2009 Alexander Graf <agraf@suse.de>
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +typedef struct VIOsPAPRDevice {
> +    DeviceState qdev;
> +    uint32_t reg;
> +} VIOsPAPRDevice;
> +
> +typedef struct VIOsPAPRBus {
> +    BusState bus;
> +} VIOsPAPRBus;
> +
> +typedef struct {
> +    DeviceInfo qdev;
> +    const char *dt_name, *dt_type, *dt_compatible;
> +    int (*init)(VIOsPAPRDevice *dev);
> +    int (*devnode)(VIOsPAPRDevice *dev, void *fdt, int node_off);
> +} VIOsPAPRDeviceInfo;
> +
> +extern VIOsPAPRBus *spapr_vio_bus_init(void);
> +extern VIOsPAPRDevice *spapr_vio_find_by_reg(VIOsPAPRBus *bus, uint32_t reg);
> +extern void spapr_vio_bus_register_withprop(VIOsPAPRDeviceInfo *info);
> +extern int spapr_populate_vdevice(VIOsPAPRBus *bus, void *fdt);
> +
> +void vty_putchars(VIOsPAPRDevice *sdev, uint8_t *buf, int len);
> +void spapr_vty_create(VIOsPAPRBus *bus,
> +                      uint32_t reg, CharDriverState *chardev);
> +
> +#endif /* _HW_SPAPR_VIO_H */
> diff --git a/hw/spapr_vty.c b/hw/spapr_vty.c
> new file mode 100644
> index 0000000..9a2dc0b
> --- /dev/null
> +++ b/hw/spapr_vty.c
> @@ -0,0 +1,132 @@
> +#include "qdev.h"
> +#include "qemu-char.h"
> +#include "hw/spapr.h"
> +#include "hw/spapr_vio.h"
> +
> +#define VTERM_BUFSIZE   16
> +
> +typedef struct VIOsPAPRVTYDevice {
> +    VIOsPAPRDevice sdev;
> +    CharDriverState *chardev;
> +    uint32_t in, out;
> +    uint8_t buf[VTERM_BUFSIZE];
> +} VIOsPAPRVTYDevice;
> +
> +static int vty_can_receive(void *opaque)
> +{
> +    VIOsPAPRVTYDevice *dev = (VIOsPAPRVTYDevice *)opaque;
> +
> +    return (dev->in - dev->out) < VTERM_BUFSIZE;
> +}
> +
> +static void vty_receive(void *opaque, const uint8_t *buf, int size)
> +{
> +    VIOsPAPRVTYDevice *dev = (VIOsPAPRVTYDevice *)opaque;
> +    int i;
> +
> +    for (i = 0; i < size; i++) {
> +        assert((dev->in - dev->out) < VTERM_BUFSIZE);
> +        dev->buf[dev->in++ % VTERM_BUFSIZE] = buf[i];
> +    }
> +}
> +
> +static int vty_getchars(VIOsPAPRDevice *sdev, uint8_t *buf, int max)
> +{
> +    VIOsPAPRVTYDevice *dev = (VIOsPAPRVTYDevice *)sdev;
> +    int n = 0;
> +
> +    while ((n < max) && (dev->out != dev->in))
> +        buf[n++] = dev->buf[dev->out++ % VTERM_BUFSIZE];
> +
> +    return n;
> +}
> +
> +void vty_putchars(VIOsPAPRDevice *sdev, uint8_t *buf, int len)
> +{
> +    VIOsPAPRVTYDevice *dev = (VIOsPAPRVTYDevice *)sdev;
> +
> +    /* FIXME: should check the qemu_chr_write() return value */
> +    qemu_chr_write(dev->chardev, buf, len);
> +}
> +
> +static int spapr_vty_init(VIOsPAPRDevice *sdev)
> +{
> +    VIOsPAPRVTYDevice *dev = (VIOsPAPRVTYDevice *)sdev;
> +
> +    qemu_chr_add_handlers(dev->chardev, vty_can_receive,
> +                          vty_receive, NULL, dev);
> +
> +    return 0;
> +}
> +
> +target_ulong h_put_term_char(sPAPREnvironment *spapr,
> +                             target_ulong termno, target_ulong len,
> +                             target_ulong char0_7, target_ulong char8_15)
> +{
> +    VIOsPAPRDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, termno);
> +    uint8_t buf[16];
> +
> +    if (!sdev)
> +        return H_PARAMETER;
> +
> +    if (len > 16)
> +        return H_PARAMETER;
> +
> +    *((uint64_t *)buf) = cpu_to_be64(char0_7);
> +    *((uint64_t *)buf + 1) = cpu_to_be64(char8_15);
> +
> +    vty_putchars(sdev, buf, len);
> +
> +    return 0;
> +}
> +
> +target_ulong h_get_term_char(sPAPREnvironment *spapr,
> +                             target_ulong termno, target_ulong *len,
> +                             target_ulong *char0_7, target_ulong *char8_15)
> +{
> +    VIOsPAPRDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, termno);
> +    uint8_t buf[16];
> +
> +    if (!sdev)
> +        return H_PARAMETER;
> +
> +    *len = vty_getchars(sdev, buf, sizeof(buf));
> +    if (*len < 16)
> +        memset(buf + *len, 0, 16 - *len);
> +
> +    *char0_7 = be64_to_cpu(*((uint64_t *)buf));
> +    *char8_15 = be64_to_cpu(*((uint64_t *)buf + 1));
> +
> +    return H_SUCCESS;
> +}
> +
> +void spapr_vty_create(VIOsPAPRBus *bus,
> +                      uint32_t reg, CharDriverState *chardev)
> +{
> +    DeviceState *dev;
> +
> +    dev = qdev_create(&bus->bus, "spapr-vty");
> +    qdev_prop_set_uint32(dev, "reg", reg);
> +    qdev_prop_set_chr(dev, "chardev", chardev);
> +    qdev_init_nofail(dev);
> +}
> +
> +static VIOsPAPRDeviceInfo spapr_vty = {
> +    .init = spapr_vty_init,
> +    .dt_name = "vty",
> +    .dt_type = "serial",
> +    .dt_compatible = "hvterm1",
> +    .qdev.name = "spapr-vty",
> +    .qdev.size = sizeof(VIOsPAPRVTYDevice),
> +    .qdev.props = (Property[]) {
> +        DEFINE_PROP_UINT32("reg", VIOsPAPRDevice, reg, 0),
> +        DEFINE_PROP_CHR("chardev", VIOsPAPRVTYDevice, chardev),
> +        DEFINE_PROP_END_OF_LIST(),
> +    },
> +};
> +
> +static void spapr_vty_register(void)
> +{
> +    spapr_vio_bus_register_withprop(&spapr_vty);
> +}
> +device_init(spapr_vty_register);
> -- 
> 1.7.1
> 

Alex
Blue Swirl Feb. 12, 2011, 4:59 p.m. UTC | #2
On Sat, Feb 12, 2011 at 6:47 PM, Alexander Graf <agraf@suse.de> wrote:
>
> On 12.02.2011, at 15:54, David Gibson wrote:
>
>> This extends the "pseries" (PAPR) machine to include a virtual IO bus
>> supporting the PAPR defined hypercall based virtual IO mechanisms.
>>
>> So far only one VIO device is provided, the vty / vterm, providing
>> a full console (polled only, for now).
>>
>> Signed-off-by: David Gibson <dwg@au1.ibm.com>
>> ---
>> Makefile.target  |    3 +-
>> hw/spapr.c       |   31 +++++++++-
>> hw/spapr.h       |   10 +++
>> hw/spapr_hcall.c |   19 ++----
>> hw/spapr_vio.c   |  191 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> hw/spapr_vio.h   |   49 ++++++++++++++
>> hw/spapr_vty.c   |  132 +++++++++++++++++++++++++++++++++++++
>> 7 files changed, 419 insertions(+), 16 deletions(-)
>> create mode 100644 hw/spapr_vio.c
>> create mode 100644 hw/spapr_vio.h
>> create mode 100644 hw/spapr_vty.c
>>
>> diff --git a/Makefile.target b/Makefile.target
>> index e0796ba..fe232da 100644
>> --- a/Makefile.target
>> +++ b/Makefile.target
>> @@ -232,7 +232,8 @@ obj-ppc-y += ppc_oldworld.o
>> # NewWorld PowerMac
>> obj-ppc-y += ppc_newworld.o
>> # IBM pSeries (sPAPR)
>> -obj-ppc-y += spapr.o spapr_hcall.o
>> +obj-ppc-y += spapr.o spapr_hcall.o spapr_vio.o
>> +obj-ppc-y += spapr_vty.o
>> # PowerPC 4xx boards
>> obj-ppc-y += ppc4xx_devs.o ppc4xx_pci.o ppc405_uc.o ppc405_boards.o
>> obj-ppc-y += ppc440.o ppc440_bamboo.o
>> diff --git a/hw/spapr.c b/hw/spapr.c
>> index 8aca4e0..da61061 100644
>> --- a/hw/spapr.c
>> +++ b/hw/spapr.c
>> @@ -37,6 +37,7 @@
>> #include "net.h"
>> #include "blockdev.h"
>> #include "hw/spapr.h"
>> +#include "hw/spapr_vio.h"
>>
>> #include <libfdt.h>
>>
>> @@ -49,6 +50,7 @@
>>
>> static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>>                               const char *cpu_model, CPUState *envs[],
>> +                              sPAPREnvironment *spapr,
>>                               target_phys_addr_t initrd_base,
>>                               target_phys_addr_t initrd_size,
>>                               const char *kernel_cmdline)
>> @@ -59,6 +61,7 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>>     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
>>     int i;
>>     char *modelname;
>> +    int ret;
>>
>> #define _FDT(exp) \
>>     do { \
>> @@ -151,9 +154,28 @@ static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
>>
>>     _FDT((fdt_end_node(fdt)));
>>
>> +    /* vdevice */
>> +    _FDT((fdt_begin_node(fdt, "vdevice")));
>> +
>> +    _FDT((fdt_property_string(fdt, "device_type", "vdevice")));
>> +    _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
>> +    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
>> +    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
>> +
>> +    _FDT((fdt_end_node(fdt)));
>> +
>>     _FDT((fdt_end_node(fdt))); /* close root node */
>>     _FDT((fdt_finish(fdt)));
>>
>> +    /* re-expand to allow for further tweaks */
>> +    _FDT((fdt_open_into(fdt, fdt, FDT_MAX_SIZE)));
>> +
>> +    ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
>> +    if (ret < 0)
>
> Braces..
>
>> +        fprintf(stderr, "couldn't setup vio devices in fdt\n");
>> +
>> +    _FDT((fdt_pack(fdt)));
>> +
>>     if (fdt_size)
>>         *fdt_size = fdt_totalsize(fdt);
>>
>> @@ -211,6 +233,12 @@ static void ppc_spapr_init (ram_addr_t ram_size,
>>     ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", ram_size);
>>     cpu_register_physical_memory(0, ram_size, ram_offset);
>>
>> +    spapr->vio_bus = spapr_vio_bus_init();
>> +
>> +    for (i = 0; i < MAX_SERIAL_PORTS; i++)
>
> Braces..
>
>> +        if (serial_hds[i])
>
> Braces..
>
>> +            spapr_vty_create(spapr->vio_bus, i, serial_hds[i]);
>
> There might be a qdev way to do this. Blue?

Actually I don't quite understand the need for vty layer, why not use
the chardev here directly?

>
>> +
>>     if (kernel_filename) {
>>         uint64_t lowaddr = 0;
>>
>> @@ -242,7 +270,7 @@ static void ppc_spapr_init (ram_addr_t ram_size,
>>         }
>>
>>         /* load fdt */
>> -        fdt = spapr_create_fdt(&fdt_size, ram_size, cpu_model, &env,
>> +        fdt = spapr_create_fdt(&fdt_size, ram_size, cpu_model, &env, spapr,
>>                                initrd_base, initrd_size,
>>                                kernel_cmdline);
>>         if (!fdt) {
>> @@ -267,6 +295,7 @@ static QEMUMachine spapr_machine = {
>>     .desc = "pSeries Logical Partition (PAPR compliant)",
>>     .init = ppc_spapr_init,
>>     .max_cpus = 1,
>> +    .no_parallel = 1,
>
> duplicate?
>
>>     .no_vga = 1,
>>     .no_parallel = 1,
>> };
>> diff --git a/hw/spapr.h b/hw/spapr.h
>> index dae9617..168511f 100644
>> --- a/hw/spapr.h
>> +++ b/hw/spapr.h
>> @@ -1,7 +1,10 @@
>> #if !defined (__HW_SPAPR_H__)
>> #define __HW_SPAPR_H__
>>
>> +struct VIOsPAPRBus;
>> +
>> typedef struct sPAPREnvironment {
>> +    struct VIOsPAPRBus *vio_bus;
>> } sPAPREnvironment;
>>
>> #define H_SUCCESS         0
>> @@ -237,4 +240,11 @@ typedef struct sPAPREnvironment {
>> target_ulong spapr_hypercall(CPUState *env, sPAPREnvironment *spapr,
>>                              target_ulong token, target_ulong *args);
>>
>> +target_ulong h_put_term_char(sPAPREnvironment *spapr,
>> +                             target_ulong termno, target_ulong len,
>> +                             target_ulong char0_7, target_ulong char8_15);
>> +target_ulong h_get_term_char(sPAPREnvironment *spapr,
>> +                             target_ulong termno, target_ulong *len,
>> +                             target_ulong *char0_7, target_ulong *char8_15);
>> +
>> #endif /* !defined (__HW_SPAPR_H__) */
>> diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c
>> index c99c345..e2ed9cf 100644
>> --- a/hw/spapr_hcall.c
>> +++ b/hw/spapr_hcall.c
>> @@ -3,19 +3,6 @@
>> #include "qemu-char.h"
>> #include "hw/spapr.h"
>>
>> -static target_ulong h_put_term_char(target_ulong termno, target_ulong len,
>> -                                    target_ulong char0_7, target_ulong char8_15)
>> -{
>> -    uint8_t buf[16];
>> -
>> -    *((uint64_t *)buf) = cpu_to_be64(char0_7);
>> -    *((uint64_t *)buf + 1) = cpu_to_be64(char8_15);
>> -
>> -    qemu_chr_write(serial_hds[0], buf, len);
>> -
>> -    return 0;
>> -}
>> -
>> target_ulong spapr_hypercall(CPUState *env, sPAPREnvironment *spapr,
>>                              target_ulong token, target_ulong *args)
>> {
>> @@ -29,7 +16,11 @@ target_ulong spapr_hypercall(CPUState *env, sPAPREnvironment *spapr,
>>
>>     switch (token) {
>>     case H_PUT_TERM_CHAR:
>> -        r = h_put_term_char(args[0], args[1], args[2], args[3]);
>> +        r = h_put_term_char(spapr, args[0], args[1], args[2], args[3]);
>> +        break;
>> +
>> +    case H_GET_TERM_CHAR:
>> +        r = h_get_term_char(spapr, args[0], &args[0], &args[1], &args[2]);
>
> Slick and simple. Blue, do you think there's some random abstraction layer necessary?

Same here.
Benjamin Herrenschmidt Feb. 12, 2011, 9 p.m. UTC | #3
On Sat, 2011-02-12 at 18:59 +0200, Blue Swirl wrote:
> 
> Actually I don't quite understand the need for vty layer, why not use
> the chardev here directly?

I'm not sure what you mean here...

Basically, the interface presented to guests is sPAPR compliant, so
virtual devices come with a bunch of stuff such as standard device-tree
properties, but also hcalls for interrupt control etc... which are
common to most of these guys including vty.

Some of it isn't present in David current patch just yet, but I don't
see how using an existing chardev would provide the same semantics,
especially when we start adding interrupts etc...

Also eventually, VTY's will be hot-pluggable (when we get to do that)
and will use the same mechanisms as the other sPAPR VIO devices for
that.

Cheers,
Ben.
Blue Swirl Feb. 12, 2011, 10:52 p.m. UTC | #4
On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Sat, 2011-02-12 at 18:59 +0200, Blue Swirl wrote:
>>
>> Actually I don't quite understand the need for vty layer, why not use
>> the chardev here directly?
>
> I'm not sure what you mean here...

Maybe it would be reasonable to leave h_put_term_char to spapr_hcall.c
instead of moving those to a separate file.
Benjamin Herrenschmidt Feb. 12, 2011, 11:15 p.m. UTC | #5
On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
> On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
> <benh@kernel.crashing.org> wrote:
> > On Sat, 2011-02-12 at 18:59 +0200, Blue Swirl wrote:
> >>
> >> Actually I don't quite understand the need for vty layer, why not use
> >> the chardev here directly?
> >
> > I'm not sure what you mean here...
> 
> Maybe it would be reasonable to leave h_put_term_char to spapr_hcall.c
> instead of moving those to a separate file.

Well, the VIO device instance gives the chardev instance which is all
nicely encapsulated inside spapr-vty... Also VIO devices tend to have
dedicated hcalls, not only VTY, so it makes a lot of sense to keep them
close to the rest of the VIO driver they belong to don't you think ?

(Actually veth does, vscsi uses the more "generic" CRQ stuff which we've
added to the core VIO but you'll see that when we get those patches
ready, hopefully soon).

Actually, one thing I noticed is that the current patches David posted
still have a single function with a switch/case statement for hcalls.

I'm not 100% certain what David long term plans are here, but in our
internal "WIP" tree, we've subsequently turned that into a mechanism
where any module can call powerpc_register_hypercall() to add hcalls.

So if David intends to move the "upstream candidate" tree in that
direction, then naturally, the calls in spapr_hcall.c are going to
disappear in favor of a pair of powerpc_register_hypercall() locally in
the vty module.

Cheers,
Ben.
Blue Swirl Feb. 13, 2011, 8:08 a.m. UTC | #6
On Sun, Feb 13, 2011 at 1:15 AM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
>> On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
>> <benh@kernel.crashing.org> wrote:
>> > On Sat, 2011-02-12 at 18:59 +0200, Blue Swirl wrote:
>> >>
>> >> Actually I don't quite understand the need for vty layer, why not use
>> >> the chardev here directly?
>> >
>> > I'm not sure what you mean here...
>>
>> Maybe it would be reasonable to leave h_put_term_char to spapr_hcall.c
>> instead of moving those to a separate file.
>
> Well, the VIO device instance gives the chardev instance which is all
> nicely encapsulated inside spapr-vty... Also VIO devices tend to have
> dedicated hcalls, not only VTY, so it makes a lot of sense to keep them
> close to the rest of the VIO driver they belong to don't you think ?
>
> (Actually veth does, vscsi uses the more "generic" CRQ stuff which we've
> added to the core VIO but you'll see that when we get those patches
> ready, hopefully soon).

This is a bit of a special case, much like semihosting modes for m68k
or ARM, or like MOL hacks which were removed recently. From QEMU point
of view, the most natural way of handling this would be hypervisor
implemented in the guest side (for example BIOS). Then the hypervisor
would use normal IO (or virtio) to communicate with the host. If
inside QEMU, the interface of the hypervisor to the devices needs some
thought. We'd like to avoid ugly interfaces like vmmouse where a
device probes CPU registers directly or spaghetti interfaces like
APIC.

> Actually, one thing I noticed is that the current patches David posted
> still have a single function with a switch/case statement for hcalls.
>
> I'm not 100% certain what David long term plans are here, but in our
> internal "WIP" tree, we've subsequently turned that into a mechanism
> where any module can call powerpc_register_hypercall() to add hcalls.
>
> So if David intends to move the "upstream candidate" tree in that
> direction, then naturally, the calls in spapr_hcall.c are going to
> disappear in favor of a pair of powerpc_register_hypercall() locally in
> the vty module.

Is the interface new design, or are you implementing what is used also
on real HW?
David Gibson Feb. 13, 2011, 11:09 a.m. UTC | #7
On Sat, Feb 12, 2011 at 05:47:53PM +0100, Alexander Graf wrote:
> On 12.02.2011, at 15:54, David Gibson wrote:
[snip] 
> > @@ -267,6 +295,7 @@ static QEMUMachine spapr_machine = {
> >     .desc = "pSeries Logical Partition (PAPR compliant)",
> >     .init = ppc_spapr_init,
> >     .max_cpus = 1,
> > +    .no_parallel = 1,
> 
> duplicate?

Oops, rebasing mistake.  Fixed now.

[snip]
> > +VIOsPAPRDevice *spapr_vio_find_by_reg(VIOsPAPRBus *bus, uint32_t reg)
> > +{
> > +    DeviceState *qdev;
> > +    VIOsPAPRDevice *dev = NULL;
> > +
> > +    QLIST_FOREACH(qdev, &bus->bus.children, sibling) {
> > +        dev = (VIOsPAPRDevice *)qdev;
> > +        if (dev->reg == reg)
> 
> Braces
> 
> > +            break;
> > +    }
> > +
> > +    return dev;
> 
> What if the device doesn't exist?

This returns NULL, the caller returns an error...
David Gibson Feb. 13, 2011, 11:12 a.m. UTC | #8
On Sun, Feb 13, 2011 at 10:08:23AM +0200, Blue Swirl wrote:
> On Sun, Feb 13, 2011 at 1:15 AM, Benjamin Herrenschmidt
> <benh@kernel.crashing.org> wrote:
> > On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
> >> On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
> >> <benh@kernel.crashing.org> wrote:
> >> > On Sat, 2011-02-12 at 18:59 +0200, Blue Swirl wrote:
> >> >>
> >> >> Actually I don't quite understand the need for vty layer, why not use
> >> >> the chardev here directly?
> >> >
> >> > I'm not sure what you mean here...
> >>
> >> Maybe it would be reasonable to leave h_put_term_char to spapr_hcall.c
> >> instead of moving those to a separate file.
> >
> > Well, the VIO device instance gives the chardev instance which is all
> > nicely encapsulated inside spapr-vty... Also VIO devices tend to have
> > dedicated hcalls, not only VTY, so it makes a lot of sense to keep them
> > close to the rest of the VIO driver they belong to don't you think ?
> >
> > (Actually veth does, vscsi uses the more "generic" CRQ stuff which we've
> > added to the core VIO but you'll see that when we get those patches
> > ready, hopefully soon).
> 
> This is a bit of a special case, much like semihosting modes for m68k
> or ARM, or like MOL hacks which were removed recently. From QEMU point
> of view, the most natural way of handling this would be hypervisor
> implemented in the guest side (for example BIOS). Then the hypervisor
> would use normal IO (or virtio) to communicate with the host. If
> inside QEMU, the interface of the hypervisor to the devices needs some
> thought. We'd like to avoid ugly interfaces like vmmouse where a
> device probes CPU registers directly or spaghetti interfaces like
> APIC.

I really don't follow what you're saying here.  Running the hypervisor
in the guest, rather than emulating its effect in qemu seems like an
awful lot of complexity for no clear reason.

> > Actually, one thing I noticed is that the current patches David posted
> > still have a single function with a switch/case statement for hcalls.
> >
> > I'm not 100% certain what David long term plans are here, but in our
> > internal "WIP" tree, we've subsequently turned that into a mechanism
> > where any module can call powerpc_register_hypercall() to add hcalls.
> >
> > So if David intends to move the "upstream candidate" tree in that
> > direction, then naturally, the calls in spapr_hcall.c are going to
> > disappear in favor of a pair of powerpc_register_hypercall() locally in
> > the vty module.
> 
> Is the interface new design, or are you implementing what is used also
> on real HW?

The interface already exists on real HW.  It's described in the PAPR
document we keep mentioning.
David Gibson Feb. 13, 2011, 11:14 a.m. UTC | #9
On Sun, Feb 13, 2011 at 10:15:03AM +1100, Benjamin Herrenschmidt wrote:
> On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
> > On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
[snip]
> Actually, one thing I noticed is that the current patches David posted
> still have a single function with a switch/case statement for hcalls.
> 
> I'm not 100% certain what David long term plans are here, but in our
> internal "WIP" tree, we've subsequently turned that into a mechanism
> where any module can call powerpc_register_hypercall() to add hcalls.
> 
> So if David intends to move the "upstream candidate" tree in that
> direction, then naturally, the calls in spapr_hcall.c are going to
> disappear in favor of a pair of powerpc_register_hypercall() locally in
> the vty module.

Ah, yeah.  I'm still not sure what to do about it.  I was going to
fold the dynamic hcall registration into the patch set before
upstreaming.  But then something paulus said made me rethink whether
the dynamic registration was a good idea.  Still need to sort this out
before the series is really ready.
Blue Swirl Feb. 13, 2011, 12:15 p.m. UTC | #10
On Sun, Feb 13, 2011 at 1:12 PM, David Gibson
<david@gibson.dropbear.id.au> wrote:
> On Sun, Feb 13, 2011 at 10:08:23AM +0200, Blue Swirl wrote:
>> On Sun, Feb 13, 2011 at 1:15 AM, Benjamin Herrenschmidt
>> <benh@kernel.crashing.org> wrote:
>> > On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
>> >> On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
>> >> <benh@kernel.crashing.org> wrote:
>> >> > On Sat, 2011-02-12 at 18:59 +0200, Blue Swirl wrote:
>> >> >>
>> >> >> Actually I don't quite understand the need for vty layer, why not use
>> >> >> the chardev here directly?
>> >> >
>> >> > I'm not sure what you mean here...
>> >>
>> >> Maybe it would be reasonable to leave h_put_term_char to spapr_hcall.c
>> >> instead of moving those to a separate file.
>> >
>> > Well, the VIO device instance gives the chardev instance which is all
>> > nicely encapsulated inside spapr-vty... Also VIO devices tend to have
>> > dedicated hcalls, not only VTY, so it makes a lot of sense to keep them
>> > close to the rest of the VIO driver they belong to don't you think ?
>> >
>> > (Actually veth does, vscsi uses the more "generic" CRQ stuff which we've
>> > added to the core VIO but you'll see that when we get those patches
>> > ready, hopefully soon).
>>
>> This is a bit of a special case, much like semihosting modes for m68k
>> or ARM, or like MOL hacks which were removed recently. From QEMU point
>> of view, the most natural way of handling this would be hypervisor
>> implemented in the guest side (for example BIOS). Then the hypervisor
>> would use normal IO (or virtio) to communicate with the host. If
>> inside QEMU, the interface of the hypervisor to the devices needs some
>> thought. We'd like to avoid ugly interfaces like vmmouse where a
>> device probes CPU registers directly or spaghetti interfaces like
>> APIC.
>
> I really don't follow what you're saying here.  Running the hypervisor
> in the guest, rather than emulating its effect in qemu seems like an
> awful lot of complexity for no clear reason.

Maybe it would be more complex but also emulation accuracy would be
increased and the interfaces would be saner. We don't shortcut BIOS
and implement its services to OS in QEMU for other machines either.

I'd expect one problem with that approach though, the interface used
on real HW between the hypervisor and the underlying HW may be
undocumented, but then it could use for example existing virtio
devices.

One way to handle this could be to add the hypervisor interface now to
QEMU and switch to guest hypervisor when (if) it becomes available.
I'd just like to avoid duplication with virtio or messy interfaces
like vmport.
Alexander Graf Feb. 13, 2011, 12:31 p.m. UTC | #11
On 13.02.2011, at 09:08, Blue Swirl wrote:

> On Sun, Feb 13, 2011 at 1:15 AM, Benjamin Herrenschmidt
> <benh@kernel.crashing.org> wrote:
>> On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
>>> On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
>>> <benh@kernel.crashing.org> wrote:
>>>> On Sat, 2011-02-12 at 18:59 +0200, Blue Swirl wrote:
>>>>> 
>>>>> Actually I don't quite understand the need for vty layer, why not use
>>>>> the chardev here directly?
>>>> 
>>>> I'm not sure what you mean here...
>>> 
>>> Maybe it would be reasonable to leave h_put_term_char to spapr_hcall.c
>>> instead of moving those to a separate file.
>> 
>> Well, the VIO device instance gives the chardev instance which is all
>> nicely encapsulated inside spapr-vty... Also VIO devices tend to have
>> dedicated hcalls, not only VTY, so it makes a lot of sense to keep them
>> close to the rest of the VIO driver they belong to don't you think ?
>> 
>> (Actually veth does, vscsi uses the more "generic" CRQ stuff which we've
>> added to the core VIO but you'll see that when we get those patches
>> ready, hopefully soon).
> 
> This is a bit of a special case, much like semihosting modes for m68k
> or ARM, or like MOL hacks which were removed recently. From QEMU point
> of view, the most natural way of handling this would be hypervisor
> implemented in the guest side (for example BIOS). Then the hypervisor
> would use normal IO (or virtio) to communicate with the host. If
> inside QEMU, the interface of the hypervisor to the devices needs some
> thought. We'd like to avoid ugly interfaces like vmmouse where a
> device probes CPU registers directly or spaghetti interfaces like
> APIC.

In this case I disagree. While the "emulate real hardware" case would be to have a full proprietary binary blob of firmware running in the guest that would handle all the hypervisor stuff, this is not feasible. Implementing PAPR in Qemu is hard (and slow) enough - doing it all emulated simply is overkill for what we're trying to achieve here.

The PAPR interfaces are well specified (in the PAPR spec - seems to be power.org member restricted) and are the only thing you ever get to see on recent POWER hardware. The real hardware interface is simply inaccessible to you.

It's basically the same as the S390 machine we have. On those machine we simply don't have access to real hw, so emulating it is moot. All interfaces that the OS sees are PV interfaces.

> 
>> Actually, one thing I noticed is that the current patches David posted
>> still have a single function with a switch/case statement for hcalls.
>> 
>> I'm not 100% certain what David long term plans are here, but in our
>> internal "WIP" tree, we've subsequently turned that into a mechanism
>> where any module can call powerpc_register_hypercall() to add hcalls.
>> 
>> So if David intends to move the "upstream candidate" tree in that
>> direction, then naturally, the calls in spapr_hcall.c are going to
>> disappear in favor of a pair of powerpc_register_hypercall() locally in
>> the vty module.
> 
> Is the interface new design, or are you implementing what is used also
> on real HW?

PAPR is the spec that defines the PV interface in use on POWER. Outside of IBM, nobody can run Linux on POWER without going through phyp which is their hypervisor.

So this implements exactly what the OS sees on real HW :).


Alex
Alexander Graf Feb. 13, 2011, 12:38 p.m. UTC | #12
On 13.02.2011, at 12:09, David Gibson wrote:

> On Sat, Feb 12, 2011 at 05:47:53PM +0100, Alexander Graf wrote:
>> On 12.02.2011, at 15:54, David Gibson wrote:
> [snip] 
>>> @@ -267,6 +295,7 @@ static QEMUMachine spapr_machine = {
>>>    .desc = "pSeries Logical Partition (PAPR compliant)",
>>>    .init = ppc_spapr_init,
>>>    .max_cpus = 1,
>>> +    .no_parallel = 1,
>> 
>> duplicate?
> 
> Oops, rebasing mistake.  Fixed now.
> 
> [snip]
>>> +VIOsPAPRDevice *spapr_vio_find_by_reg(VIOsPAPRBus *bus, uint32_t reg)
>>> +{
>>> +    DeviceState *qdev;
>>> +    VIOsPAPRDevice *dev = NULL;
>>> +
>>> +    QLIST_FOREACH(qdev, &bus->bus.children, sibling) {
>>> +        dev = (VIOsPAPRDevice *)qdev;
>>> +        if (dev->reg == reg)
>> 
>> Braces
>> 
>>> +            break;
>>> +    }
>>> +
>>> +    return dev;
>> 
>> What if the device doesn't exist?
> 
> This returns NULL, the caller returns an error...

Makes sense :).


Alex
Alexander Graf Feb. 13, 2011, 12:40 p.m. UTC | #13
On 13.02.2011, at 12:14, David Gibson wrote:

> On Sun, Feb 13, 2011 at 10:15:03AM +1100, Benjamin Herrenschmidt wrote:
>> On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
>>> On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
> [snip]
>> Actually, one thing I noticed is that the current patches David posted
>> still have a single function with a switch/case statement for hcalls.
>> 
>> I'm not 100% certain what David long term plans are here, but in our
>> internal "WIP" tree, we've subsequently turned that into a mechanism
>> where any module can call powerpc_register_hypercall() to add hcalls.
>> 
>> So if David intends to move the "upstream candidate" tree in that
>> direction, then naturally, the calls in spapr_hcall.c are going to
>> disappear in favor of a pair of powerpc_register_hypercall() locally in
>> the vty module.
> 
> Ah, yeah.  I'm still not sure what to do about it.  I was going to
> fold the dynamic hcall registration into the patch set before
> upstreaming.  But then something paulus said made me rethink whether
> the dynamic registration was a good idea.  Still need to sort this out
> before the series is really ready.

We can surely move it to dynamic later on. I think the "proper" way would be to populate a qdev bus and have the individual hypercall receivers register themselves through -device creations. But Blue really is the expert here :).


Alex
David Gibson Feb. 13, 2011, 12:44 p.m. UTC | #14
On Sun, Feb 13, 2011 at 01:40:14PM +0100, Alexander Graf wrote:
> 
> On 13.02.2011, at 12:14, David Gibson wrote:
> 
> > On Sun, Feb 13, 2011 at 10:15:03AM +1100, Benjamin Herrenschmidt wrote:
> >> On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
> >>> On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
> > [snip]
> >> Actually, one thing I noticed is that the current patches David posted
> >> still have a single function with a switch/case statement for hcalls.
> >> 
> >> I'm not 100% certain what David long term plans are here, but in our
> >> internal "WIP" tree, we've subsequently turned that into a mechanism
> >> where any module can call powerpc_register_hypercall() to add hcalls.
> >> 
> >> So if David intends to move the "upstream candidate" tree in that
> >> direction, then naturally, the calls in spapr_hcall.c are going to
> >> disappear in favor of a pair of powerpc_register_hypercall() locally in
> >> the vty module.
> > 
> > Ah, yeah.  I'm still not sure what to do about it.  I was going to
> > fold the dynamic hcall registration into the patch set before
> > upstreaming.  But then something paulus said made me rethink whether
> > the dynamic registration was a good idea.  Still need to sort this out
> > before the series is really ready.
> 
> We can surely move it to dynamic later on. I think the "proper" way
> would be to populate a qdev bus and have the individual hypercall
> receivers register themselves through -device creations. But Blue
> really is the expert here :).

Ok, not sure what you mean here.  I already have a qdev bus for the
VIO devices.  With my tentative dynamic model as devices are created
on the bus they may register hypercalls as well.

Is that what you mean, or do you mean have a separate "hypercall"
bus.  That sounds like serious overkill for a simple number->function
translation.
Blue Swirl Feb. 13, 2011, 12:59 p.m. UTC | #15
On Sun, Feb 13, 2011 at 2:31 PM, Alexander Graf <agraf@suse.de> wrote:
>
> On 13.02.2011, at 09:08, Blue Swirl wrote:
>
>> On Sun, Feb 13, 2011 at 1:15 AM, Benjamin Herrenschmidt
>> <benh@kernel.crashing.org> wrote:
>>> On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
>>>> On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
>>>> <benh@kernel.crashing.org> wrote:
>>>>> On Sat, 2011-02-12 at 18:59 +0200, Blue Swirl wrote:
>>>>>>
>>>>>> Actually I don't quite understand the need for vty layer, why not use
>>>>>> the chardev here directly?
>>>>>
>>>>> I'm not sure what you mean here...
>>>>
>>>> Maybe it would be reasonable to leave h_put_term_char to spapr_hcall.c
>>>> instead of moving those to a separate file.
>>>
>>> Well, the VIO device instance gives the chardev instance which is all
>>> nicely encapsulated inside spapr-vty... Also VIO devices tend to have
>>> dedicated hcalls, not only VTY, so it makes a lot of sense to keep them
>>> close to the rest of the VIO driver they belong to don't you think ?
>>>
>>> (Actually veth does, vscsi uses the more "generic" CRQ stuff which we've
>>> added to the core VIO but you'll see that when we get those patches
>>> ready, hopefully soon).
>>
>> This is a bit of a special case, much like semihosting modes for m68k
>> or ARM, or like MOL hacks which were removed recently. From QEMU point
>> of view, the most natural way of handling this would be hypervisor
>> implemented in the guest side (for example BIOS). Then the hypervisor
>> would use normal IO (or virtio) to communicate with the host. If
>> inside QEMU, the interface of the hypervisor to the devices needs some
>> thought. We'd like to avoid ugly interfaces like vmmouse where a
>> device probes CPU registers directly or spaghetti interfaces like
>> APIC.
>
> In this case I disagree. While the "emulate real hardware" case would be to have a full proprietary binary blob of firmware running in the guest that would handle all the hypervisor stuff, this is not feasible. Implementing PAPR in Qemu is hard (and slow) enough - doing it all emulated simply is overkill for what we're trying to achieve here.
>
> The PAPR interfaces are well specified (in the PAPR spec - seems to be power.org member restricted) and are the only thing you ever get to see on recent POWER hardware. The real hardware interface is simply inaccessible to you.

Hah, I'm sure that if sufficiently motivated bunch of cool hackers got
access to a truckload (or several, aren't these big machines?) of
POWER hardware, which they could open and brick at leisure, like it is
done with other firmware reverse engineering efforts, we'd have open
hypervisor at no time. ;-)

It's not impossible to imagine also IBM open sourcing theirs.

> It's basically the same as the S390 machine we have. On those machine we simply don't have access to real hw, so emulating it is moot. All interfaces that the OS sees are PV interfaces.
>
>>
>>> Actually, one thing I noticed is that the current patches David posted
>>> still have a single function with a switch/case statement for hcalls.
>>>
>>> I'm not 100% certain what David long term plans are here, but in our
>>> internal "WIP" tree, we've subsequently turned that into a mechanism
>>> where any module can call powerpc_register_hypercall() to add hcalls.
>>>
>>> So if David intends to move the "upstream candidate" tree in that
>>> direction, then naturally, the calls in spapr_hcall.c are going to
>>> disappear in favor of a pair of powerpc_register_hypercall() locally in
>>> the vty module.
>>
>> Is the interface new design, or are you implementing what is used also
>> on real HW?
>
> PAPR is the spec that defines the PV interface in use on POWER. Outside of IBM, nobody can run Linux on POWER without going through phyp which is their hypervisor.
>
> So this implements exactly what the OS sees on real HW :).

Right. As long as the resulting spaghetti is well contained, I'm OK
with this approach. But should ever the millionaire hacker team (or
IBM) appear with their open hypervisor, this shall make room for it.
Alexander Graf Feb. 13, 2011, 1:09 p.m. UTC | #16
On 13.02.2011, at 13:44, David Gibson wrote:

> On Sun, Feb 13, 2011 at 01:40:14PM +0100, Alexander Graf wrote:
>> 
>> On 13.02.2011, at 12:14, David Gibson wrote:
>> 
>>> On Sun, Feb 13, 2011 at 10:15:03AM +1100, Benjamin Herrenschmidt wrote:
>>>> On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
>>>>> On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
>>> [snip]
>>>> Actually, one thing I noticed is that the current patches David posted
>>>> still have a single function with a switch/case statement for hcalls.
>>>> 
>>>> I'm not 100% certain what David long term plans are here, but in our
>>>> internal "WIP" tree, we've subsequently turned that into a mechanism
>>>> where any module can call powerpc_register_hypercall() to add hcalls.
>>>> 
>>>> So if David intends to move the "upstream candidate" tree in that
>>>> direction, then naturally, the calls in spapr_hcall.c are going to
>>>> disappear in favor of a pair of powerpc_register_hypercall() locally in
>>>> the vty module.
>>> 
>>> Ah, yeah.  I'm still not sure what to do about it.  I was going to
>>> fold the dynamic hcall registration into the patch set before
>>> upstreaming.  But then something paulus said made me rethink whether
>>> the dynamic registration was a good idea.  Still need to sort this out
>>> before the series is really ready.
>> 
>> We can surely move it to dynamic later on. I think the "proper" way
>> would be to populate a qdev bus and have the individual hypercall
>> receivers register themselves through -device creations. But Blue
>> really is the expert here :).
> 
> Ok, not sure what you mean here.  I already have a qdev bus for the
> VIO devices.  With my tentative dynamic model as devices are created
> on the bus they may register hypercalls as well.

Oh, guess I just overlooked that then, sorry :).

> Is that what you mean, or do you mean have a separate "hypercall"
> bus.  That sounds like serious overkill for a simple number->function
> translation.

Yup.


Alex
Anthony Liguori Feb. 13, 2011, 3:08 p.m. UTC | #17
On 02/13/2011 05:12 AM, David Gibson wrote:
> On Sun, Feb 13, 2011 at 10:08:23AM +0200, Blue Swirl wrote:
>    
>> On Sun, Feb 13, 2011 at 1:15 AM, Benjamin Herrenschmidt
>> <benh@kernel.crashing.org>  wrote:
>>      
>>> On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
>>>        
>>>> On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
>>>> <benh@kernel.crashing.org>  wrote:
>>>>          
>>>>> On Sat, 2011-02-12 at 18:59 +0200, Blue Swirl wrote:
>>>>>            
>>>>>> Actually I don't quite understand the need for vty layer, why not use
>>>>>> the chardev here directly?
>>>>>>              
>>>>> I'm not sure what you mean here...
>>>>>            
>>>> Maybe it would be reasonable to leave h_put_term_char to spapr_hcall.c
>>>> instead of moving those to a separate file.
>>>>          
>>> Well, the VIO device instance gives the chardev instance which is all
>>> nicely encapsulated inside spapr-vty... Also VIO devices tend to have
>>> dedicated hcalls, not only VTY, so it makes a lot of sense to keep them
>>> close to the rest of the VIO driver they belong to don't you think ?
>>>
>>> (Actually veth does, vscsi uses the more "generic" CRQ stuff which we've
>>> added to the core VIO but you'll see that when we get those patches
>>> ready, hopefully soon).
>>>        
>> This is a bit of a special case, much like semihosting modes for m68k
>> or ARM, or like MOL hacks which were removed recently. From QEMU point
>> of view, the most natural way of handling this would be hypervisor
>> implemented in the guest side (for example BIOS). Then the hypervisor
>> would use normal IO (or virtio) to communicate with the host. If
>> inside QEMU, the interface of the hypervisor to the devices needs some
>> thought. We'd like to avoid ugly interfaces like vmmouse where a
>> device probes CPU registers directly or spaghetti interfaces like
>> APIC.
>>      
> I really don't follow what you're saying here.  Running the hypervisor
> in the guest, rather than emulating its effect in qemu seems like an
> awful lot of complexity for no clear reason.
>    

In KVM for x86, instead of using a secondary interface (like 
vmmcall/vmcall), we do all of our paravirtualization using native 
hardware interfaces that we can trap (PIO/MMIO).

IIUC, on Power, trapping MMIO is not possible due to the MMU mode that 
is currently used (PFs are delivered directly to the guest).

So it's not really possible to switch from using hypercalls to using MMIO.

What I would suggest is modelling hypercalls as another I/O address 
space for the processor.  So instead of having a function pointer in the 
CPUState, introduce a:

typedef void (HypercallFunc)(CPUState *env, void *opaque);

/* register a hypercall handler */
void register_hypercall(target_ulong index, HypercallFunc *handler, void 
*opaque);
void unregister_hypercall(target_ulong index);

/* dispatch a hypercall */
void cpu_hypercall(CPUState *env, target_ulong index);

This interface could also be used to implement hypercall based 
interfaces on s390 and x86.

The arguments will have to be extracted from the CPU state but I don't 
think we'll really ever have common hypercall implementations anyway so 
that's not a huge problem.

>> on real HW?
>>      
> The interface already exists on real HW.  It's described in the PAPR
> document we keep mentioning.
>    

Another thing to note is that the hypercall based I/O devices the 
interfaces that the current Power hypervisor uses so implementing this 
interface is necessary to support existing guests.

Regards,

Anthony Liguori
Anthony Liguori Feb. 13, 2011, 3:14 p.m. UTC | #18
On 02/13/2011 06:40 AM, Alexander Graf wrote:
>
>> Ah, yeah.  I'm still not sure what to do about it.  I was going to
>> fold the dynamic hcall registration into the patch set before
>> upstreaming.  But then something paulus said made me rethink whether
>> the dynamic registration was a good idea.  Still need to sort this out
>> before the series is really ready.
>>      
> We can surely move it to dynamic later on. I think the "proper" way would be to populate a qdev bus and have the individual hypercall receivers register themselves through -device creations.
>    

Ignore the qdev bit for a moment.  Hypercalls could be plausibly 
implemented as another I/O space from a processor so the thing to model 
off of would be PIO dispatch (cpu_outb and friends).

 From a qdev perspective, having a VIO bus makes sense.  The details of 
which I/O spaces are uses are not as important from a device tree 
perspective.

Regards,

Anthony Liguori

> Alex
>
>
>
Alexander Graf Feb. 13, 2011, 3:56 p.m. UTC | #19
On 13.02.2011, at 16:08, Anthony Liguori wrote:

> On 02/13/2011 05:12 AM, David Gibson wrote:
>> On Sun, Feb 13, 2011 at 10:08:23AM +0200, Blue Swirl wrote:
>>   
>>> On Sun, Feb 13, 2011 at 1:15 AM, Benjamin Herrenschmidt
>>> <benh@kernel.crashing.org>  wrote:
>>>     
>>>> On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
>>>>       
>>>>> On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
>>>>> <benh@kernel.crashing.org>  wrote:
>>>>>         
>>>>>> On Sat, 2011-02-12 at 18:59 +0200, Blue Swirl wrote:
>>>>>>           
>>>>>>> Actually I don't quite understand the need for vty layer, why not use
>>>>>>> the chardev here directly?
>>>>>>>             
>>>>>> I'm not sure what you mean here...
>>>>>>           
>>>>> Maybe it would be reasonable to leave h_put_term_char to spapr_hcall.c
>>>>> instead of moving those to a separate file.
>>>>>         
>>>> Well, the VIO device instance gives the chardev instance which is all
>>>> nicely encapsulated inside spapr-vty... Also VIO devices tend to have
>>>> dedicated hcalls, not only VTY, so it makes a lot of sense to keep them
>>>> close to the rest of the VIO driver they belong to don't you think ?
>>>> 
>>>> (Actually veth does, vscsi uses the more "generic" CRQ stuff which we've
>>>> added to the core VIO but you'll see that when we get those patches
>>>> ready, hopefully soon).
>>>>       
>>> This is a bit of a special case, much like semihosting modes for m68k
>>> or ARM, or like MOL hacks which were removed recently. From QEMU point
>>> of view, the most natural way of handling this would be hypervisor
>>> implemented in the guest side (for example BIOS). Then the hypervisor
>>> would use normal IO (or virtio) to communicate with the host. If
>>> inside QEMU, the interface of the hypervisor to the devices needs some
>>> thought. We'd like to avoid ugly interfaces like vmmouse where a
>>> device probes CPU registers directly or spaghetti interfaces like
>>> APIC.
>>>     
>> I really don't follow what you're saying here.  Running the hypervisor
>> in the guest, rather than emulating its effect in qemu seems like an
>> awful lot of complexity for no clear reason.
>>   
> 
> In KVM for x86, instead of using a secondary interface (like vmmcall/vmcall), we do all of our paravirtualization using native hardware interfaces that we can trap (PIO/MMIO).
> 
> IIUC, on Power, trapping MMIO is not possible due to the MMU mode that is currently used (PFs are delivered directly to the guest).
> 
> So it's not really possible to switch from using hypercalls to using MMIO.
> 
> What I would suggest is modelling hypercalls as another I/O address space for the processor.  So instead of having a function pointer in the CPUState, introduce a:
> 
> typedef void (HypercallFunc)(CPUState *env, void *opaque);
> 
> /* register a hypercall handler */
> void register_hypercall(target_ulong index, HypercallFunc *handler, void *opaque);
> void unregister_hypercall(target_ulong index);
> 
> /* dispatch a hypercall */
> void cpu_hypercall(CPUState *env, target_ulong index);
> 
> This interface could also be used to implement hypercall based interfaces on s390 and x86.
> 
> The arguments will have to be extracted from the CPU state but I don't think we'll really ever have common hypercall implementations anyway so that's not a huge problem.

Is there enough common ground between the hypercall interfaces that this makes sense? It sounds nice on paper, but in the end the "hypercall not implemented" return codes differ, the argument interpretation differs and even the amount of return values differ.

So at the end of the day, all this interface could do is call a machine helper function to evaluate the hypercall id from its register state and dispatch to that, leaving the rest to the individual hypercall function implementation.

The implementations themselves also couldn't be reused. A PAPR hypercall simply doesn't work on x86. PIO and MMIO interfaces make sense to share - devices implemented using them could potentially be reused later by other platforms. For the hypercall devices, that doesn't work.

> 
>>> on real HW?
>>>     
>> The interface already exists on real HW.  It's described in the PAPR
>> document we keep mentioning.
>>   
> 
> Another thing to note is that the hypercall based I/O devices the interfaces that the current Power hypervisor uses so implementing this interface is necessary to support existing guests.

Yes, implementing the existing PAPR interfaces is crucial to run existing guests. Implementing it at the hypercall level is required if we ever want to run it with hardware accelerated KVM on ppc, as there hypercalls simply get forwarded to the hypervisor (kvm) which would pass them on to qemu. And since the interface is not nesting capable, running a hypervisor inside the guest doesn't work.


Alex
Benjamin Herrenschmidt Feb. 13, 2011, 4:07 p.m. UTC | #20
On Sun, 2011-02-13 at 10:08 +0200, Blue Swirl wrote:
> This is a bit of a special case, much like semihosting modes for m68k
> or ARM, or like MOL hacks which were removed recently. From QEMU point
> of view, the most natural way of handling this would be hypervisor
> implemented in the guest side (for example BIOS). Then the hypervisor
> would use normal IO (or virtio) to communicate with the host. If
> inside QEMU, the interface of the hypervisor to the devices needs some
> thought. We'd like to avoid ugly interfaces like vmmouse where a
> device probes CPU registers directly or spaghetti interfaces like
> APIC.

I'm not sure I understand your point. We are emulating a guest running
under pHyp, ie, qemu in that case is the hypervisor, and provides the
same interfaces as pHyp does (the IBM proprietary hypervisor, aka
sPAPR). This is how we enable booting existing kernels and distributions
inside qemu/kvm.

> Is the interface new design, or are you implementing what is used also
> on real HW?

We are implementing a subset of the interfaces implemented by pHyp today
on "real HW". In the long run, when you throw KVM into the picture, on
machines (hypothetical machines at this stage) where one can run a KVM
has a hypervisor (currently you can only run pHyp on all released IBM
machines), this will give you an environment that is compatible with
pHyp and thus supports existing distributions and kernels.

We -will- add support for the "real" virtio on top of that, but those
will require modified guest kernels (and will provide better
performances than the sPAPR emulation). But the sPAPR emulation is a
necessary step to enable existing stuff to run.

Cheers,
Ben.
Benjamin Herrenschmidt Feb. 13, 2011, 4:12 p.m. UTC | #21
On Sun, 2011-02-13 at 14:15 +0200, Blue Swirl wrote:
> 
> Maybe it would be more complex but also emulation accuracy would be
> increased and the interfaces would be saner. We don't shortcut BIOS
> and implement its services to OS in QEMU for other machines either.

But that is not comparable. BIOS is comparable for example to Open
Firmware and we do not 'emulate' OF, we will provide an implementation
that runs inside the guest, just like you do for BIOS (SLOF based, tho
people are welcome to play with OpenBIOS if they want, but SLOF is what
we will provide and support).

In this case, we are talking about a hypervisor which is somewhat a
different beast. Sure you -could- run it into the guest, I suppose, if
emulation accuracy was your ultimate goal. That would entail at least
the followings:

 - Implement support for the complete "hypervisor" mode inside qemu
 - Re-implement a complete hypervisor compatible with pHyp

An enormous amount of work, for a result that would have low
performances and about zero interest to anybody.

The goal here is to provide a runtime environment for kernels and
distributions that is -compatible- with sPAPR/pHyp to enable existing
distributions to operate in KVM.

> I'd expect one problem with that approach though, the interface used
> on real HW between the hypervisor and the underlying HW may be
> undocumented, but then it could use for example existing virtio
> devices.

But what would be the point ?

> One way to handle this could be to add the hypervisor interface now to
> QEMU and switch to guest hypervisor when (if) it becomes available.
> I'd just like to avoid duplication with virtio or messy interfaces
> like vmport. 

Again, what would be the point ? Eventually, KVM will be available as an
"alternate" hypervisor to pHyp which I suppose one could run entirely
inside qemu once we add support for the HV mode to it, and that would
somewhat do what you describe but that isn't what we are trying to get
at here.

Cheers,
Ben.
Benjamin Herrenschmidt Feb. 13, 2011, 4:17 p.m. UTC | #22
On Sun, 2011-02-13 at 13:40 +0100, Alexander Graf wrote:
> 
> We can surely move it to dynamic later on. I think the "proper" way
> would be to populate a qdev bus and have the individual hypercall
> receivers register themselves through -device creations. But Blue
> really is the expert here :).

Why would you want to go through a bus for all hcalls ? (ie including
the ones that aren't device related ?). That doesn't quite "tick" but
I'm sure I'm missing part of the picture here :-)

A simple dispatch table based approach is the most efficient and simple
way to do that (after a switch/case) in my opinion, this is more/less
what we have done internally, with a pair of calls for "modules" to
register hcalls if they need to. The hcalls numbers are fixed and
specified in sPAPR.

BTW. We are still missing in this picture RTAS. I suppose David is still
cleaning up those patches. Basically, we use a 5 instruction trampoline
that calls a private h-call, the RTAS emulation is entirely in qemu.
This has to be done that way for various reasons, but essentially RTAS
under pHyp also more/less turns into private pHyp calls under the hood.
 
Cheers,
Ben.
Anthony Liguori Feb. 13, 2011, 4:46 p.m. UTC | #23
On 02/13/2011 09:56 AM, Alexander Graf wrote:
>
>> This interface could also be used to implement hypercall based interfaces on s390 and x86.
>>
>> The arguments will have to be extracted from the CPU state but I don't think we'll really ever have common hypercall implementations anyway so that's not a huge problem.
>>      
> Is there enough common ground between the hypercall interfaces that this makes sense?

Between the dispatch, registration, and tracing, yes.

>   It sounds nice on paper, but in the end the "hypercall not implemented" return codes differ, the argument interpretation differs and even the amount of return values differ.
>    

Yes, that's why we pass CPUState.  But keep in mind, CPUState needs to 
be sync'd before and after the invocation.

> So at the end of the day, all this interface could do is call a machine helper function to evaluate the hypercall id from its register state and dispatch to that, leaving the rest to the individual hypercall function implementation.
>
> The implementations themselves also couldn't be reused. A PAPR hypercall simply doesn't work on x86. PIO and MMIO interfaces make sense to share - devices implemented using them could potentially be reused later by other platforms. For the hypercall devices, that doesn't work.
>    

Yes, which is why I'm not advocating trying to unmarshal the calls or 
anything like that.  But the dispatch, registration, tracing, and CPU 
sync'ing bits can all be reused.

Regards,

Anthony Liguori
Anthony Liguori Feb. 13, 2011, 4:48 p.m. UTC | #24
On 02/13/2011 10:07 AM, Benjamin Herrenschmidt wrote:
> On Sun, 2011-02-13 at 10:08 +0200, Blue Swirl wrote:
>    
>> This is a bit of a special case, much like semihosting modes for m68k
>> or ARM, or like MOL hacks which were removed recently. From QEMU point
>> of view, the most natural way of handling this would be hypervisor
>> implemented in the guest side (for example BIOS). Then the hypervisor
>> would use normal IO (or virtio) to communicate with the host. If
>> inside QEMU, the interface of the hypervisor to the devices needs some
>> thought. We'd like to avoid ugly interfaces like vmmouse where a
>> device probes CPU registers directly or spaghetti interfaces like
>> APIC.
>>      
> I'm not sure I understand your point. We are emulating a guest running
> under pHyp, ie, qemu in that case is the hypervisor, and provides the
> same interfaces as pHyp does (the IBM proprietary hypervisor, aka
> sPAPR). This is how we enable booting existing kernels and distributions
> inside qemu/kvm.
>
>    
>> Is the interface new design, or are you implementing what is used also
>> on real HW?
>>      
> We are implementing a subset of the interfaces implemented by pHyp today
> on "real HW". In the long run, when you throw KVM into the picture, on
> machines (hypothetical machines at this stage) where one can run a KVM
> has a hypervisor (currently you can only run pHyp on all released IBM
> machines), this will give you an environment that is compatible with
> pHyp and thus supports existing distributions and kernels.
>    

We try very, very hard to make our paravirtualization look like real 
hardware.

When the paravirtualization interfaces are already defined, we have no 
choice in supporting those interfaces although sometimes we try to push 
that to firmware (like with Xenner).  It's better from a security PoV.

But in this case, we have no choice but to implement the 
paravirtualization interfaces in QEMU because of the way the hardware 
works and because these interfaces are already well defined.

Regards,

Anthony Liguori

> We -will- add support for the "real" virtio on top of that, but those
> will require modified guest kernels (and will provide better
> performances than the sPAPR emulation). But the sPAPR emulation is a
> necessary step to enable existing stuff to run.
>
> Cheers,
> Ben.
>
>
>
Anthony Liguori Feb. 13, 2011, 4:52 p.m. UTC | #25
On 02/13/2011 10:17 AM, Benjamin Herrenschmidt wrote:
> On Sun, 2011-02-13 at 13:40 +0100, Alexander Graf wrote:
>    
>> We can surely move it to dynamic later on. I think the "proper" way
>> would be to populate a qdev bus and have the individual hypercall
>> receivers register themselves through -device creations. But Blue
>> really is the expert here :).
>>      
> Why would you want to go through a bus for all hcalls ? (ie including
> the ones that aren't device related ?). That doesn't quite "tick" but
> I'm sure I'm missing part of the picture here :-)
>    

A virtual bus is just an interface.  If all virtual devices that 
interact via hcalls would all reside on the same virtual bus, then 
having hypercalls registered through that interface makes sense because 
you can associate hypercalls with particular devices.  This means that 
you can automatically deregister on device removal and things like that.

But I don't think this will work out well.  I think treating the 
hypercalls as a simple dispatch table just like ioport would make sense.

Regards,

Anthony Liguori

> A simple dispatch table based approach is the most efficient and simple
> way to do that (after a switch/case) in my opinion, this is more/less
> what we have done internally, with a pair of calls for "modules" to
> register hcalls if they need to. The hcalls numbers are fixed and
> specified in sPAPR.
>
> BTW. We are still missing in this picture RTAS. I suppose David is still
> cleaning up those patches. Basically, we use a 5 instruction trampoline
> that calls a private h-call, the RTAS emulation is entirely in qemu.
> This has to be done that way for various reasons, but essentially RTAS
> under pHyp also more/less turns into private pHyp calls under the hood.
>
> Cheers,
> Ben.
>
>
>
>
Benjamin Herrenschmidt Feb. 13, 2011, 6:19 p.m. UTC | #26
On Sun, 2011-02-13 at 10:48 -0600, Anthony Liguori wrote:
> 
> We try very, very hard to make our paravirtualization look like real 
> hardware.

Sure, that makes sense when you invent new paravirt interfaces, but that
isn't the case. Note also that our current processors do not have the
ability to emulate MMIOs in all cases, ie, when doing "real" KVM in HV
mode, we cannot trap MMIO unless we redirect all page faults to the
hypervisor, which comes at a cost.

> When the paravirtualization interfaces are already defined, we have no
> choice in supporting those interfaces although sometimes we try to
> push  that to firmware (like with Xenner).  It's better from a
> security PoV.
> 
> But in this case, we have no choice but to implement the 
> paravirtualization interfaces in QEMU because of the way the hardware 
> works and because these interfaces are already well defined.

Right.

Now, in the long run, we might decide to "reflect" some of these back
into some guest co-located FW or whatever of that kind, especially if we
get to a point where linux virt-io becomes more prevalent and the sPAPR
style IOs become purely legacy backward compat stubs, but we aren't
there yet.

Cheers,
Ben.
Benjamin Herrenschmidt Feb. 13, 2011, 6:21 p.m. UTC | #27
On Sun, 2011-02-13 at 10:52 -0600, Anthony Liguori wrote:
> 
> A virtual bus is just an interface.  If all virtual devices that 
> interact via hcalls would all reside on the same virtual bus, then 
> having hypercalls registered through that interface makes sense
> because 
> you can associate hypercalls with particular devices.  This means
> that 
> you can automatically deregister on device removal and things like
> that.

I see. Well, VIO related h-calls are only part of the picture here, I
think we can live with having explicit de-registration if needed ;-)
Besides the h-call is still implemented even if no device -instance- is
currently plugged into the partition anyways. It just returns a (well
defined per-hcall) error code if the instance handle passed to it is
bogus.

> But I don't think this will work out well.  I think treating the 
> hypercalls as a simple dispatch table just like ioport would make
> sense.

Yup.

Cheers,
Ben.
Blue Swirl Feb. 13, 2011, 6:29 p.m. UTC | #28
On Sun, Feb 13, 2011 at 5:08 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 02/13/2011 05:12 AM, David Gibson wrote:
>>
>> On Sun, Feb 13, 2011 at 10:08:23AM +0200, Blue Swirl wrote:
>>
>>>
>>> On Sun, Feb 13, 2011 at 1:15 AM, Benjamin Herrenschmidt
>>> <benh@kernel.crashing.org>  wrote:
>>>
>>>>
>>>> On Sun, 2011-02-13 at 00:52 +0200, Blue Swirl wrote:
>>>>
>>>>>
>>>>> On Sat, Feb 12, 2011 at 11:00 PM, Benjamin Herrenschmidt
>>>>> <benh@kernel.crashing.org>  wrote:
>>>>>
>>>>>>
>>>>>> On Sat, 2011-02-12 at 18:59 +0200, Blue Swirl wrote:
>>>>>>
>>>>>>>
>>>>>>> Actually I don't quite understand the need for vty layer, why not use
>>>>>>> the chardev here directly?
>>>>>>>
>>>>>>
>>>>>> I'm not sure what you mean here...
>>>>>>
>>>>>
>>>>> Maybe it would be reasonable to leave h_put_term_char to spapr_hcall.c
>>>>> instead of moving those to a separate file.
>>>>>
>>>>
>>>> Well, the VIO device instance gives the chardev instance which is all
>>>> nicely encapsulated inside spapr-vty... Also VIO devices tend to have
>>>> dedicated hcalls, not only VTY, so it makes a lot of sense to keep them
>>>> close to the rest of the VIO driver they belong to don't you think ?
>>>>
>>>> (Actually veth does, vscsi uses the more "generic" CRQ stuff which we've
>>>> added to the core VIO but you'll see that when we get those patches
>>>> ready, hopefully soon).
>>>>
>>>
>>> This is a bit of a special case, much like semihosting modes for m68k
>>> or ARM, or like MOL hacks which were removed recently. From QEMU point
>>> of view, the most natural way of handling this would be hypervisor
>>> implemented in the guest side (for example BIOS). Then the hypervisor
>>> would use normal IO (or virtio) to communicate with the host. If
>>> inside QEMU, the interface of the hypervisor to the devices needs some
>>> thought. We'd like to avoid ugly interfaces like vmmouse where a
>>> device probes CPU registers directly or spaghetti interfaces like
>>> APIC.
>>>
>>
>> I really don't follow what you're saying here.  Running the hypervisor
>> in the guest, rather than emulating its effect in qemu seems like an
>> awful lot of complexity for no clear reason.
>>
>
> In KVM for x86, instead of using a secondary interface (like
> vmmcall/vmcall), we do all of our paravirtualization using native hardware
> interfaces that we can trap (PIO/MMIO).
>
> IIUC, on Power, trapping MMIO is not possible due to the MMU mode that is
> currently used (PFs are delivered directly to the guest).
>
> So it's not really possible to switch from using hypercalls to using MMIO.
>
> What I would suggest is modelling hypercalls as another I/O address space
> for the processor.  So instead of having a function pointer in the CPUState,
> introduce a:
>
> typedef void (HypercallFunc)(CPUState *env, void *opaque);
>
> /* register a hypercall handler */
> void register_hypercall(target_ulong index, HypercallFunc *handler, void
> *opaque);
> void unregister_hypercall(target_ulong index);
>
> /* dispatch a hypercall */
> void cpu_hypercall(CPUState *env, target_ulong index);
>
> This interface could also be used to implement hypercall based interfaces on
> s390 and x86.
>
> The arguments will have to be extracted from the CPU state but I don't think
> we'll really ever have common hypercall implementations anyway so that's not
> a huge problem.

Nice idea. Then the part handling CPUState probably should belong to
target-ppc/ rather than hw/.
Anthony Liguori Feb. 13, 2011, 7:32 p.m. UTC | #29
On 02/13/2011 12:29 PM, Blue Swirl wrote:
> On Sun, Feb 13, 2011 at 5:08 PM, Anthony Liguori<anthony@codemonkey.ws>  wrote:
>    
>>
>> In KVM for x86, instead of using a secondary interface (like
>> vmmcall/vmcall), we do all of our paravirtualization using native hardware
>> interfaces that we can trap (PIO/MMIO).
>>
>> IIUC, on Power, trapping MMIO is not possible due to the MMU mode that is
>> currently used (PFs are delivered directly to the guest).
>>
>> So it's not really possible to switch from using hypercalls to using MMIO.
>>
>> What I would suggest is modelling hypercalls as another I/O address space
>> for the processor.  So instead of having a function pointer in the CPUState,
>> introduce a:
>>
>> typedef void (HypercallFunc)(CPUState *env, void *opaque);
>>
>> /* register a hypercall handler */
>> void register_hypercall(target_ulong index, HypercallFunc *handler, void
>> *opaque);
>> void unregister_hypercall(target_ulong index);
>>
>> /* dispatch a hypercall */
>> void cpu_hypercall(CPUState *env, target_ulong index);
>>
>> This interface could also be used to implement hypercall based interfaces on
>> s390 and x86.
>>
>> The arguments will have to be extracted from the CPU state but I don't think
>> we'll really ever have common hypercall implementations anyway so that's not
>> a huge problem.
>>      
> Nice idea. Then the part handling CPUState probably should belong to
> target-ppc/ rather than hw/.
>    

Would be nice to have the target-ppc/ hypercall handlers call into a 
higher level VIO interface that didn't deal directly with CPUState.  The 
actual hardware emulation would then be implemented in hw/ and would not 
be compiled for a specific target.  That helps avoid future sloppiness.

Regards,

Anthony Liguori
David Gibson Feb. 13, 2011, 11:30 p.m. UTC | #30
On Sun, Feb 13, 2011 at 09:08:22AM -0600, Anthony Liguori wrote:
> On 02/13/2011 05:12 AM, David Gibson wrote:
> >On Sun, Feb 13, 2011 at 10:08:23AM +0200, Blue Swirl wrote:
> >>On Sun, Feb 13, 2011 at 1:15 AM, Benjamin Herrenschmidt
[snip]
> In KVM for x86, instead of using a secondary interface (like
> vmmcall/vmcall), we do all of our paravirtualization using native
> hardware interfaces that we can trap (PIO/MMIO).
> 
> IIUC, on Power, trapping MMIO is not possible due to the MMU mode
> that is currently used (PFs are delivered directly to the guest).
> 
> So it's not really possible to switch from using hypercalls to using MMIO.

That's correct.

> What I would suggest is modelling hypercalls as another I/O address
> space for the processor.  So instead of having a function pointer in
> the CPUState, introduce a:
> 
> typedef void (HypercallFunc)(CPUState *env, void *opaque);
> 
> /* register a hypercall handler */
> void register_hypercall(target_ulong index, HypercallFunc *handler,
> void *opaque);
> void unregister_hypercall(target_ulong index);
> 
> /* dispatch a hypercall */
> void cpu_hypercall(CPUState *env, target_ulong index);

Well, I can certainly implement dynamic registration - in fact I've
done that, I just need to fold it into the earlier part of the patch
series.

But the only "address" we have for this hypercall address space is the
hypercall number, and it's not architected where that will be
supplied.  So we'd still need a per-platform hook to extract the
index from the CPUState.

> This interface could also be used to implement hypercall based
> interfaces on s390 and x86.
> 
> The arguments will have to be extracted from the CPU state but I
> don't think we'll really ever have common hypercall implementations
> anyway so that's not a huge problem.
> 
> >>on real HW?
> >The interface already exists on real HW.  It's described in the PAPR
> >document we keep mentioning.
> 
> Another thing to note is that the hypercall based I/O devices the
> interfaces that the current Power hypervisor uses so implementing
> this interface is necessary to support existing guests.
David Gibson Feb. 13, 2011, 11:33 p.m. UTC | #31
On Sun, Feb 13, 2011 at 08:29:05PM +0200, Blue Swirl wrote:
> On Sun, Feb 13, 2011 at 5:08 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> > On 02/13/2011 05:12 AM, David Gibson wrote:
[snip]
> > The arguments will have to be extracted from the CPU state but I don't think
> > we'll really ever have common hypercall implementations anyway so that's not
> > a huge problem.
> 
> Nice idea. Then the part handling CPUState probably should belong to
> target-ppc/ rather than hw/.

Doesn't work.  Different hypervisors may have arguments - even the
hcall number itself - arranged differently in the registers.  My
earlier drafts had this in target-ppc/; I moved it to hw/ for a
reason.
diff mbox

Patch

diff --git a/Makefile.target b/Makefile.target
index e0796ba..fe232da 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -232,7 +232,8 @@  obj-ppc-y += ppc_oldworld.o
 # NewWorld PowerMac
 obj-ppc-y += ppc_newworld.o
 # IBM pSeries (sPAPR)
-obj-ppc-y += spapr.o spapr_hcall.o
+obj-ppc-y += spapr.o spapr_hcall.o spapr_vio.o
+obj-ppc-y += spapr_vty.o
 # PowerPC 4xx boards
 obj-ppc-y += ppc4xx_devs.o ppc4xx_pci.o ppc405_uc.o ppc405_boards.o
 obj-ppc-y += ppc440.o ppc440_bamboo.o
diff --git a/hw/spapr.c b/hw/spapr.c
index 8aca4e0..da61061 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -37,6 +37,7 @@ 
 #include "net.h"
 #include "blockdev.h"
 #include "hw/spapr.h"
+#include "hw/spapr_vio.h"
 
 #include <libfdt.h>
 
@@ -49,6 +50,7 @@ 
 
 static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
                               const char *cpu_model, CPUState *envs[],
+                              sPAPREnvironment *spapr,
                               target_phys_addr_t initrd_base,
                               target_phys_addr_t initrd_size,
                               const char *kernel_cmdline)
@@ -59,6 +61,7 @@  static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
     uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
     int i;
     char *modelname;
+    int ret;
 
 #define _FDT(exp) \
     do { \
@@ -151,9 +154,28 @@  static void *spapr_create_fdt(int *fdt_size, ram_addr_t ramsize,
 
     _FDT((fdt_end_node(fdt)));
 
+    /* vdevice */
+    _FDT((fdt_begin_node(fdt, "vdevice")));
+
+    _FDT((fdt_property_string(fdt, "device_type", "vdevice")));
+    _FDT((fdt_property_string(fdt, "compatible", "IBM,vdevice")));
+    _FDT((fdt_property_cell(fdt, "#address-cells", 0x1)));
+    _FDT((fdt_property_cell(fdt, "#size-cells", 0x0)));
+    
+    _FDT((fdt_end_node(fdt)));
+
     _FDT((fdt_end_node(fdt))); /* close root node */
     _FDT((fdt_finish(fdt)));
 
+    /* re-expand to allow for further tweaks */
+    _FDT((fdt_open_into(fdt, fdt, FDT_MAX_SIZE)));
+
+    ret = spapr_populate_vdevice(spapr->vio_bus, fdt);
+    if (ret < 0)
+        fprintf(stderr, "couldn't setup vio devices in fdt\n");
+
+    _FDT((fdt_pack(fdt)));
+
     if (fdt_size)
         *fdt_size = fdt_totalsize(fdt);
 
@@ -211,6 +233,12 @@  static void ppc_spapr_init (ram_addr_t ram_size,
     ram_offset = qemu_ram_alloc(NULL, "ppc_spapr.ram", ram_size);
     cpu_register_physical_memory(0, ram_size, ram_offset);
 
+    spapr->vio_bus = spapr_vio_bus_init();
+
+    for (i = 0; i < MAX_SERIAL_PORTS; i++)
+        if (serial_hds[i])
+            spapr_vty_create(spapr->vio_bus, i, serial_hds[i]);
+
     if (kernel_filename) {
         uint64_t lowaddr = 0;
 
@@ -242,7 +270,7 @@  static void ppc_spapr_init (ram_addr_t ram_size,
         }
 
         /* load fdt */
-        fdt = spapr_create_fdt(&fdt_size, ram_size, cpu_model, &env,
+        fdt = spapr_create_fdt(&fdt_size, ram_size, cpu_model, &env, spapr,
                                initrd_base, initrd_size,
                                kernel_cmdline);
         if (!fdt) {
@@ -267,6 +295,7 @@  static QEMUMachine spapr_machine = {
     .desc = "pSeries Logical Partition (PAPR compliant)",
     .init = ppc_spapr_init,
     .max_cpus = 1,
+    .no_parallel = 1,
     .no_vga = 1,
     .no_parallel = 1,
 };
diff --git a/hw/spapr.h b/hw/spapr.h
index dae9617..168511f 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -1,7 +1,10 @@ 
 #if !defined (__HW_SPAPR_H__)
 #define __HW_SPAPR_H__
 
+struct VIOsPAPRBus;
+
 typedef struct sPAPREnvironment {
+    struct VIOsPAPRBus *vio_bus;
 } sPAPREnvironment;
 
 #define H_SUCCESS         0
@@ -237,4 +240,11 @@  typedef struct sPAPREnvironment {
 target_ulong spapr_hypercall(CPUState *env, sPAPREnvironment *spapr,
                              target_ulong token, target_ulong *args);
 
+target_ulong h_put_term_char(sPAPREnvironment *spapr,
+                             target_ulong termno, target_ulong len,
+                             target_ulong char0_7, target_ulong char8_15);
+target_ulong h_get_term_char(sPAPREnvironment *spapr,
+                             target_ulong termno, target_ulong *len,
+                             target_ulong *char0_7, target_ulong *char8_15);
+
 #endif /* !defined (__HW_SPAPR_H__) */
diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c
index c99c345..e2ed9cf 100644
--- a/hw/spapr_hcall.c
+++ b/hw/spapr_hcall.c
@@ -3,19 +3,6 @@ 
 #include "qemu-char.h"
 #include "hw/spapr.h"
 
-static target_ulong h_put_term_char(target_ulong termno, target_ulong len,
-                                    target_ulong char0_7, target_ulong char8_15)
-{
-    uint8_t buf[16];
-
-    *((uint64_t *)buf) = cpu_to_be64(char0_7);
-    *((uint64_t *)buf + 1) = cpu_to_be64(char8_15);
-
-    qemu_chr_write(serial_hds[0], buf, len);
-
-    return 0;
-}
-
 target_ulong spapr_hypercall(CPUState *env, sPAPREnvironment *spapr,
                              target_ulong token, target_ulong *args)
 {
@@ -29,7 +16,11 @@  target_ulong spapr_hypercall(CPUState *env, sPAPREnvironment *spapr,
 
     switch (token) {
     case H_PUT_TERM_CHAR:
-        r = h_put_term_char(args[0], args[1], args[2], args[3]);
+        r = h_put_term_char(spapr, args[0], args[1], args[2], args[3]);
+        break;
+
+    case H_GET_TERM_CHAR:
+        r = h_get_term_char(spapr, args[0], &args[0], &args[1], &args[2]);
         break;
 
     default:
diff --git a/hw/spapr_vio.c b/hw/spapr_vio.c
new file mode 100644
index 0000000..d9c7292
--- /dev/null
+++ b/hw/spapr_vio.c
@@ -0,0 +1,191 @@ 
+/*
+ * QEMU sPAPR VIO code
+ *
+ * Copyright (c) 2010 David Gibson, IBM Corporation <david@gibson.dropbear.id.au>
+ * Based on the s390 virtio bus code:
+ * Copyright (c) 2009 Alexander Graf <agraf@suse.de>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "hw.h"
+#include "sysemu.h"
+#include "boards.h"
+#include "monitor.h"
+#include "loader.h"
+#include "elf.h"
+#include "hw/sysbus.h"
+#include "kvm.h"
+#include "device_tree.h"
+
+#include "hw/spapr.h"
+#include "hw/spapr_vio.h"
+
+#ifdef CONFIG_FDT
+#include <libfdt.h>
+#endif /* CONFIG_FDT */
+
+/* #define DEBUG_SPAPR */
+
+#ifdef DEBUG_SPAPR
+#define dprintf(fmt, ...) \
+    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define dprintf(fmt, ...) \
+    do { } while (0)
+#endif
+
+struct BusInfo spapr_vio_bus_info = {
+    .name       = "spapr-vio",
+    .size       = sizeof(VIOsPAPRBus),
+};
+
+VIOsPAPRDevice *spapr_vio_find_by_reg(VIOsPAPRBus *bus, uint32_t reg)
+{
+    DeviceState *qdev;
+    VIOsPAPRDevice *dev = NULL;
+
+    QLIST_FOREACH(qdev, &bus->bus.children, sibling) {
+        dev = (VIOsPAPRDevice *)qdev;
+        if (dev->reg == reg)
+            break;
+    }
+
+    return dev;
+}
+
+VIOsPAPRBus *spapr_vio_bus_init(void)
+{
+    VIOsPAPRBus *bus;
+    BusState *_bus;
+    DeviceState *dev;
+
+    /* Create bridge device */
+    dev = qdev_create(NULL, "spapr-vio-bridge");
+    qdev_init_nofail(dev);
+
+    /* Create bus on bridge device */
+
+    _bus = qbus_create(&spapr_vio_bus_info, dev, "spapr-vio");
+    bus = DO_UPCAST(VIOsPAPRBus, bus, _bus);
+
+    return bus;
+}
+
+#ifdef CONFIG_FDT
+static int vio_make_devnode(VIOsPAPRDevice *dev,
+                            void *fdt)
+{
+    VIOsPAPRDeviceInfo *info = (VIOsPAPRDeviceInfo *)dev->qdev.info;
+    int vdevice_off, node_off;
+    int ret;
+
+    vdevice_off = fdt_path_offset(fdt, "/vdevice");
+    if (vdevice_off < 0)
+        return vdevice_off;
+
+    node_off = fdt_add_subnode(fdt, vdevice_off, dev->qdev.id);
+    if (node_off < 0)
+        return node_off;
+
+    ret = fdt_setprop_cell(fdt, node_off, "reg", dev->reg);
+    if (ret < 0)
+        return ret;
+
+    if (info->dt_type) {
+        ret = fdt_setprop_string(fdt, node_off, "device_type",
+                                 info->dt_type);
+        if (ret < 0)
+            return ret;
+    }
+
+    if (info->dt_compatible) {
+        ret = fdt_setprop_string(fdt, node_off, "compatible",
+                                 info->dt_compatible);
+        if (ret < 0)
+            return ret;
+    }
+
+    if (info->devnode) {
+        ret = (info->devnode)(dev, fdt, node_off);
+        if (ret < 0)
+            return ret;
+    }
+
+    return node_off;
+}
+#endif /* CONFIG_FDT */
+
+static int spapr_vio_busdev_init(DeviceState *dev, DeviceInfo *info)
+{
+    VIOsPAPRDeviceInfo *_info = (VIOsPAPRDeviceInfo *)info;
+    VIOsPAPRDevice *_dev = (VIOsPAPRDevice *)dev;
+    char *id;
+
+    if (asprintf(&id, "%s@%x", _info->dt_name, _dev->reg) < 0)
+        return -1;
+
+    _dev->qdev.id = id;
+
+    return _info->init(_dev);
+}
+
+void spapr_vio_bus_register_withprop(VIOsPAPRDeviceInfo *info)
+{
+    info->qdev.init = spapr_vio_busdev_init;
+    info->qdev.bus_info = &spapr_vio_bus_info;
+
+    assert(info->qdev.size >= sizeof(VIOsPAPRDevice));
+    qdev_register(&info->qdev);
+}
+
+static int spapr_vio_bridge_init(SysBusDevice *dev)
+{
+    /* nothing */
+    return 0;
+}
+
+static SysBusDeviceInfo spapr_vio_bridge_info = {
+    .init = spapr_vio_bridge_init,
+    .qdev.name  = "spapr-vio-bridge",
+    .qdev.size  = sizeof(SysBusDevice),
+    .qdev.no_user = 1,
+};
+
+static void spapr_vio_register_devices(void)
+{
+    sysbus_register_withprop(&spapr_vio_bridge_info);
+}
+
+device_init(spapr_vio_register_devices)
+
+#ifdef CONFIG_FDT
+
+int spapr_populate_vdevice(VIOsPAPRBus *bus, void *fdt)
+{
+    DeviceState *qdev;
+    int ret = 0;
+
+    QLIST_FOREACH(qdev, &bus->bus.children, sibling) {
+        VIOsPAPRDevice *dev = (VIOsPAPRDevice *)qdev;
+
+        ret = vio_make_devnode(dev, fdt);
+
+        if (ret < 0)
+            return ret;
+    }
+    
+    return 0;
+}
+#endif /* CONFIG_FDT */
diff --git a/hw/spapr_vio.h b/hw/spapr_vio.h
new file mode 100644
index 0000000..fb5e301
--- /dev/null
+++ b/hw/spapr_vio.h
@@ -0,0 +1,49 @@ 
+#ifndef _HW_SPAPR_VIO_H
+#define _HW_SPAPR_VIO_H
+/*
+ * QEMU sPAPR VIO bus definitions
+ *
+ * Copyright (c) 2010 David Gibson, IBM Corporation <david@gibson.dropbear.id.au>
+ * Based on the s390 virtio bus definitions:
+ * Copyright (c) 2009 Alexander Graf <agraf@suse.de>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+typedef struct VIOsPAPRDevice {
+    DeviceState qdev;
+    uint32_t reg;
+} VIOsPAPRDevice;
+
+typedef struct VIOsPAPRBus {
+    BusState bus;
+} VIOsPAPRBus;
+
+typedef struct {
+    DeviceInfo qdev;
+    const char *dt_name, *dt_type, *dt_compatible;
+    int (*init)(VIOsPAPRDevice *dev);
+    int (*devnode)(VIOsPAPRDevice *dev, void *fdt, int node_off);
+} VIOsPAPRDeviceInfo;
+
+extern VIOsPAPRBus *spapr_vio_bus_init(void);
+extern VIOsPAPRDevice *spapr_vio_find_by_reg(VIOsPAPRBus *bus, uint32_t reg);
+extern void spapr_vio_bus_register_withprop(VIOsPAPRDeviceInfo *info);
+extern int spapr_populate_vdevice(VIOsPAPRBus *bus, void *fdt);
+
+void vty_putchars(VIOsPAPRDevice *sdev, uint8_t *buf, int len);
+void spapr_vty_create(VIOsPAPRBus *bus,
+                      uint32_t reg, CharDriverState *chardev);
+
+#endif /* _HW_SPAPR_VIO_H */
diff --git a/hw/spapr_vty.c b/hw/spapr_vty.c
new file mode 100644
index 0000000..9a2dc0b
--- /dev/null
+++ b/hw/spapr_vty.c
@@ -0,0 +1,132 @@ 
+#include "qdev.h"
+#include "qemu-char.h"
+#include "hw/spapr.h"
+#include "hw/spapr_vio.h"
+
+#define VTERM_BUFSIZE   16
+
+typedef struct VIOsPAPRVTYDevice {
+    VIOsPAPRDevice sdev;
+    CharDriverState *chardev;
+    uint32_t in, out;
+    uint8_t buf[VTERM_BUFSIZE];
+} VIOsPAPRVTYDevice;
+
+static int vty_can_receive(void *opaque)
+{
+    VIOsPAPRVTYDevice *dev = (VIOsPAPRVTYDevice *)opaque;
+
+    return (dev->in - dev->out) < VTERM_BUFSIZE;
+}
+
+static void vty_receive(void *opaque, const uint8_t *buf, int size)
+{
+    VIOsPAPRVTYDevice *dev = (VIOsPAPRVTYDevice *)opaque;
+    int i;
+
+    for (i = 0; i < size; i++) {
+        assert((dev->in - dev->out) < VTERM_BUFSIZE);
+        dev->buf[dev->in++ % VTERM_BUFSIZE] = buf[i];
+    }
+}
+
+static int vty_getchars(VIOsPAPRDevice *sdev, uint8_t *buf, int max)
+{
+    VIOsPAPRVTYDevice *dev = (VIOsPAPRVTYDevice *)sdev;
+    int n = 0;
+
+    while ((n < max) && (dev->out != dev->in))
+        buf[n++] = dev->buf[dev->out++ % VTERM_BUFSIZE];
+
+    return n;
+}
+
+void vty_putchars(VIOsPAPRDevice *sdev, uint8_t *buf, int len)
+{
+    VIOsPAPRVTYDevice *dev = (VIOsPAPRVTYDevice *)sdev;
+
+    /* FIXME: should check the qemu_chr_write() return value */
+    qemu_chr_write(dev->chardev, buf, len);
+}
+
+static int spapr_vty_init(VIOsPAPRDevice *sdev)
+{
+    VIOsPAPRVTYDevice *dev = (VIOsPAPRVTYDevice *)sdev;
+
+    qemu_chr_add_handlers(dev->chardev, vty_can_receive,
+                          vty_receive, NULL, dev);
+
+    return 0;
+}
+
+target_ulong h_put_term_char(sPAPREnvironment *spapr,
+                             target_ulong termno, target_ulong len,
+                             target_ulong char0_7, target_ulong char8_15)
+{
+    VIOsPAPRDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, termno);
+    uint8_t buf[16];
+
+    if (!sdev)
+        return H_PARAMETER;
+
+    if (len > 16)
+        return H_PARAMETER;
+
+    *((uint64_t *)buf) = cpu_to_be64(char0_7);
+    *((uint64_t *)buf + 1) = cpu_to_be64(char8_15);
+
+    vty_putchars(sdev, buf, len);
+
+    return 0;
+}
+
+target_ulong h_get_term_char(sPAPREnvironment *spapr,
+                             target_ulong termno, target_ulong *len,
+                             target_ulong *char0_7, target_ulong *char8_15)
+{
+    VIOsPAPRDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, termno);
+    uint8_t buf[16];
+
+    if (!sdev)
+        return H_PARAMETER;
+
+    *len = vty_getchars(sdev, buf, sizeof(buf));
+    if (*len < 16)
+        memset(buf + *len, 0, 16 - *len);
+
+    *char0_7 = be64_to_cpu(*((uint64_t *)buf));
+    *char8_15 = be64_to_cpu(*((uint64_t *)buf + 1));
+
+    return H_SUCCESS;
+}
+
+void spapr_vty_create(VIOsPAPRBus *bus,
+                      uint32_t reg, CharDriverState *chardev)
+{
+    DeviceState *dev;
+
+    dev = qdev_create(&bus->bus, "spapr-vty");
+    qdev_prop_set_uint32(dev, "reg", reg);
+    qdev_prop_set_chr(dev, "chardev", chardev);
+    qdev_init_nofail(dev);
+}
+
+static VIOsPAPRDeviceInfo spapr_vty = {
+    .init = spapr_vty_init,
+    .dt_name = "vty",
+    .dt_type = "serial",
+    .dt_compatible = "hvterm1",
+    .qdev.name = "spapr-vty",
+    .qdev.size = sizeof(VIOsPAPRVTYDevice),
+    .qdev.props = (Property[]) {
+        DEFINE_PROP_UINT32("reg", VIOsPAPRDevice, reg, 0),
+        DEFINE_PROP_CHR("chardev", VIOsPAPRVTYDevice, chardev),
+        DEFINE_PROP_END_OF_LIST(),
+    },
+};
+
+static void spapr_vty_register(void)
+{
+    spapr_vio_bus_register_withprop(&spapr_vty);
+}
+device_init(spapr_vty_register);