diff mbox series

[v2,4/4] hw/i386: Introduce the microvm machine type

Message ID 20190701144705.102615-5-slp@redhat.com
State New
Headers show
Series Introduce the microvm machine type | expand

Commit Message

Sergio Lopez July 1, 2019, 2:47 p.m. UTC
Microvm is a machine type inspired by both NEMU and Firecracker, and
constructed after the machine model implemented by the latter.

Its main purpose is to provide users with a KVM-only machine type with fast
boot times, minimal attack surface (measured as the number of IO ports
and MMIO regions exposed to the Guest) and small footprint (especially
when combined with the ongoing QEMU modularization effort).

Normally, other than the device support provided by KVM itself,
microvm only supports virtio-mmio devices. Microvm also includes a
legacy mode, which adds an ISA bus with a 16550A serial port, useful
for being able to see the early boot kernel messages.

Microvm only supports booting PVH-enabled Linux ELF images. Booting
other PVH-enabled kernels may be possible, but due to the lack of ACPI
and firmware, we're relying on the command line for specifying the
location of the virtio-mmio transports. If there's an interest in
using this machine type with other kernels, we'll try to find some
kind of middle ground solution.

Signed-off-by: Sergio Lopez <slp@redhat.com>
---
 default-configs/i386-softmmu.mak |   1 +
 hw/i386/Kconfig                  |   4 +
 hw/i386/Makefile.objs            |   1 +
 hw/i386/microvm.c                | 500 +++++++++++++++++++++++++++++++
 include/hw/i386/microvm.h        |  77 +++++
 5 files changed, 583 insertions(+)
 create mode 100644 hw/i386/microvm.c
 create mode 100644 include/hw/i386/microvm.h

Comments

Gerd Hoffmann July 2, 2019, 8:17 a.m. UTC | #1
Hi,

> Microvm only supports booting PVH-enabled Linux ELF images. Booting
> other PVH-enabled kernels may be possible, but due to the lack of ACPI
> and firmware, we're relying on the command line for specifying the
> location of the virtio-mmio transports. If there's an interest on
> using this machine type with other kernels, we'll try to find some
> kind of middle ground solution.

Can we get rid of the kernel command line hacking please?
The virtio-mmio devices should be discoverable somehow.

Device tree (as suggested by paolo) would work.
Custom acpi device (similar to fw_cfg) is another option.
I'd tend to pick acpi, I wouldn't be surprised if we'll
need acpi anyway at some point.

Maybe even do both, then switch at runtime depending on -no-acpi
(similar to arm/aarch64).

cheers,
  Gerd
Stefano Garzarella July 2, 2019, 8:19 a.m. UTC | #2
On Mon, Jul 01, 2019 at 04:47:05PM +0200, Sergio Lopez wrote:
> Microvm is a machine type inspired by both NEMU and Firecracker, and
> constructed after the machine model implemented by the latter.
> 
> It's main purpose is providing users a KVM-only machine type with fast
> boot times, minimal attack surface (measured as the number of IO ports
> and MMIO regions exposed to the Guest) and small footprint (specially
> when combined with the ongoing QEMU modularization effort).
> 
> Normally, other than the device support provided by KVM itself,
> microvm only supports virtio-mmio devices. Microvm also includes a
> legacy mode, which adds an ISA bus with a 16550A serial port, useful
> for being able to see the early boot kernel messages.
> 
> Microvm only supports booting PVH-enabled Linux ELF images. Booting
> other PVH-enabled kernels may be possible, but due to the lack of ACPI
> and firmware, we're relying on the command line for specifying the
> location of the virtio-mmio transports. If there's an interest on
> using this machine type with other kernels, we'll try to find some
> kind of middle ground solution.
> 
> Signed-off-by: Sergio Lopez <slp@redhat.com>
> ---
>  default-configs/i386-softmmu.mak |   1 +
>  hw/i386/Kconfig                  |   4 +
>  hw/i386/Makefile.objs            |   1 +
>  hw/i386/microvm.c                | 500 +++++++++++++++++++++++++++++++
>  include/hw/i386/microvm.h        |  77 +++++
>  5 files changed, 583 insertions(+)
>  create mode 100644 hw/i386/microvm.c
>  create mode 100644 include/hw/i386/microvm.h
> 
> diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
> index cd5ea391e8..338f07420f 100644
> --- a/default-configs/i386-softmmu.mak
> +++ b/default-configs/i386-softmmu.mak
> @@ -26,3 +26,4 @@ CONFIG_ISAPC=y
>  CONFIG_I440FX=y
>  CONFIG_Q35=y
>  CONFIG_ACPI_PCI=y
> +CONFIG_MICROVM=y
> diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
> index 9817888216..94c565d8db 100644
> --- a/hw/i386/Kconfig
> +++ b/hw/i386/Kconfig
> @@ -87,6 +87,10 @@ config Q35
>      select VMMOUSE
>      select FW_CFG_DMA
>  
> +config MICROVM
> +    bool
> +    select VIRTIO_MMIO
> +
>  config VTD
>      bool
>  
> diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
> index c5f20bbd72..7bffca413e 100644
> --- a/hw/i386/Makefile.objs
> +++ b/hw/i386/Makefile.objs
> @@ -4,6 +4,7 @@ obj-y += pvh.o
>  obj-y += pc.o
>  obj-$(CONFIG_I440FX) += pc_piix.o
>  obj-$(CONFIG_Q35) += pc_q35.o
> +obj-$(CONFIG_MICROVM) += mptable.o microvm.o
>  obj-y += fw_cfg.o pc_sysfw.o
>  obj-y += x86-iommu.o
>  obj-$(CONFIG_VTD) += intel_iommu.o
> diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
> new file mode 100644
> index 0000000000..8b5efe9e45
> --- /dev/null
> +++ b/hw/i386/microvm.c
> @@ -0,0 +1,500 @@
> +/*
> + * Copyright (c) 2018 Intel Corporation
> + * Copyright (c) 2019 Red Hat, Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2 or later, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/error-report.h"
> +#include "qemu/cutils.h"
> +#include "qapi/error.h"
> +#include "qapi/visitor.h"
> +#include "sysemu/sysemu.h"
> +#include "sysemu/cpus.h"
> +#include "sysemu/numa.h"
> +
> +#include "hw/loader.h"
> +#include "hw/nmi.h"
> +#include "hw/kvm/clock.h"
> +#include "hw/i386/microvm.h"
> +#include "hw/i386/pc.h"
> +#include "target/i386/cpu.h"
> +#include "hw/timer/i8254.h"
> +#include "hw/char/serial.h"
> +#include "hw/i386/topology.h"
> +#include "hw/virtio/virtio-mmio.h"
> +#include "hw/i386/mptable.h"
> +
> +#include "cpu.h"
> +#include "elf.h"
> +#include "pvh.h"
> +#include "kvm_i386.h"
> +#include "hw/xen/start_info.h"
> +
> +static void microvm_gsi_handler(void *opaque, int n, int level)
> +{
> +    qemu_irq *ioapic_irq = opaque;
> +
> +    qemu_set_irq(ioapic_irq[n], level);
> +}
> +
> +static void microvm_legacy_init(MicrovmMachineState *mms)
> +{
> +    ISABus *isa_bus;
> +    GSIState *gsi_state;
> +    qemu_irq *i8259;
> +    int i;
> +
> +    assert(kvm_irqchip_in_kernel());
> +    gsi_state = g_malloc0(sizeof(*gsi_state));
> +    mms->gsi = qemu_allocate_irqs(gsi_handler, gsi_state, GSI_NUM_PINS);
> +
> +    isa_bus = isa_bus_new(NULL, get_system_memory(), get_system_io(),
> +                          &error_abort);
> +    isa_bus_irqs(isa_bus, mms->gsi);
> +
> +    assert(kvm_pic_in_kernel());
> +    i8259 = kvm_i8259_init(isa_bus);
> +
> +    for (i = 0; i < ISA_NUM_IRQS; i++) {
> +        gsi_state->i8259_irq[i] = i8259[i];
> +    }
> +
> +    kvm_pit_init(isa_bus, 0x40);
> +
> +    for (i = 0; i < VIRTIO_NUM_TRANSPORTS; i++) {
> +        int nirq = VIRTIO_IRQ_BASE + i;
> +        ISADevice *isadev = isa_create(isa_bus, TYPE_ISA_SERIAL);
> +        qemu_irq mmio_irq;
> +
> +        isa_init_irq(isadev, &mmio_irq, nirq);
> +        sysbus_create_simple("virtio-mmio",
> +                             VIRTIO_MMIO_BASE + i * 512,
> +                             mms->gsi[VIRTIO_IRQ_BASE + i]);
> +    }
> +
> +    g_free(i8259);
> +
> +    serial_hds_isa_init(isa_bus, 0, 1);
> +}
> +
> +static void microvm_ioapic_init(MicrovmMachineState *mms)
> +{
> +    qemu_irq *ioapic_irq;
> +    DeviceState *ioapic_dev;
> +    SysBusDevice *d;
> +    int i;
> +
> +    assert(kvm_irqchip_in_kernel());
> +    ioapic_irq = g_new0(qemu_irq, IOAPIC_NUM_PINS);
> +    kvm_pc_setup_irq_routing(true);
> +
> +    assert(kvm_ioapic_in_kernel());
> +    ioapic_dev = qdev_create(NULL, "kvm-ioapic");
> +
> +    object_property_add_child(qdev_get_machine(),
> +                              "ioapic", OBJECT(ioapic_dev), NULL);
> +
> +    qdev_init_nofail(ioapic_dev);
> +    d = SYS_BUS_DEVICE(ioapic_dev);
> +    sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS);
> +
> +    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
> +        ioapic_irq[i] = qdev_get_gpio_in(ioapic_dev, i);
> +    }
> +
> +    mms->gsi = qemu_allocate_irqs(microvm_gsi_handler,
> +                                  ioapic_irq, IOAPIC_NUM_PINS);
> +
> +    for (i = 0; i < VIRTIO_NUM_TRANSPORTS; i++) {
> +        sysbus_create_simple("virtio-mmio",
> +                             VIRTIO_MMIO_BASE + i * 512,
> +                             mms->gsi[VIRTIO_IRQ_BASE + i]);
> +    }
> +}
> +
> +static void microvm_memory_init(MicrovmMachineState *mms)
> +{
> +    MachineState *machine = MACHINE(mms);
> +    MemoryRegion *ram, *ram_below_4g, *ram_above_4g;
> +    MemoryRegion *system_memory = get_system_memory();
> +
> +    if (machine->ram_size > MICROVM_MAX_BELOW_4G) {
> +        mms->above_4g_mem_size = machine->ram_size - MICROVM_MAX_BELOW_4G;
> +        mms->below_4g_mem_size = MICROVM_MAX_BELOW_4G;
> +    } else {
> +        mms->above_4g_mem_size = 0;
> +        mms->below_4g_mem_size = machine->ram_size;
> +    }
> +
> +    ram = g_malloc(sizeof(*ram));
> +    memory_region_allocate_system_memory(ram, NULL, "microvm.ram",
> +                                         machine->ram_size);
> +
> +    ram_below_4g = g_malloc(sizeof(*ram_below_4g));
> +    memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
> +                             0, mms->below_4g_mem_size);
> +    memory_region_add_subregion(system_memory, 0, ram_below_4g);
> +
> +    e820_add_entry(0, mms->below_4g_mem_size, E820_RAM);
> +
> +    if (mms->above_4g_mem_size > 0) {
> +        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
> +        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
> +                                 mms->below_4g_mem_size,
> +                                 mms->above_4g_mem_size);
> +        memory_region_add_subregion(system_memory, 0x100000000ULL,
> +                                    ram_above_4g);
> +        e820_add_entry(0x100000000ULL, mms->above_4g_mem_size, E820_RAM);
> +    }
> +}
> +
> +static void microvm_cpus_init(const char *typename, Error **errp)
> +{
> +    int i;
> +
> +    for (i = 0; i < smp_cpus; i++) {
> +        Object *cpu = NULL;
> +        Error *local_err = NULL;
> +
> +        cpu = object_new(typename);
> +
> +        object_property_set_uint(cpu, i, "apic-id", &local_err);
> +        object_property_set_bool(cpu, true, "realized", &local_err);
> +
> +        object_unref(cpu);
> +        error_propagate(errp, local_err);
> +    }
> +}
> +
> +static void microvm_machine_state_init(MachineState *machine)
> +{
> +    MicrovmMachineState *mms = MICROVM_MACHINE(machine);
> +    Error *local_err = NULL;
> +
> +    if (machine->kernel_filename == NULL) {
> +        error_report("missing kernel image file name, required by microvm");
> +        exit(1);
> +    }

Could it be useful to support initrd as well?

I'm thinking of the possibility for a microvm to use only the initrd,
without a block device.

In this case, Linux expects the initrd address and size in the first
element of the modlist in the 'struct hvm_start_info'.

See pc-bios/optionrom/pvh_main.c

Cheers,
Stefano
Sergio Lopez July 2, 2019, 8:42 a.m. UTC | #3
Gerd Hoffmann <kraxel@redhat.com> writes:

>   Hi,
>
>> Microvm only supports booting PVH-enabled Linux ELF images. Booting
>> other PVH-enabled kernels may be possible, but due to the lack of ACPI
>> and firmware, we're relying on the command line for specifying the
>> location of the virtio-mmio transports. If there's an interest on
>> using this machine type with other kernels, we'll try to find some
>> kind of middle ground solution.
>
> Can we get rid of the kernel command line hacking please?
> The virtio-mmio devices should be discoverable somehow.
>
> Device tree (as suggested by paolo) would work.
> Custom acpi device (simliar to fw_cfg) is another option.
> I'd tend to pick acpi, I wouldn't be surprised if we'll
> need acpi anyway at some point.
>
> Maybe even do both, then switch at runtime depending on -no-acpi
> (simliar to arm/aarch64).

Microvm tries to do things in the cheapest possible way. As I said in the
other email, I'm not opposed to supporting qboot (which will probably imply
ACPI and/or device tree), as long as it's optional, and the "cheap" way is
still present.

Otherwise, let's just drop microvm and stick with Q35 + qboot.

Sergio.
Sergio Lopez July 2, 2019, 8:47 a.m. UTC | #4
Stefano Garzarella <sgarzare@redhat.com> writes:

> On Mon, Jul 01, 2019 at 04:47:05PM +0200, Sergio Lopez wrote:
>> Microvm is a machine type inspired by both NEMU and Firecracker, and
>> constructed after the machine model implemented by the latter.
>> 
>> It's main purpose is providing users a KVM-only machine type with fast
>> boot times, minimal attack surface (measured as the number of IO ports
>> and MMIO regions exposed to the Guest) and small footprint (specially
>> when combined with the ongoing QEMU modularization effort).
>> 
>> Normally, other than the device support provided by KVM itself,
>> microvm only supports virtio-mmio devices. Microvm also includes a
>> legacy mode, which adds an ISA bus with a 16550A serial port, useful
>> for being able to see the early boot kernel messages.
>> 
>> Microvm only supports booting PVH-enabled Linux ELF images. Booting
>> other PVH-enabled kernels may be possible, but due to the lack of ACPI
>> and firmware, we're relying on the command line for specifying the
>> location of the virtio-mmio transports. If there's an interest on
>> using this machine type with other kernels, we'll try to find some
>> kind of middle ground solution.
>> 
>> Signed-off-by: Sergio Lopez <slp@redhat.com>
>> ---
>>  default-configs/i386-softmmu.mak |   1 +
>>  hw/i386/Kconfig                  |   4 +
>>  hw/i386/Makefile.objs            |   1 +
>>  hw/i386/microvm.c                | 500 +++++++++++++++++++++++++++++++
>>  include/hw/i386/microvm.h        |  77 +++++
>>  5 files changed, 583 insertions(+)
>>  create mode 100644 hw/i386/microvm.c
>>  create mode 100644 include/hw/i386/microvm.h
>> 
>> diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
>> index cd5ea391e8..338f07420f 100644
>> --- a/default-configs/i386-softmmu.mak
>> +++ b/default-configs/i386-softmmu.mak
>> @@ -26,3 +26,4 @@ CONFIG_ISAPC=y
>>  CONFIG_I440FX=y
>>  CONFIG_Q35=y
>>  CONFIG_ACPI_PCI=y
>> +CONFIG_MICROVM=y
>> diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
>> index 9817888216..94c565d8db 100644
>> --- a/hw/i386/Kconfig
>> +++ b/hw/i386/Kconfig
>> @@ -87,6 +87,10 @@ config Q35
>>      select VMMOUSE
>>      select FW_CFG_DMA
>>  
>> +config MICROVM
>> +    bool
>> +    select VIRTIO_MMIO
>> +
>>  config VTD
>>      bool
>>  
>> diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
>> index c5f20bbd72..7bffca413e 100644
>> --- a/hw/i386/Makefile.objs
>> +++ b/hw/i386/Makefile.objs
>> @@ -4,6 +4,7 @@ obj-y += pvh.o
>>  obj-y += pc.o
>>  obj-$(CONFIG_I440FX) += pc_piix.o
>>  obj-$(CONFIG_Q35) += pc_q35.o
>> +obj-$(CONFIG_MICROVM) += mptable.o microvm.o
>>  obj-y += fw_cfg.o pc_sysfw.o
>>  obj-y += x86-iommu.o
>>  obj-$(CONFIG_VTD) += intel_iommu.o
>> diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
>> new file mode 100644
>> index 0000000000..8b5efe9e45
>> --- /dev/null
>> +++ b/hw/i386/microvm.c
>> @@ -0,0 +1,500 @@
>> +/*
>> + * Copyright (c) 2018 Intel Corporation
>> + * Copyright (c) 2019 Red Hat, Inc.
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms and conditions of the GNU General Public License,
>> + * version 2 or later, as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope it will be useful, but WITHOUT
>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
>> + * more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along with
>> + * this program.  If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu/error-report.h"
>> +#include "qemu/cutils.h"
>> +#include "qapi/error.h"
>> +#include "qapi/visitor.h"
>> +#include "sysemu/sysemu.h"
>> +#include "sysemu/cpus.h"
>> +#include "sysemu/numa.h"
>> +
>> +#include "hw/loader.h"
>> +#include "hw/nmi.h"
>> +#include "hw/kvm/clock.h"
>> +#include "hw/i386/microvm.h"
>> +#include "hw/i386/pc.h"
>> +#include "target/i386/cpu.h"
>> +#include "hw/timer/i8254.h"
>> +#include "hw/char/serial.h"
>> +#include "hw/i386/topology.h"
>> +#include "hw/virtio/virtio-mmio.h"
>> +#include "hw/i386/mptable.h"
>> +
>> +#include "cpu.h"
>> +#include "elf.h"
>> +#include "pvh.h"
>> +#include "kvm_i386.h"
>> +#include "hw/xen/start_info.h"
>> +
>> +static void microvm_gsi_handler(void *opaque, int n, int level)
>> +{
>> +    qemu_irq *ioapic_irq = opaque;
>> +
>> +    qemu_set_irq(ioapic_irq[n], level);
>> +}
>> +
>> +static void microvm_legacy_init(MicrovmMachineState *mms)
>> +{
>> +    ISABus *isa_bus;
>> +    GSIState *gsi_state;
>> +    qemu_irq *i8259;
>> +    int i;
>> +
>> +    assert(kvm_irqchip_in_kernel());
>> +    gsi_state = g_malloc0(sizeof(*gsi_state));
>> +    mms->gsi = qemu_allocate_irqs(gsi_handler, gsi_state, GSI_NUM_PINS);
>> +
>> +    isa_bus = isa_bus_new(NULL, get_system_memory(), get_system_io(),
>> +                          &error_abort);
>> +    isa_bus_irqs(isa_bus, mms->gsi);
>> +
>> +    assert(kvm_pic_in_kernel());
>> +    i8259 = kvm_i8259_init(isa_bus);
>> +
>> +    for (i = 0; i < ISA_NUM_IRQS; i++) {
>> +        gsi_state->i8259_irq[i] = i8259[i];
>> +    }
>> +
>> +    kvm_pit_init(isa_bus, 0x40);
>> +
>> +    for (i = 0; i < VIRTIO_NUM_TRANSPORTS; i++) {
>> +        int nirq = VIRTIO_IRQ_BASE + i;
>> +        ISADevice *isadev = isa_create(isa_bus, TYPE_ISA_SERIAL);
>> +        qemu_irq mmio_irq;
>> +
>> +        isa_init_irq(isadev, &mmio_irq, nirq);
>> +        sysbus_create_simple("virtio-mmio",
>> +                             VIRTIO_MMIO_BASE + i * 512,
>> +                             mms->gsi[VIRTIO_IRQ_BASE + i]);
>> +    }
>> +
>> +    g_free(i8259);
>> +
>> +    serial_hds_isa_init(isa_bus, 0, 1);
>> +}
>> +
>> +static void microvm_ioapic_init(MicrovmMachineState *mms)
>> +{
>> +    qemu_irq *ioapic_irq;
>> +    DeviceState *ioapic_dev;
>> +    SysBusDevice *d;
>> +    int i;
>> +
>> +    assert(kvm_irqchip_in_kernel());
>> +    ioapic_irq = g_new0(qemu_irq, IOAPIC_NUM_PINS);
>> +    kvm_pc_setup_irq_routing(true);
>> +
>> +    assert(kvm_ioapic_in_kernel());
>> +    ioapic_dev = qdev_create(NULL, "kvm-ioapic");
>> +
>> +    object_property_add_child(qdev_get_machine(),
>> +                              "ioapic", OBJECT(ioapic_dev), NULL);
>> +
>> +    qdev_init_nofail(ioapic_dev);
>> +    d = SYS_BUS_DEVICE(ioapic_dev);
>> +    sysbus_mmio_map(d, 0, IO_APIC_DEFAULT_ADDRESS);
>> +
>> +    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
>> +        ioapic_irq[i] = qdev_get_gpio_in(ioapic_dev, i);
>> +    }
>> +
>> +    mms->gsi = qemu_allocate_irqs(microvm_gsi_handler,
>> +                                  ioapic_irq, IOAPIC_NUM_PINS);
>> +
>> +    for (i = 0; i < VIRTIO_NUM_TRANSPORTS; i++) {
>> +        sysbus_create_simple("virtio-mmio",
>> +                             VIRTIO_MMIO_BASE + i * 512,
>> +                             mms->gsi[VIRTIO_IRQ_BASE + i]);
>> +    }
>> +}
>> +
>> +static void microvm_memory_init(MicrovmMachineState *mms)
>> +{
>> +    MachineState *machine = MACHINE(mms);
>> +    MemoryRegion *ram, *ram_below_4g, *ram_above_4g;
>> +    MemoryRegion *system_memory = get_system_memory();
>> +
>> +    if (machine->ram_size > MICROVM_MAX_BELOW_4G) {
>> +        mms->above_4g_mem_size = machine->ram_size - MICROVM_MAX_BELOW_4G;
>> +        mms->below_4g_mem_size = MICROVM_MAX_BELOW_4G;
>> +    } else {
>> +        mms->above_4g_mem_size = 0;
>> +        mms->below_4g_mem_size = machine->ram_size;
>> +    }
>> +
>> +    ram = g_malloc(sizeof(*ram));
>> +    memory_region_allocate_system_memory(ram, NULL, "microvm.ram",
>> +                                         machine->ram_size);
>> +
>> +    ram_below_4g = g_malloc(sizeof(*ram_below_4g));
>> +    memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
>> +                             0, mms->below_4g_mem_size);
>> +    memory_region_add_subregion(system_memory, 0, ram_below_4g);
>> +
>> +    e820_add_entry(0, mms->below_4g_mem_size, E820_RAM);
>> +
>> +    if (mms->above_4g_mem_size > 0) {
>> +        ram_above_4g = g_malloc(sizeof(*ram_above_4g));
>> +        memory_region_init_alias(ram_above_4g, NULL, "ram-above-4g", ram,
>> +                                 mms->below_4g_mem_size,
>> +                                 mms->above_4g_mem_size);
>> +        memory_region_add_subregion(system_memory, 0x100000000ULL,
>> +                                    ram_above_4g);
>> +        e820_add_entry(0x100000000ULL, mms->above_4g_mem_size, E820_RAM);
>> +    }
>> +}
>> +
>> +static void microvm_cpus_init(const char *typename, Error **errp)
>> +{
>> +    int i;
>> +
>> +    for (i = 0; i < smp_cpus; i++) {
>> +        Object *cpu = NULL;
>> +        Error *local_err = NULL;
>> +
>> +        cpu = object_new(typename);
>> +
>> +        object_property_set_uint(cpu, i, "apic-id", &local_err);
>> +        object_property_set_bool(cpu, true, "realized", &local_err);
>> +
>> +        object_unref(cpu);
>> +        error_propagate(errp, local_err);
>> +    }
>> +}
>> +
>> +static void microvm_machine_state_init(MachineState *machine)
>> +{
>> +    MicrovmMachineState *mms = MICROVM_MACHINE(machine);
>> +    Error *local_err = NULL;
>> +
>> +    if (machine->kernel_filename == NULL) {
>> +        error_report("missing kernel image file name, required by microvm");
>> +        exit(1);
>> +    }
>
> Could it be useful to support initrd as well?
>
> I'm thinking a possibility to a microvm to use only the initrd without a
> block device.

I agree, thanks for the suggestion. I'll add support for it.

Sergio.

> In this case, Linux expects the initrd address and size in the first
> element of the modlist in the 'struct hvm_start_info'.
>
> See pc-bios/optionrom/pvh_main.c
>
> Cheers,
> Stefano
Gerd Hoffmann July 2, 2019, 10:16 a.m. UTC | #5
Hi,

> > Can we get rid of the kernel command line hacking please?
> > The virtio-mmio devices should be discoverable somehow.
> >
> > Device tree (as suggested by paolo) would work.
> > Custom acpi device (simliar to fw_cfg) is another option.
> > I'd tend to pick acpi, I wouldn't be surprised if we'll
> > need acpi anyway at some point.
> >
> > Maybe even do both, then switch at runtime depending on -no-acpi
> > (simliar to arm/aarch64).
> 
> Microvm tries to do things in the cheapest possible way.

But taking too many shortcuts tends to hurt in the long run.
It also cuts off useful use cases.

I think microvm has more value than just the reduced boot time.
Specifically the reduced attack surface is useful I think, even
beyond container-style workloads.  Being able to boot standard
cloud images (with the cloud image kernel loaded via cloud image
boot loader) in microvm would be useful for example.

So, yes, I want microvm to be designed in a way that it can run
firmware and have it handle the boot process.  For starters just
qboot for fast direct kernel boot, but longer term also seabios
and/or ovmf.

Can look at the seabios side, but probably not before I'm back
from my summer vacation in august.  For seabios a simple & reliable
time source would be quite useful.  Direct kernel boot might be doable
without that, but as soon as any I/O (read from cloud image disk) is
involved a time source is needed.  Right now seabios uses the acpi
pm_timer.  tsc should work too if seabios can figure the frequency
without a calibration loop (invtsc should be enough).  Maybe seabios
needs kvmclock support ...

Is there any way to detect microvm from the guest?  pc/q35 can be
easily detected by looking at the pci host bridge.

Do you have boot time numbers for qboot vs. no-firmware boot?
Is the difference big enough that it makes sense to maintain both?

cheers,
  Gerd
Paolo Bonzini July 2, 2019, 10:37 a.m. UTC | #6
On 02/07/19 10:47, Sergio Lopez wrote:
>> Could it be useful to support initrd as well?
>>
>> I'm thinking a possibility to a microvm to use only the initrd without a
>> block device.
> I agree, thanks for the suggestion. I'll add support for it.

I'd like to take a look at adding firmware support too, so that we get
linuxboot and multiboot for free.  This would also allow boot time
comparisons.

Paolo
Sergio Lopez July 2, 2019, 10:52 a.m. UTC | #7
Gerd Hoffmann <kraxel@redhat.com> writes:

>   Hi,
>
>> > Can we get rid of the kernel command line hacking please?
>> > The virtio-mmio devices should be discoverable somehow.
>> >
>> > Device tree (as suggested by paolo) would work.
>> > Custom acpi device (simliar to fw_cfg) is another option.
>> > I'd tend to pick acpi, I wouldn't be surprised if we'll
>> > need acpi anyway at some point.
>> >
>> > Maybe even do both, then switch at runtime depending on -no-acpi
>> > (simliar to arm/aarch64).
>> 
>> Microvm tries to do things in the cheapest possible way.
>
> But taking too many shortcuts tends to hurt in the long run.
> It also cuts off useful use cases.

Sure, but the consideration about whether there are too many shortcuts,
or just enough of them, is quite subjective. Microvm's code base is
small enough to keep its quirks in check without becoming a
significant maintenance burden, and doesn't invalidate how other, more
conventional machine types work.

> I think microvm has more value than just the reduced boot time.
> Specifically the reduced attack surface is useful I think, even
> beyond container-style workloads.  Being able to boot standard
> cloud images (with the cloud image kernel loaded via cloud image
> boot loader) in microvm would be useful for example.

Agreed.

> So, yes, I want microvm being designed in a way that it can run
> firmware and have it handle the boot process.  For starters just
> qboot for fast direct kernel boot, but longer term also seabios
> and/or ovmf.

As I said, I'm also in favor of microvm supporting booting from
firmware in the future, as long as we keep the simple mode too.

> Can look at the seabios side, but probably not before I'm back
> from my summer vacation in august.  For seabios a simple & reliable
> time source would be quite useful.  Direct kernel boot might be doable
> without that, but as soon as any I/O (read from cloud image disk) is
> involved a time source is needed.  Right now seabios uses the acpi
> pm_timer.  tsc should work too if seabios can figure the frequency
> without a calibration loop (invtsc should be enough).  Maybe seabios
> needs kvmclock support ...

My main concern about supporting SeaBIOS, in addition to boot times, is
having to support ACPI, which due to its complexity and size, is a clear
candidate to be stripped out from a minimalistic QEMU build.

> Is there any way to detect microvm from the guest?  pc/q35 can be
> easily detected by looking at the pci host bridge.

One option would be using the fields MPC_OEM and MPC_PRODUCT_ID from the
MP Table to give a hint to the guest.

> Do you have boot time numbers for qboot vs. no-firmware boot?
> Is the difference big enough that it makes sense to maintain both?

AFAIK, qboot can't boot a guest without both ACPI and PCI.

Sergio.
Sergio Lopez July 2, 2019, 11:16 a.m. UTC | #8
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 02/07/19 10:47, Sergio Lopez wrote:
>>> Could it be useful to support initrd as well?
>>>
>>> I'm thinking a possibility to a microvm to use only the initrd without a
>>> block device.
>> I agree, thanks for the suggestion. I'll add support for it.
>
> I'd like to take a look at adding firmware support too, so that we get
> linuxboot and multiboot for free.  This would also allow boot time
> comparisons.

I agree, but I'd prefer doing that on another iteration. This way we can
already introduce the machine type, with its basic set of features and
characteristics (which work in the sense it was initially intended), so
other projects can start experimenting with it.

Then we can add other options like ACPI and firmware support as knobs.

Sergio.
Gerd Hoffmann July 2, 2019, 11:50 a.m. UTC | #9
On Tue, Jul 02, 2019 at 12:52:27PM +0200, Sergio Lopez wrote:
> 
> Gerd Hoffmann <kraxel@redhat.com> writes:
> 
> >   Hi,
> >
> >> > Can we get rid of the kernel command line hacking please?
> >> > The virtio-mmio devices should be discoverable somehow.
> >> >
> >> > Device tree (as suggested by paolo) would work.
> >> > Custom acpi device (simliar to fw_cfg) is another option.
> >> > I'd tend to pick acpi, I wouldn't be surprised if we'll
> >> > need acpi anyway at some point.
> >> >
> >> > Maybe even do both, then switch at runtime depending on -no-acpi
> >> > (simliar to arm/aarch64).
> >> 
> >> Microvm tries to do things in the cheapest possible way.
> >
> > But taking too many shortcuts tends to hurt in the long run.
> > It also cuts off useful use cases.
> 
> Sure, but the consideration about whether there are too many shortcuts,
> or just enough of them, is quite subjective. Microvm's code base is
> small enough to keep its quirks in check without a becoming a
> significant maintenance burden, and doesn't invalidate how other, more
> conventional machine types work.

Most projects start small, but then tend to grow over time.
And likewise the maintenance burden tends to grow over time ...

> > Can look at the seabios side, but probably not before I'm back
> > from my summer vacation in august.  For seabios a simple & reliable
> > time source would be quite useful.  Direct kernel boot might be doable
> > without that, but as soon as any I/O (read from cloud image disk) is
> > involved a time source is needed.  Right now seabios uses the acpi
> > pm_timer.  tsc should work too if seabios can figure the frequency
> > without a calibration loop (invtsc should be enough).  Maybe seabios
> > needs kvmclock support ...
> 
> My main concern about supporting SeaBIOS, in addition to boot times, is
> having to support ACPI, which due to its complexity and size, is a clear
> candidate to be stripped out from a minimalistic QEMU build.

Not sure dropping acpi will be much of a win.  I think the aml generator
doesn't have any external dependencies (which would increase load time due
to shared libs).  The guest interface is rather small too (only reading
tables via fw_cfg).  I'd also expect microvm doesn't need many tables due to
the small feature set (no numa, no pci, ...).

On the other hand acpi tables plus some minimal acpi registers would
provide some useful features.  acpi poweroff,  acpi power button,  acpi
pm-timer.  You also could describe the virtio-mmio devices.

Having said that I think it should be possible to change seabios so that
it'll work without acpi too.  It would still need some way to discover
virtio-mmio devices though if we want it to load guest kernels from disk
images.

> > Is there any way to detect microvm from the guest?  pc/q35 can be
> > easily detected by looking at the pci host bridge.
> 
> One option would be using the fields MPC_OEM and MPC_PRODUCT_ID from the
> MP Table to give a hint to the guest.

Well, I mean for the firmware.  When booting with firmware all those
tables (mptable, e820, ...) should be created by the firmware not qemu.

It's not that critical though.  We probably want a separate seabios
build for microvm anyway, so a compile time option should work too.

> > Do you have boot time numbers for qboot vs. no-firmware boot?
> > Is the difference big enough that it makes sense to maintain both?
> 
> AFAIK, qboot can't boot a guest without both ACPI and PCI.

Should be fixable I guess ...

cheers,
  Gerd
Paolo Bonzini July 2, 2019, 2:06 p.m. UTC | #10
On 02/07/19 12:52, Sergio Lopez wrote:
> As I said, I'm also in favor of microvm supporting booting from
> firmware in the future, as long as we keep the simple mode too.

The simple mode adds code to QEMU's x86 target that only exists to
support microvm.  It should be motivated by a clear win in boot times.

> My main concern about supporting SeaBIOS, in addition to boot times, is
> having to support ACPI, which due to its complexity and size, is a clear
> candidate to be stripped out from a minimalistic QEMU build.

SeaBIOS doesn't need ACPI.  I agree that ACPI should be optional in microvm.

> AFAIK, qboot can't boot a guest without both ACPI and PCI.

It currently needs PCI, but that's a one-line change to avoid panicking.
 ACPI is optional, on the other hand qboot doesn't support mptable so
that can be a problem when ACPI is disabled.

Paolo
Sergio Lopez July 2, 2019, 2:41 p.m. UTC | #11
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 02/07/19 12:52, Sergio Lopez wrote:
>> As I said, I'm also in favor of microvm supporting booting from
>> firmware in the future, as long we keep the simple mode too.
>
> The simple mode adds code to QEMU's x86 target that only exists to
> support microvm.  It should be motivated by a clear win in boot times.

OK. When I'm back from my PTO, I'll work on adding the firmware
support to microvm. I'll run and share some numbers to see whether the
simple mode makes sense or we can just rely on qboot for lower boot
times plus SeaBIOS for compatibility.

Cheers,
Sergio.

>> My main concern about supporting SeaBIOS, in addition to boot times, is
>> having to support ACPI, which due to its complexity and size, is a clear
>> candidate to be stripped out from a minimalistic QEMU build.
>
> SeaBIOS doesn't need ACPI.  I agree that ACPI should be optional in microvm.
>
>> AFAIK, qboot can't boot a guest without both ACPI and PCI.
>
> It currently needs PCI, but that's a one-line change to avoid panicking.
>  ACPI is optional, on the other hand qboot doesn't support mptable so
> that can be a problem when ACPI is disabled.
>
> Paolo
Sergio Lopez July 18, 2019, 2:34 p.m. UTC | #12
Sergio Lopez <slp@redhat.com> writes:

> Paolo Bonzini <pbonzini@redhat.com> writes:
>
>> On 02/07/19 12:52, Sergio Lopez wrote:
>>> As I said, I'm also in favor of microvm supporting booting from
>>> firmware in the future, as long we keep the simple mode too.
>>
>> The simple mode adds code to QEMU's x86 target that only exists to
>> support microvm.  It should be motivated by a clear win in boot times.
>
> OK. When I'm back from my PTO, I'll work on adding the firmware
> support to microvm. I'll run and share some numbers to see whether the
> simple mode makes sense or we can just rely on qboot for lower boot
> times plus SeaBIOS for compatibility.

I've just added support for starting the machine from SeaBIOS (Stefan
Hajnoczi pointed out in another thread that it can be as fast as qboot, and
given that the latter doesn't support mptables, I just tested this
one). I tried to keep it as minimalistic as possible, but it still
required an RTC (mc146818), which dragged in an ISA bus, which in turn
required a KVM PIT.

I ran some numbers using Stefano Garzarella's qemu-boot-time scripts
[1] on a server with 2xIntel Xeon Silver 4114 2.20GHz, using the
upstream QEMU (474f3938d79ab36b9231c9ad3b5a9314c2aeacde) built with
minimal features [2]. The VM boots a minimal kernel [3] without initrd,
using a kata container image as root via virtio-blk (though this isn't
really relevant, as we're just taking measurements until the kernel is
about to exec init).

 ---------------------
 | QEMU with SeaBIOS |
 ---------------------

Command line:

./x86_64-softmmu/qemu-system-x86_64 -m 512m -enable-kvm -M microvm,legacy -kernel /root/src/images/vmlinux-5.2 -append "console=hvc0 reboot=k panic=1 root=/dev/vda quiet virtio_mmio.device=512@0xd0000600:15 virtio_mmio.device=512@0xd0000400:14" -smp 1 -nodefaults -no-user-config -chardev pty,id=virtiocon0,server -device virtio-serial-device -device virtconsole,chardev=virtiocon0 -drive id=test,file=/usr/share/kata-containers/kata-containers.img,format=raw,if=none -device virtio-blk-device,drive=test -monitor stdio

Average boot times after 10 consecutive runs:

 qemu_init_end: 65.958714
 linux_start_kernel: 77.735803 (+11.777089)
 linux_start_user: 127.360739 (+49.624936)

Exposed I/O ports and MMIOs:

 address-space: memory
  0000000000000000-ffffffffffffffff (prio 0, i/o): system
    0000000000000000-000000001fffffff (prio 0, i/o): alias ram-below-4g @microvm.ram 0000000000000000-000000001fffffff
    00000000000e0000-00000000000fffff (prio 1, i/o): alias isa-bios @pc.bios 0000000000000000-000000000001ffff
    00000000d0000000-00000000d00001ff (prio 0, i/o): virtio-mmio
    00000000d0000200-00000000d00003ff (prio 0, i/o): virtio-mmio
    00000000d0000400-00000000d00005ff (prio 0, i/o): virtio-mmio
    00000000d0000600-00000000d00007ff (prio 0, i/o): virtio-mmio
    00000000fee00000-00000000feefffff (prio 4096, i/o): kvm-apic-msi
    00000000fffe0000-00000000ffffffff (prio 0, ram): pc.bios

 address-space: I/O
  0000000000000000-000000000000ffff (prio 0, i/o): io
    0000000000000020-0000000000000021 (prio 0, i/o): kvm-pic
    0000000000000040-0000000000000043 (prio 0, i/o): kvm-pit
    0000000000000070-0000000000000071 (prio 0, i/o): rtc
      0000000000000070-0000000000000070 (prio 0, i/o): rtc-index
    000000000000007e-000000000000007f (prio 0, i/o): kvmvapic
    00000000000000a0-00000000000000a1 (prio 0, i/o): kvm-pic
    00000000000004d0-00000000000004d0 (prio 0, i/o): kvm-elcr
    00000000000004d1-00000000000004d1 (prio 0, i/o): kvm-elcr
    0000000000000510-0000000000000511 (prio 0, i/o): fwcfg
    0000000000000514-000000000000051b (prio 0, i/o): fwcfg.dma


 -------------------
 | QEMU direct PVH |
 -------------------

Command line:

 ./x86_64-softmmu/qemu-system-x86_64 -m 512m -enable-kvm -M microvm -kernel /root/src/images/vmlinux-5.2 -append "console=hvc0 reboot=k panic=1 root=/dev/vda quiet" -smp 1 -nodefaults -no-user-config -chardev pty,id=virtiocon0,server -device virtio-serial-device -device virtconsole,chardev=virtiocon0 -drive id=test,file=/usr/share/kata-containers/kata-containers.img,format=raw,if=none -device virtio-blk-device,drive=test -monitor stdio

Average boot times after 10 consecutive runs:

 qemu_init_end: 64.043264
 linux_start_kernel: 65.481782 (+1.438518)
 linux_start_user: 114.938353 (+49.456571)

Exposed I/O ports and MMIOs:

 address-space: memory
  0000000000000000-ffffffffffffffff (prio 0, i/o): system
    0000000000000000-000000001fffffff (prio 0, i/o): alias ram-below-4g @microvm.ram 0000000000000000-000000001fffffff
    00000000d0000000-00000000d00001ff (prio 0, i/o): virtio-mmio
    00000000d0000200-00000000d00003ff (prio 0, i/o): virtio-mmio
    00000000d0000400-00000000d00005ff (prio 0, i/o): virtio-mmio
    00000000d0000600-00000000d00007ff (prio 0, i/o): virtio-mmio
    00000000fec00000-00000000fec00fff (prio 0, i/o): kvm-ioapic
    00000000fee00000-00000000feefffff (prio 4096, i/o): kvm-apic-msi

 address-space: I/O
  0000000000000000-000000000000ffff (prio 0, i/o): io
    000000000000007e-000000000000007f (prio 0, i/o): kvmvapic


 --------------
 | Comparison |
 --------------

Average boot time:

 * Relying on SeaBIOS, when compared with direct PVH boot, has a total
   average overhead of ~12ms. The cost of initializing QEMU increases by
   ~2ms (probably due to the need to instantiate more devices), while the
   other ~10ms is the SeaBIOS overhead.

Exposed I/O ports and MMIOs:

 * The following 8 I/O ports are only present in the version relying on SeaBIOS:

    0000000000000020-0000000000000021 (prio 0, i/o): kvm-pic
    0000000000000040-0000000000000043 (prio 0, i/o): kvm-pit
    0000000000000070-0000000000000071 (prio 0, i/o): rtc
      0000000000000070-0000000000000070 (prio 0, i/o): rtc-index
    00000000000000a0-00000000000000a1 (prio 0, i/o): kvm-pic
    00000000000004d0-00000000000004d0 (prio 0, i/o): kvm-elcr
    00000000000004d1-00000000000004d1 (prio 0, i/o): kvm-elcr
    0000000000000510-0000000000000511 (prio 0, i/o): fwcfg
    0000000000000514-000000000000051b (prio 0, i/o): fwcfg.dma

 * The following MMIO region is only present in the direct boot version:

    00000000fec00000-00000000fec00fff (prio 0, i/o): kvm-ioapic


 ---------------
 | Conclusions |
 ---------------

Objectively, the version that boots directly the kernel using PVH is 10%
faster and has a slightly larger exposed surface. Whether this is enough
to justify its existence is quite subjective.

In my opinion, not only does it make sense to have it, but I also
think there's little reason to have the firmware-reliant version, given
the nature and purpose of microvm.

Sergio.

[1] https://github.com/stefano-garzarella/qemu-boot-time
[2] https://paste.fedoraproject.org/paste/YZ9Ok-dJtQrc0xxctFm-nw
[3] https://paste.fedoraproject.org/paste/sck0jfioAJdMq51HH6wkmA
Paolo Bonzini July 18, 2019, 3:48 p.m. UTC | #13
On 18/07/19 16:34, Sergio Lopez wrote:
> I've just added support for starting the machine from SeaBIOS (Stefan
> Hajnoczi pointed in another thread that it can be as fast as qboot, and
> given that the latter doesn't support mptables, I just tested this
> one). I tried to keep it as minimalistic as possible, but it still
> required an RTC (mc146818), which dragged in an ISA BUS, and this one a
> KVM PIT.
> 
> I ran some numbers using Stefano Garzarella's qemu-boot-time scripts
> [1] on a server with 2xIntel Xeon Silver 4114 2.20GHz, using the
> upstream QEMU (474f3938d79ab36b9231c9ad3b5a9314c2aeacde) built with
> minimal features [2]. The VM boots a minimal kernel [3] without initrd,
> using a kata container image as root via virtio-blk (though this isn't
> really relevant, as we're just taking measurements until the kernel is
> about to exec init).
> 
>  ---------------------
>  | QEMU with SeaBIOS |
>  ---------------------
> 
>  qemu_init_end: 65.958714
>  linux_start_kernel: 77.735803 (+11.777089)
>  linux_start_user: 127.360739 (+49.624936)
> 
>  -------------------
>  | QEMU direct PVH |
>  -------------------
> 
>  qemu_init_end: 64.043264
>  linux_start_kernel: 65.481782 (+1.438518)
>  linux_start_user: 114.938353 (+49.456571)
> 
>  --------------
>  | Comparison |
>  --------------
> 
> Average boot time:
> 
>  * Relying on SeaBIOS, when compared with direct PVH boot, as a total
>    average overhead of ~12ms. The cost of initializing QEMU increases in
>    ~2ms (probably due to need to instantiate more devices), while the
>    other ~10ms is the SeaBIOS overhead.
> 
>  ---------------
>  | Conclusions |
>  ---------------
> 
> Objectively, the version that boots directly the kernel using PVH is 10%
> faster and has a slightly larger exposed surface. Whether this is enough
> to justify its existence is quite subjective.
> 
> In my opinion, not only I think it makes sense to have it, but I also
> think there's little reason to have the firmware reliant version, given
> the nature and purpose of microvm.

The advantage of firmware is support for vmlinuz and multiboot in
addition to PVH, and removing code from QEMU.  So I think it's still
worth doing a comparison with qboot, and trying to understand where
SeaBIOS is spending its time (qboot should not need additional devices
other than fw_cfg, and since SeaBIOS has never been optimized for
microvm I expect it's possible to shave quite a few of those 12 ms).

On the other hand I agree that microvm is showing great promise compared
to PCI-based machine types.

Paolo
Sergio Lopez July 19, 2019, 10:30 a.m. UTC | #14
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 18/07/19 16:34, Sergio Lopez wrote:
>> I've just added support for starting the machine from SeaBIOS (Stefan
>> Hajnoczi pointed in another thread that it can be as fast as qboot, and
>> given that the latter doesn't support mptables, I just tested this
>> one). I tried to keep it as minimalistic as possible, but it still
>> required an RTC (mc146818), which dragged in an ISA BUS, and this one a
>> KVM PIT.
>> 
>> I ran some numbers using Stefano Garzarella's qemu-boot-time scripts
>> [1] on a server with 2xIntel Xeon Silver 4114 2.20GHz, using the
>> upstream QEMU (474f3938d79ab36b9231c9ad3b5a9314c2aeacde) built with
>> minimal features [2]. The VM boots a minimal kernel [3] without initrd,
>> using a kata container image as root via virtio-blk (though this isn't
>> really relevant, as we're just taking measurements until the kernel is
>> about to exec init).
>> 
>>  ---------------------
>>  | QEMU with SeaBIOS |
>>  ---------------------
>> 
>>  qemu_init_end: 65.958714
>>  linux_start_kernel: 77.735803 (+11.777089)
>>  linux_start_user: 127.360739 (+49.624936)
>> 
>>  -------------------
>>  | QEMU direct PVH |
>>  -------------------
>> 
>>  qemu_init_end: 64.043264
>>  linux_start_kernel: 65.481782 (+1.438518)
>>  linux_start_user: 114.938353 (+49.456571)
>> 
>>  --------------
>>  | Comparison |
>>  --------------
>> 
>> Average boot time:
>> 
>>  * Relying on SeaBIOS, when compared with direct PVH boot, as a total
>>    average overhead of ~12ms. The cost of initializing QEMU increases in
>>    ~2ms (probably due to need to instantiate more devices), while the
>>    other ~10ms is the SeaBIOS overhead.
>> 
>>  ---------------
>>  | Conclusions |
>>  ---------------
>> 
>> Objectively, the version that boots directly the kernel using PVH is 10%
>> faster and has a slightly larger exposed surface. Whether this is enough
>> to justify its existence is quite subjective.
>> 
>> In my opinion, not only I think it makes sense to have it, but I also
>> think there's little reason to have the firmware reliant version, given
>> the nature and purpose of microvm.
>
> The advantage of firmware is support for vmlinuz and multiboot in
> addition to PVH, and removing code from QEMU.  So I think it's still
> worth doing a comparison with qboot, and trying to understand where
> SeaBIOS is spending its time (qboot should not need additional devices
> other than fw_cfg, and since SeaBIOS has never been optimized for
> microvm I expect it's possible to shave quite a few of those 12 ms).

I tweaked qboot a bit to make it work with microvm (basically, if it
can't identify a PCI vendor, it assumes it's running in a microvm, and
avoids PCI and ACPI initialization, writing a minimal mptable instead)
and I've just got some numbers:

Average boot time:
 qemu_init_end: 64.502199
 linux_start_kernel: 66.644311 (+2.142112)
 linux_start_user: 116.279127 (+49.634816)

To avoid having to scroll up, these are the numbers with the direct PVH
boot version:

Average boot time:
 qemu_init_end: 64.043264
 linux_start_kernel: 65.481782 (+1.438518)
 linux_start_user: 114.938353 (+49.456571)

So starting from qboot is just a bit over 1ms slower, which I think is
quite reasonable. And, as you said, the only additional device is
fw_cfg:

address-space: I/O
  0000000000000000-000000000000ffff (prio 0, i/o): io
    000000000000007e-000000000000007f (prio 0, i/o): kvmvapic
    0000000000000510-0000000000000511 (prio 0, i/o): fwcfg
    0000000000000514-000000000000051b (prio 0, i/o): fwcfg.dma

If people feel more comfortable keeping QEMU on x86_64 starting on Real
Mode and having the firmware writing mptables, I'm fine with taking this
approach.

That said, to avoid confusing users, I think it'd be better to drop the
legacy mode for SeaBIOS, and support just qboot. That would mean having
to add a binary from the latter to "pc-bios" with some descriptive name
("qboot-microvm.bin"?) and making microvm look for that one by default.

Does this sound like a reasonable plan?

> On the other hand I agree that microvm is showing great promise compared
> to PCI-based machine types.
>
> Paolo
Paolo Bonzini July 19, 2019, 11:49 a.m. UTC | #15
On 19/07/19 12:30, Sergio Lopez wrote:
> If people feel more comfortable keeping QEMU on x86_64 starting on Real
> Mode and having the firmware writing mptables, I'm fine with taking this
> approach.
>
> That said, to avoid confusing users, I think it'd be better to drop the
> legacy mode for SeaBIOS, and support just qboot. That would mean having
> to add a binary from the latter to "pc-bios" with some descriptive name
> ("qboot-microvm.bin"?) and making microvm look for that one by default.
> 
> Does this sound like a reasonable plan?

Yes, though I wouldn't call it qboot-microvm.bin but just bios-microvm.bin.

In the meanwhile, feel free to send a pull request to the qboot project.
 The mptables can be written for all platforms, even those that have
ACPI (there is always -no-acpi, too).

Paolo
diff mbox series

Patch

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index cd5ea391e8..338f07420f 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -26,3 +26,4 @@  CONFIG_ISAPC=y
 CONFIG_I440FX=y
 CONFIG_Q35=y
 CONFIG_ACPI_PCI=y
+CONFIG_MICROVM=y
diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index 9817888216..94c565d8db 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -87,6 +87,10 @@  config Q35
     select VMMOUSE
     select FW_CFG_DMA
 
+config MICROVM
+    bool
+    select VIRTIO_MMIO
+
 config VTD
     bool
 
diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
index c5f20bbd72..7bffca413e 100644
--- a/hw/i386/Makefile.objs
+++ b/hw/i386/Makefile.objs
@@ -4,6 +4,7 @@  obj-y += pvh.o
 obj-y += pc.o
 obj-$(CONFIG_I440FX) += pc_piix.o
 obj-$(CONFIG_Q35) += pc_q35.o
+obj-$(CONFIG_MICROVM) += mptable.o microvm.o
 obj-y += fw_cfg.o pc_sysfw.o
 obj-y += x86-iommu.o
 obj-$(CONFIG_VTD) += intel_iommu.o
diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
new file mode 100644
index 0000000000..8b5efe9e45
--- /dev/null
+++ b/hw/i386/microvm.c
@@ -0,0 +1,500 @@ 
+/*
+ * Copyright (c) 2018 Intel Corporation
+ * Copyright (c) 2019 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "qapi/visitor.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/cpus.h"
+#include "sysemu/numa.h"
+
+#include "hw/loader.h"
+#include "hw/nmi.h"
+#include "hw/kvm/clock.h"
+#include "hw/i386/microvm.h"
+#include "hw/i386/pc.h"
+#include "target/i386/cpu.h"
+#include "hw/timer/i8254.h"
+#include "hw/char/serial.h"
+#include "hw/i386/topology.h"
+#include "hw/virtio/virtio-mmio.h"
+#include "hw/i386/mptable.h"
+
+#include "cpu.h"
+#include "elf.h"
+#include "pvh.h"
+#include "kvm_i386.h"
+#include "hw/xen/start_info.h"
+
+/*
+ * GSI callback: forward line @n, at @level, to the qemu_irq stored at the
+ * same index of the array passed as @opaque.
+ */
+static void microvm_gsi_handler(void *opaque, int n, int level)
+{
+    qemu_irq *pins = opaque;
+    qemu_irq target = pins[n];
+
+    qemu_set_irq(target, level);
+}
+
+/*
+ * Set up the "legacy" flavour of the machine: an ISA bus whose interrupts
+ * are routed through mms->gsi, the KVM i8259 and PIT, one virtio-mmio
+ * transport per slot, and finally the ISA serial setup.
+ *
+ * Both the irqchip and the PIC must be provided in-kernel by KVM; this is
+ * asserted rather than reported as an error.
+ */
+static void microvm_legacy_init(MicrovmMachineState *mms)
+{
+    GSIState *gsi;
+    ISABus *bus;
+    qemu_irq *pic_irqs;
+    int n;
+
+    assert(kvm_irqchip_in_kernel());
+    gsi = g_malloc0(sizeof(*gsi));
+    mms->gsi = qemu_allocate_irqs(gsi_handler, gsi, GSI_NUM_PINS);
+
+    bus = isa_bus_new(NULL, get_system_memory(), get_system_io(),
+                      &error_abort);
+    isa_bus_irqs(bus, mms->gsi);
+
+    assert(kvm_pic_in_kernel());
+    pic_irqs = kvm_i8259_init(bus);
+
+    /* Keep a copy of the PIC lines in the GSI state; the array is ours. */
+    for (n = 0; n < ISA_NUM_IRQS; n++) {
+        gsi->i8259_irq[n] = pic_irqs[n];
+    }
+    g_free(pic_irqs);
+
+    kvm_pit_init(bus, 0x40);
+
+    for (n = 0; n < VIRTIO_NUM_TRANSPORTS; n++) {
+        int nirq = VIRTIO_IRQ_BASE + n;
+        ISADevice *isadev = isa_create(bus, TYPE_ISA_SERIAL);
+        qemu_irq mmio_irq;
+
+        /*
+         * NOTE(review): an ISA serial device is created per transport only
+         * to call isa_init_irq() on it, and the qemu_irq it returns is not
+         * used; the transport is wired to mms->gsi[] directly. Confirm
+         * this is intended.
+         */
+        isa_init_irq(isadev, &mmio_irq, nirq);
+
+        /* Transports are laid out back to back, 512 bytes apart. */
+        sysbus_create_simple("virtio-mmio",
+                             VIRTIO_MMIO_BASE + n * 512,
+                             mms->gsi[nirq]);
+    }
+
+    serial_hds_isa_init(bus, 0, 1);
+}
+
+/*
+ * Set up the default (non-legacy) interrupt path: create the "kvm-ioapic"
+ * device, expose its input pins through mms->gsi, and create the
+ * virtio-mmio transports wired to those GSIs.
+ *
+ * The irqchip and the IOAPIC must be provided in-kernel by KVM; this is
+ * asserted rather than reported as an error.
+ */
+static void microvm_ioapic_init(MicrovmMachineState *mms)
+{
+    DeviceState *ioapic;
+    qemu_irq *pins;
+    int n;
+
+    assert(kvm_irqchip_in_kernel());
+    pins = g_new0(qemu_irq, IOAPIC_NUM_PINS);
+    kvm_pc_setup_irq_routing(true);
+
+    assert(kvm_ioapic_in_kernel());
+    ioapic = qdev_create(NULL, "kvm-ioapic");
+    object_property_add_child(qdev_get_machine(),
+                              "ioapic", OBJECT(ioapic), NULL);
+    qdev_init_nofail(ioapic);
+    sysbus_mmio_map(SYS_BUS_DEVICE(ioapic), 0, IO_APIC_DEFAULT_ADDRESS);
+
+    /* Collect the IOAPIC inputs; microvm_gsi_handler indexes this array. */
+    for (n = 0; n < IOAPIC_NUM_PINS; n++) {
+        pins[n] = qdev_get_gpio_in(ioapic, n);
+    }
+    mms->gsi = qemu_allocate_irqs(microvm_gsi_handler,
+                                  pins, IOAPIC_NUM_PINS);
+
+    /* Transports are laid out back to back, 512 bytes apart. */
+    for (n = 0; n < VIRTIO_NUM_TRANSPORTS; n++) {
+        sysbus_create_simple("virtio-mmio",
+                             VIRTIO_MMIO_BASE + n * 512,
+                             mms->gsi[VIRTIO_IRQ_BASE + n]);
+    }
+}
+
+/*
+ * Allocate guest RAM and map it into the system address space.
+ *
+ * RAM up to MICROVM_MAX_BELOW_4G is mapped at address 0; anything beyond
+ * that is mapped starting at 4 GiB. Each mapped range is also recorded as
+ * an E820_RAM entry, and the split sizes are stored in @mms.
+ */
+static void microvm_memory_init(MicrovmMachineState *mms)
+{
+    MachineState *machine = MACHINE(mms);
+    MemoryRegion *sysmem = get_system_memory();
+    MemoryRegion *backing, *low, *high;
+
+    /* Split the requested RAM size around the below-4G ceiling. */
+    if (machine->ram_size <= MICROVM_MAX_BELOW_4G) {
+        mms->below_4g_mem_size = machine->ram_size;
+        mms->above_4g_mem_size = 0;
+    } else {
+        mms->below_4g_mem_size = MICROVM_MAX_BELOW_4G;
+        mms->above_4g_mem_size = machine->ram_size - MICROVM_MAX_BELOW_4G;
+    }
+
+    /* One backing region; the guest sees it through aliases. */
+    backing = g_malloc(sizeof(*backing));
+    memory_region_allocate_system_memory(backing, NULL, "microvm.ram",
+                                         machine->ram_size);
+
+    low = g_malloc(sizeof(*low));
+    memory_region_init_alias(low, NULL, "ram-below-4g", backing,
+                             0, mms->below_4g_mem_size);
+    memory_region_add_subregion(sysmem, 0, low);
+    e820_add_entry(0, mms->below_4g_mem_size, E820_RAM);
+
+    if (mms->above_4g_mem_size == 0) {
+        return;
+    }
+
+    high = g_malloc(sizeof(*high));
+    memory_region_init_alias(high, NULL, "ram-above-4g", backing,
+                             mms->below_4g_mem_size,
+                             mms->above_4g_mem_size);
+    memory_region_add_subregion(sysmem, 0x100000000ULL, high);
+    e820_add_entry(0x100000000ULL, mms->above_4g_mem_size, E820_RAM);
+}
+
+/*
+ * Create and realize smp_cpus CPU objects of type @typename, using the CPU
+ * index as its "apic-id".
+ *
+ * On the first failure the error is propagated to @errp and no further CPUs
+ * are created.
+ */
+static void microvm_cpus_init(const char *typename, Error **errp)
+{
+    int i;
+
+    for (i = 0; i < smp_cpus; i++) {
+        Error *local_err = NULL;
+        Object *cpu = object_new(typename);
+
+        object_property_set_uint(cpu, i, "apic-id", &local_err);
+        /*
+         * An Error pointer that is already set must not be passed to
+         * another setter, so only realize if the previous step succeeded.
+         */
+        if (!local_err) {
+            object_property_set_bool(cpu, true, "realized", &local_err);
+        }
+
+        /* Drop the reference obtained from object_new(). */
+        object_unref(cpu);
+
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
+
+/*
+ * MachineClass::init hook: build the whole machine.
+ *
+ * Requires a kernel image (-kernel). Sets up RAM, CPUs, the interrupt
+ * topology (legacy or IOAPIC, depending on the "legacy" option), kvmclock,
+ * and loads the kernel as an ELF image, remembering its entry point for
+ * use at reset time. Any failure is fatal and terminates QEMU.
+ */
+static void microvm_machine_state_init(MachineState *machine)
+{
+    MicrovmMachineState *mms = MICROVM_MACHINE(machine);
+    Error *err = NULL;
+
+    if (!machine->kernel_filename) {
+        error_report("missing kernel image file name, required by microvm");
+        exit(1);
+    }
+
+    microvm_memory_init(mms);
+
+    microvm_cpus_init(machine->cpu_type, &err);
+    if (err != NULL) {
+        error_report_err(err);
+        exit(1);
+    }
+
+    if (!mms->legacy) {
+        microvm_ioapic_init(mms);
+    } else {
+        microvm_legacy_init(mms);
+    }
+
+    kvmclock_create();
+
+    if (!pvh_load_elfboot(machine->kernel_filename, NULL, NULL)) {
+        error_report("Error while loading elf kernel");
+        exit(1);
+    }
+
+    /* Every vCPU starts executing here after a reset. */
+    mms->elf_entry = pvh_get_start_addr();
+}
+
+/*
+ * Build the " virtio_mmio.device=..." kernel command line fragment for the
+ * virtio-mmio bus called @name.
+ *
+ * The transport index is taken from the decimal number following the last
+ * '.' in @name, and is used to derive the transport's MMIO address and IRQ.
+ *
+ * Returns a newly allocated string (to be released with g_free()), or NULL
+ * if @name has no parseable index, the index does not designate one of the
+ * VIRTIO_NUM_TRANSPORTS transports, or the fragment does not fit in
+ * VIRTIO_CMDLINE_MAXLEN bytes.
+ */
+static gchar *microvm_get_mmio_cmdline(gchar *name)
+{
+    gchar *cmdline;
+    gchar *separator;
+    long int index;
+    int ret;
+
+    separator = g_strrstr(name, ".");
+    if (!separator) {
+        return NULL;
+    }
+
+    if (qemu_strtol(separator + 1, NULL, 10, &index) != 0) {
+        return NULL;
+    }
+
+    /*
+     * Only indexes of transports actually created by the machine map to a
+     * valid address/IRQ pair; anything else would hand the guest a bogus
+     * device description.
+     */
+    if (index < 0 || index >= VIRTIO_NUM_TRANSPORTS) {
+        return NULL;
+    }
+
+    cmdline = g_malloc0(VIRTIO_CMDLINE_MAXLEN);
+    ret = g_snprintf(cmdline, VIRTIO_CMDLINE_MAXLEN,
+                     " virtio_mmio.device=512@0x%lx:%ld",
+                     VIRTIO_MMIO_BASE + index * 512,
+                     VIRTIO_IRQ_BASE + index);
+    if (ret < 0 || ret >= VIRTIO_CMDLINE_MAXLEN) {
+        g_free(cmdline);
+        return NULL;
+    }
+
+    return cmdline;
+}
+
+/*
+ * Write the PVH boot data into guest memory:
+ *
+ *  - the kernel command line (@kernel_cmdline plus one virtio_mmio.device
+ *    entry per virtio-mmio transport that has a device plugged in) at
+ *    KERNEL_CMDLINE_START;
+ *  - the memory map, one entry per e820 entry, at MEMMAP_START;
+ *  - the hvm_start_info structure pointing at both, at PVH_START_INFO.
+ */
+static void microvm_setup_pvh(MicrovmMachineState *mms,
+                              const gchar *kernel_cmdline)
+{
+    struct hvm_memmap_table_entry *memmap_table;
+    struct hvm_start_info *start_info;
+    BusState *bus;
+    BusChild *kid;
+    gchar *cmdline;
+    int cmdline_len;
+    int memmap_entries;
+    int i;
+
+    /* Never work on a NULL string: strlen() below would crash on it. */
+    cmdline = g_strdup(kernel_cmdline ? kernel_cmdline : "");
+
+    /*
+     * Find MMIO transports with attached devices, and add them to the kernel
+     * command line.
+     */
+    bus = sysbus_get_default();
+    QTAILQ_FOREACH(kid, &bus->children, sibling) {
+        DeviceState *dev = kid->child;
+        ObjectClass *class = object_get_class(OBJECT(dev));
+
+        if (class == object_class_by_name(TYPE_VIRTIO_MMIO)) {
+            VirtIOMMIOProxy *mmio = VIRTIO_MMIO(OBJECT(dev));
+            VirtioBusState *mmio_virtio_bus = &mmio->bus;
+            BusState *mmio_bus = &mmio_virtio_bus->parent_obj;
+
+            if (!QTAILQ_EMPTY(&mmio_bus->children)) {
+                gchar *mmio_cmdline = microvm_get_mmio_cmdline(mmio_bus->name);
+                if (mmio_cmdline) {
+                    char *newcmd = g_strjoin(NULL, cmdline, mmio_cmdline, NULL);
+                    g_free(mmio_cmdline);
+                    g_free(cmdline);
+                    cmdline = newcmd;
+                }
+            }
+        }
+    }
+
+    /*
+     * Include the terminating NUL in what is copied to the guest: the
+     * destination memory is not guaranteed to be zeroed (for instance when
+     * this runs again on a machine reset), and the guest needs a terminated
+     * string.
+     */
+    cmdline_len = strlen(cmdline) + 1;
+
+    address_space_write(&address_space_memory,
+                        KERNEL_CMDLINE_START, MEMTXATTRS_UNSPECIFIED,
+                        (uint8_t *) cmdline, cmdline_len);
+
+    g_free(cmdline);
+
+    /*
+     * Translate the e820 map into the PVH memory map. Entries that are not
+     * E820_RAM are left zeroed in the table.
+     */
+    memmap_entries = e820_get_num_entries();
+    memmap_table = g_new0(struct hvm_memmap_table_entry, memmap_entries);
+    for (i = 0; i < memmap_entries; i++) {
+        uint64_t address, length;
+        struct hvm_memmap_table_entry *entry = &memmap_table[i];
+
+        if (e820_get_entry(i, E820_RAM, &address, &length)) {
+            entry->addr = address;
+            entry->size = length;
+            entry->type = E820_RAM;
+            entry->reserved = 0;
+        }
+    }
+
+    address_space_write(&address_space_memory,
+                        MEMMAP_START, MEMTXATTRS_UNSPECIFIED,
+                        (uint8_t *) memmap_table,
+                        memmap_entries * sizeof(struct hvm_memmap_table_entry));
+
+    g_free(memmap_table);
+
+    /* Finally, the start info block tying everything together. */
+    start_info = g_malloc0(sizeof(struct hvm_start_info));
+
+    start_info->magic = XEN_HVM_START_MAGIC_VALUE;
+    start_info->version = 1;
+    start_info->nr_modules = 0;
+    start_info->cmdline_paddr = KERNEL_CMDLINE_START;
+    start_info->memmap_entries = memmap_entries;
+    start_info->memmap_paddr = MEMMAP_START;
+
+    address_space_write(&address_space_memory,
+                        PVH_START_INFO, MEMTXATTRS_UNSPECIFIED,
+                        (uint8_t *) start_info,
+                        sizeof(struct hvm_start_info));
+
+    g_free(start_info);
+}
+
+/* Store one 8-byte page table entry, @entry, at guest address @gpa. */
+static void microvm_write_pt_entry(uint64_t gpa, uint64_t entry)
+{
+    address_space_write(&address_space_memory,
+                        gpa, MEMTXATTRS_UNSPECIFIED,
+                        (uint8_t *) &entry, 8);
+}
+
+/*
+ * Write a three-level page table hierarchy into guest memory: one entry at
+ * PML4_START pointing to PDPTE_START, one entry at PDPTE_START pointing to
+ * PDE_START, and 512 entries at PDE_START, each covering a successive
+ * 2 MiB-aligned address (i << 21).
+ */
+static void microvm_init_page_tables(void)
+{
+    uint64_t entry;
+    int idx;
+
+    entry = PDPTE_START | 0x03;
+    microvm_write_pt_entry(PML4_START, entry);
+
+    entry = PDE_START | 0x03;
+    microvm_write_pt_entry(PDPTE_START, entry);
+
+    for (idx = 0; idx < 512; idx++) {
+        entry = (idx << 21) + 0x83;
+        microvm_write_pt_entry(PDE_START + (idx * 8), entry);
+    }
+}
+
+/*
+ * Put vCPU @cs in the state the kernel expects at its entry point:
+ * flat code/data segments (base 0, limit 0xffffffff), a task register,
+ * EBX holding the address of the PVH start info, the program counter at
+ * @elf_entry, and CR0 with only the protection-enable bit set (CR3 and CR4
+ * cleared).
+ */
+static void microvm_cpu_reset(CPUState *cs, uint64_t elf_entry)
+{
+    static const int data_regs[] = { R_DS, R_ES, R_FS, R_GS, R_SS };
+    CPUX86State *env = &X86_CPU(cs)->env;
+    struct SegmentCache code = {
+        .selector = 0x8,
+        .base = 0x0,
+        .limit = 0xffffffff,
+        .flags = 0xc09b00,
+    };
+    struct SegmentCache data = {
+        .selector = 0x10,
+        .base = 0x0,
+        .limit = 0xffffffff,
+        .flags = 0xc09300,
+    };
+    struct SegmentCache task = {
+        .selector = 0x18,
+        .base = 0x0,
+        .limit = 0xffff,
+        .flags = 0x8b00,
+    };
+    size_t k;
+
+    env->segs[R_CS] = code;
+    for (k = 0; k < sizeof(data_regs) / sizeof(data_regs[0]); k++) {
+        env->segs[data_regs[k]] = data;
+    }
+    env->tr = task;
+
+    /* The start info address is handed over in EBX. */
+    env->regs[R_EBX] = PVH_START_INFO;
+
+    cpu_set_pc(cs, elf_entry);
+    cpu_x86_update_cr3(env, 0);
+    cpu_x86_update_cr4(env, 0);
+    cpu_x86_update_cr0(env, CR0_PE_MASK);
+
+    x86_update_hflags(env);
+}
+
+/*
+ * Generate an MP table for smp_cpus CPUs and copy it into guest memory at
+ * EBDA_START. @mms is currently unused.
+ */
+static void microvm_mptable_setup(MicrovmMachineState *mms)
+{
+    int table_size;
+    char *table = mptable_generate(smp_cpus, EBDA_START, &table_size);
+
+    address_space_write(&address_space_memory,
+                        EBDA_START, MEMTXATTRS_UNSPECIFIED,
+                        (uint8_t *) table, table_size);
+
+    /* The guest has its own copy now. */
+    g_free(table);
+}
+
+/* Getter for the "legacy" machine property. @errp is not used. */
+static bool microvm_machine_get_legacy(Object *obj, Error **errp)
+{
+    return MICROVM_MACHINE(obj)->legacy;
+}
+
+/* Setter for the "legacy" machine property. @errp is not used. */
+static void microvm_machine_set_legacy(Object *obj, bool value, Error **errp)
+{
+    MICROVM_MACHINE(obj)->legacy = value;
+}
+
+/*
+ * MachineClass::reset hook.
+ *
+ * After resetting all devices, rewrite the boot data in guest memory (MP
+ * table, PVH start info and command line, page tables) and put every vCPU
+ * back at the kernel entry point.
+ */
+static void microvm_machine_reset(void)
+{
+    MachineState *machine = MACHINE(qdev_get_machine());
+    MicrovmMachineState *mms = MICROVM_MACHINE(machine);
+    CPUState *cs;
+
+    qemu_devices_reset();
+
+    microvm_mptable_setup(mms);
+    microvm_setup_pvh(mms, machine->kernel_cmdline);
+    microvm_init_page_tables();
+
+    CPU_FOREACH(cs) {
+        X86CPU *x86_cpu = X86_CPU(cs);
+
+        if (x86_cpu->apic_state != NULL) {
+            device_reset(x86_cpu->apic_state);
+        }
+
+        microvm_cpu_reset(cs, mms->elf_entry);
+    }
+}
+
+/*
+ * NMI monitor handler: deliver an NMI to every CPU, through its APIC when
+ * it has one and directly otherwise. @cpu_index and @errp are not used.
+ */
+static void x86_nmi(NMIState *n, int cpu_index, Error **errp)
+{
+    CPUState *cs;
+
+    CPU_FOREACH(cs) {
+        X86CPU *x86_cpu = X86_CPU(cs);
+
+        if (x86_cpu->apic_state) {
+            apic_deliver_nmi(x86_cpu->apic_state);
+        } else {
+            cpu_interrupt(cs, CPU_INTERRUPT_NMI);
+        }
+    }
+}
+
+/*
+ * Class initializer: describe the machine type, install the init, reset
+ * and NMI hooks, and register the boolean "legacy" property.
+ */
+static void microvm_class_init(ObjectClass *oc, void *data)
+{
+    MachineClass *mc = MACHINE_CLASS(oc);
+    NMIClass *nc = NMI_CLASS(oc);
+
+    /* Identification */
+    mc->family = "microvm_i386";
+    mc->desc = "Microvm (i386)";
+
+    /* Hooks */
+    mc->init = microvm_machine_state_init;
+    mc->reset = microvm_machine_reset;
+    nc->nmi_monitor_handler = x86_nmi;
+
+    /* Capabilities and defaults */
+    mc->units_per_default_bus = 1;
+    mc->no_floppy = 1;
+    mc->max_cpus = 288;
+    mc->has_hotpluggable_cpus = false;
+    mc->auto_enable_numa_with_memhp = false;
+    mc->nvdimm_supported = false;
+    mc->default_cpu_type = X86_CPU_TYPE_NAME("host");
+    mc->default_machine_opts = "accel=kvm";
+
+    /* Sysbus devices that may be added from the command line */
+    machine_class_allow_dynamic_sysbus_dev(mc, "sysbus-debugcon");
+    machine_class_allow_dynamic_sysbus_dev(mc, "sysbus-debugexit");
+
+    object_class_property_add_bool(oc, MICROVM_MACHINE_LEGACY,
+                                   microvm_machine_get_legacy,
+                                   microvm_machine_set_legacy,
+                                   &error_abort);
+}
+
+/*
+ * Type description for the microvm machine: derives from TYPE_MACHINE and
+ * implements the TYPE_NMI interface (see x86_nmi).
+ */
+static const TypeInfo microvm_machine_info = {
+    .name          = TYPE_MICROVM_MACHINE,
+    .parent        = TYPE_MACHINE,
+    .instance_size = sizeof(MicrovmMachineState),
+    .class_size    = sizeof(MicrovmMachineClass),
+    .class_init    = microvm_class_init,
+    .interfaces = (InterfaceInfo[]) {
+         { TYPE_NMI },
+         { }
+    },
+};
+
+/* Register the microvm machine type; hooked up through type_init(). */
+static void microvm_machine_init(void)
+{
+    type_register_static(&microvm_machine_info);
+}
+type_init(microvm_machine_init);
diff --git a/include/hw/i386/microvm.h b/include/hw/i386/microvm.h
new file mode 100644
index 0000000000..72c89db669
--- /dev/null
+++ b/include/hw/i386/microvm.h
@@ -0,0 +1,77 @@ 
+/*
+ * Copyright (c) 2018 Intel Corporation
+ * Copyright (c) 2019 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_I386_MICROVM_H
+#define HW_I386_MICROVM_H
+
+#include "qemu-common.h"
+#include "exec/hwaddr.h"
+#include "qemu/notify.h"
+
+#include "hw/boards.h"
+
+/* Microvm memory layout: fixed addresses (presumably guest-physical) */
+#define PVH_START_INFO        0x6000
+#define MEMMAP_START          0x7000
+#define BOOT_STACK_POINTER    0x8ff0
+#define PML4_START            0x9000
+#define PDPTE_START           0xa000
+#define PDE_START             0xb000
+#define KERNEL_CMDLINE_START  0x20000
+#define EBDA_START            0x9fc00 /* 639 KiB */
+#define HIMEM_START           0x100000 /* 1 MiB */
+#define MICROVM_MAX_BELOW_4G  0xe0000000 /* 3.5 GiB */
+
+/* Platform virtio definitions. NOTE(review): base < MICROVM_MAX_BELOW_4G */
+#define VIRTIO_MMIO_BASE      0xd0000000 /* confirm no RAM overlap */
+#define VIRTIO_IRQ_BASE       5
+#define VIRTIO_NUM_TRANSPORTS 8
+#define VIRTIO_CMDLINE_MAXLEN 64
+
+/* Machine type options: names of the machine's QOM properties */
+#define MICROVM_MACHINE_LEGACY "legacy"
+
+typedef struct { /* class data of TYPE_MICROVM_MACHINE */
+    MachineClass parent; /* parent class (TYPE_MACHINE) */
+    HotplugHandler *(*orig_hotplug_handler)(MachineState *machine,
+                                           DeviceState *dev);
+} MicrovmMachineClass; /* NOTE(review): handler never set in class_init? */
+
+typedef struct { /* per-machine instance state for TYPE_MICROVM_MACHINE */
+    MachineState parent; /* parent instance (TYPE_MACHINE) */
+    qemu_irq *gsi; /* NOTE(review): presumably the GSI lines - confirm */
+
+    /* RAM size, split into the parts below and above 4G */
+    ram_addr_t below_4g_mem_size;
+    ram_addr_t above_4g_mem_size;
+
+    /* Kernel ELF entry point. On reset, each vCPU's RIP is set to this */
+    uint64_t elf_entry;
+
+    /* Legacy mode based on an ISA bus. Useful for debugging */
+    bool legacy;
+} MicrovmMachineState;
+
+#define TYPE_MICROVM_MACHINE   MACHINE_TYPE_NAME("microvm") /* QOM type name */
+#define MICROVM_MACHINE(obj) \
+    OBJECT_CHECK(MicrovmMachineState, (obj), TYPE_MICROVM_MACHINE)
+#define MICROVM_MACHINE_GET_CLASS(obj) \
+    OBJECT_GET_CLASS(MicrovmMachineClass, (obj), TYPE_MICROVM_MACHINE)
+#define MICROVM_MACHINE_CLASS(klass) \
+    OBJECT_CLASS_CHECK(MicrovmMachineClass, (klass), TYPE_MICROVM_MACHINE)
+
+#endif