Patchwork [15/15] kvm: x86: Introduce kvmclock device to save/restore its state

login
register
mail settings
Submitter Jan Kiszka
Date Feb. 7, 2011, 11:19 a.m.
Message ID <f28a16f3560d93574ff7f7cc6cdef411bc90a2dc.1297077507.git.jan.kiszka@siemens.com>
Download mbox | patch
Permalink /patch/82164/
State New
Headers show

Comments

Jan Kiszka - Feb. 7, 2011, 11:19 a.m.
If kvmclock is used, which implies the kernel supports it, register a
kvmclock device with the sysbus. Its main purpose is to save and restore
the kernel state on migration, but it will also make it possible to
visualize that state one day.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
CC: Glauber Costa <glommer@redhat.com>
---
 Makefile.target |    4 +-
 hw/kvmclock.c   |  125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/kvmclock.h   |   14 ++++++
 hw/pc_piix.c    |   31 +++++++++++---
 4 files changed, 165 insertions(+), 9 deletions(-)
 create mode 100644 hw/kvmclock.c
 create mode 100644 hw/kvmclock.h
Glauber Costa - Feb. 7, 2011, 12:27 p.m.
On Mon, 2011-02-07 at 12:19 +0100, Jan Kiszka wrote:
> If kvmclock is used, which implies the kernel supports it, register a
> kvmclock device with the sysbus. Its main purpose is to save and restore
> the kernel state on migration, but this will also allow to visualize it
> one day.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> CC: Glauber Costa <glommer@redhat.com>
> ---
>  Makefile.target |    4 +-
>  hw/kvmclock.c   |  125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/kvmclock.h   |   14 ++++++
>  hw/pc_piix.c    |   31 +++++++++++---
>  4 files changed, 165 insertions(+), 9 deletions(-)
>  create mode 100644 hw/kvmclock.c
>  create mode 100644 hw/kvmclock.h
> 
> diff --git a/Makefile.target b/Makefile.target
> index b0ba95f..30232fa 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -37,7 +37,7 @@ ifndef CONFIG_HAIKU
>  LIBS+=-lm
>  endif
>  
> -kvm.o kvm-all.o vhost.o vhost_net.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
> +kvm.o kvm-all.o vhost.o vhost_net.o kvmclock.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
>  
>  config-target.h: config-target.h-timestamp
>  config-target.h-timestamp: config-target.mak
> @@ -218,7 +218,7 @@ obj-i386-y += cirrus_vga.o apic.o ioapic.o piix_pci.o
>  obj-i386-y += vmmouse.o vmport.o hpet.o applesmc.o
>  obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
>  obj-i386-y += debugcon.o multiboot.o
> -obj-i386-y += pc_piix.o
> +obj-i386-y += pc_piix.o kvmclock.o
>  obj-i386-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
>  
>  # shared objects
> diff --git a/hw/kvmclock.c b/hw/kvmclock.c
> new file mode 100644
> index 0000000..b6ceddf
> --- /dev/null
> +++ b/hw/kvmclock.c
> @@ -0,0 +1,125 @@
> +/*
> + * QEMU KVM support, paravirtual clock device
> + *
> + * Copyright (C) 2011 Siemens AG
> + *
> + * Authors:
> + *  Jan Kiszka        <jan.kiszka@siemens.com>
> + *
> + * This work is licensed under the terms of the GNU GPL version 2.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include "sysemu.h"
> +#include "sysbus.h"
> +#include "kvm.h"
> +#include "kvmclock.h"
> +
> +#if defined(CONFIG_KVM_PARA) && defined(KVM_CAP_ADJUST_CLOCK)
> +
> +#include <linux/kvm.h>
> +#include <linux/kvm_para.h>
> +
> +typedef struct KVMClockState {
> +    SysBusDevice busdev;
> +    uint64_t clock;
> +    bool clock_valid;
> +} KVMClockState;
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +    struct kvm_clock_data data;
> +    int ret;
> +
> +    if (s->clock_valid) {
> +        return;
> +    }
> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> +    if (ret < 0) {
> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> +        data.clock = 0;
> +    }
> +    s->clock = data.clock;
> +    /*
> +     * If the VM is stopped, declare the clock state valid to avoid re-reading
> +     * it on next vmsave (which would return a different value). Will be reset
> +     * when the VM is continued.
> +     */
> +    s->clock_valid = !vm_running;
> +}
> +
> +static int kvmclock_post_load(void *opaque, int version_id)
> +{
> +    KVMClockState *s = opaque;
> +    struct kvm_clock_data data;
> +
> +    data.clock = s->clock;
> +    data.flags = 0;
> +    return kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> +}
> +
> +static void kvmclock_vm_state_change(void *opaque, int running, int reason)
> +{
> +    KVMClockState *s = opaque;
> +
> +    if (running) {
> +        s->clock_valid = false;
> +    }
> +}
> +
> +static int kvmclock_init(SysBusDevice *dev)
> +{
> +    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
> +
> +    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> +    return 0;
> +}
> +
> +static const VMStateDescription kvmclock_vmsd = {
> +    .name = "kvmclock",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .pre_save = kvmclock_pre_save,
> +    .post_load = kvmclock_post_load,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT64(clock, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static SysBusDeviceInfo kvmclock_info = {
> +    .qdev.name = "kvmclock",
> +    .qdev.size = sizeof(KVMClockState),
> +    .qdev.vmsd = &kvmclock_vmsd,
> +    .qdev.no_user = 1,
> +    .init = kvmclock_init,
> +};
> +
> +/* Note: Must be called after VCPU initialization. */
> +void kvmclock_create(void)
> +{
> +    if (kvm_enabled() &&
> +        first_cpu->cpuid_kvm_features & (1ULL << KVM_FEATURE_CLOCKSOURCE)) {
> +        sysbus_create_simple("kvmclock", -1, NULL);
> +    }
> +}
> +
> +static void kvmclock_register_device(void)
> +{
> +    if (kvm_enabled()) {
> +        sysbus_register_withprop(&kvmclock_info);
> +    }
> +}
> +
> +device_init(kvmclock_register_device);
> +
> +#else /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
> +
> +void kvmclock_create(void)
> +{
> +}
> +#endif /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
> diff --git a/hw/kvmclock.h b/hw/kvmclock.h
> new file mode 100644
> index 0000000..7a83cbe
> --- /dev/null
> +++ b/hw/kvmclock.h
> @@ -0,0 +1,14 @@
> +/*
> + * QEMU KVM support, paravirtual clock device
> + *
> + * Copyright (C) 2011 Siemens AG
> + *
> + * Authors:
> + *  Jan Kiszka        <jan.kiszka@siemens.com>
> + *
> + * This work is licensed under the terms of the GNU GPL version 2.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +void kvmclock_create(void);
> diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> index 7b74473..9bc4659 100644
> --- a/hw/pc_piix.c
> +++ b/hw/pc_piix.c
> @@ -32,6 +32,7 @@
>  #include "boards.h"
>  #include "ide.h"
>  #include "kvm.h"
> +#include "kvmclock.h"
>  #include "sysemu.h"
>  #include "sysbus.h"
>  #include "arch_init.h"
> @@ -66,7 +67,8 @@ static void pc_init1(ram_addr_t ram_size,
>                       const char *kernel_cmdline,
>                       const char *initrd_filename,
>                       const char *cpu_model,
> -                     int pci_enabled)
> +                     int pci_enabled,
> +                     int kvmclock_enabled)
>  
What exactly is your motivation for that? I think mid/long-term
we should be making machine initialization more common among
architectures, not introducing more arch-specific or, even worse,
KVM-specific parameters here.

I'd like to understand what we gain from that, since opting kvmclock
in or out is done by cpuid anyway - no need for a specific machine.
Jan Kiszka - Feb. 7, 2011, 12:36 p.m.
On 2011-02-07 13:27, Glauber Costa wrote:
> On Mon, 2011-02-07 at 12:19 +0100, Jan Kiszka wrote:
>> If kvmclock is used, which implies the kernel supports it, register a
>> kvmclock device with the sysbus. Its main purpose is to save and restore
>> the kernel state on migration, but this will also allow to visualize it
>> one day.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> CC: Glauber Costa <glommer@redhat.com>
>> ---
>>  Makefile.target |    4 +-
>>  hw/kvmclock.c   |  125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  hw/kvmclock.h   |   14 ++++++
>>  hw/pc_piix.c    |   31 +++++++++++---
>>  4 files changed, 165 insertions(+), 9 deletions(-)
>>  create mode 100644 hw/kvmclock.c
>>  create mode 100644 hw/kvmclock.h
>>
>> diff --git a/Makefile.target b/Makefile.target
>> index b0ba95f..30232fa 100644
>> --- a/Makefile.target
>> +++ b/Makefile.target
>> @@ -37,7 +37,7 @@ ifndef CONFIG_HAIKU
>>  LIBS+=-lm
>>  endif
>>  
>> -kvm.o kvm-all.o vhost.o vhost_net.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
>> +kvm.o kvm-all.o vhost.o vhost_net.o kvmclock.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
>>  
>>  config-target.h: config-target.h-timestamp
>>  config-target.h-timestamp: config-target.mak
>> @@ -218,7 +218,7 @@ obj-i386-y += cirrus_vga.o apic.o ioapic.o piix_pci.o
>>  obj-i386-y += vmmouse.o vmport.o hpet.o applesmc.o
>>  obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
>>  obj-i386-y += debugcon.o multiboot.o
>> -obj-i386-y += pc_piix.o
>> +obj-i386-y += pc_piix.o kvmclock.o
>>  obj-i386-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
>>  
>>  # shared objects
>> diff --git a/hw/kvmclock.c b/hw/kvmclock.c
>> new file mode 100644
>> index 0000000..b6ceddf
>> --- /dev/null
>> +++ b/hw/kvmclock.c
>> @@ -0,0 +1,125 @@
>> +/*
>> + * QEMU KVM support, paravirtual clock device
>> + *
>> + * Copyright (C) 2011 Siemens AG
>> + *
>> + * Authors:
>> + *  Jan Kiszka        <jan.kiszka@siemens.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL version 2.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "qemu-common.h"
>> +#include "sysemu.h"
>> +#include "sysbus.h"
>> +#include "kvm.h"
>> +#include "kvmclock.h"
>> +
>> +#if defined(CONFIG_KVM_PARA) && defined(KVM_CAP_ADJUST_CLOCK)
>> +
>> +#include <linux/kvm.h>
>> +#include <linux/kvm_para.h>
>> +
>> +typedef struct KVMClockState {
>> +    SysBusDevice busdev;
>> +    uint64_t clock;
>> +    bool clock_valid;
>> +} KVMClockState;
>> +
>> +static void kvmclock_pre_save(void *opaque)
>> +{
>> +    KVMClockState *s = opaque;
>> +    struct kvm_clock_data data;
>> +    int ret;
>> +
>> +    if (s->clock_valid) {
>> +        return;
>> +    }
>> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
>> +    if (ret < 0) {
>> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>> +        data.clock = 0;
>> +    }
>> +    s->clock = data.clock;
>> +    /*
>> +     * If the VM is stopped, declare the clock state valid to avoid re-reading
>> +     * it on next vmsave (which would return a different value). Will be reset
>> +     * when the VM is continued.
>> +     */
>> +    s->clock_valid = !vm_running;
>> +}
>> +
>> +static int kvmclock_post_load(void *opaque, int version_id)
>> +{
>> +    KVMClockState *s = opaque;
>> +    struct kvm_clock_data data;
>> +
>> +    data.clock = s->clock;
>> +    data.flags = 0;
>> +    return kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>> +}
>> +
>> +static void kvmclock_vm_state_change(void *opaque, int running, int reason)
>> +{
>> +    KVMClockState *s = opaque;
>> +
>> +    if (running) {
>> +        s->clock_valid = false;
>> +    }
>> +}
>> +
>> +static int kvmclock_init(SysBusDevice *dev)
>> +{
>> +    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
>> +
>> +    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>> +    return 0;
>> +}
>> +
>> +static const VMStateDescription kvmclock_vmsd = {
>> +    .name = "kvmclock",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .minimum_version_id_old = 1,
>> +    .pre_save = kvmclock_pre_save,
>> +    .post_load = kvmclock_post_load,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_UINT64(clock, KVMClockState),
>> +        VMSTATE_END_OF_LIST()
>> +    }
>> +};
>> +
>> +static SysBusDeviceInfo kvmclock_info = {
>> +    .qdev.name = "kvmclock",
>> +    .qdev.size = sizeof(KVMClockState),
>> +    .qdev.vmsd = &kvmclock_vmsd,
>> +    .qdev.no_user = 1,
>> +    .init = kvmclock_init,
>> +};
>> +
>> +/* Note: Must be called after VCPU initialization. */
>> +void kvmclock_create(void)
>> +{
>> +    if (kvm_enabled() &&
>> +        first_cpu->cpuid_kvm_features & (1ULL << KVM_FEATURE_CLOCKSOURCE)) {
>> +        sysbus_create_simple("kvmclock", -1, NULL);
>> +    }
>> +}
>> +
>> +static void kvmclock_register_device(void)
>> +{
>> +    if (kvm_enabled()) {
>> +        sysbus_register_withprop(&kvmclock_info);
>> +    }
>> +}
>> +
>> +device_init(kvmclock_register_device);
>> +
>> +#else /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
>> +
>> +void kvmclock_create(void)
>> +{
>> +}
>> +#endif /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
>> diff --git a/hw/kvmclock.h b/hw/kvmclock.h
>> new file mode 100644
>> index 0000000..7a83cbe
>> --- /dev/null
>> +++ b/hw/kvmclock.h
>> @@ -0,0 +1,14 @@
>> +/*
>> + * QEMU KVM support, paravirtual clock device
>> + *
>> + * Copyright (C) 2011 Siemens AG
>> + *
>> + * Authors:
>> + *  Jan Kiszka        <jan.kiszka@siemens.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL version 2.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +void kvmclock_create(void);
>> diff --git a/hw/pc_piix.c b/hw/pc_piix.c
>> index 7b74473..9bc4659 100644
>> --- a/hw/pc_piix.c
>> +++ b/hw/pc_piix.c
>> @@ -32,6 +32,7 @@
>>  #include "boards.h"
>>  #include "ide.h"
>>  #include "kvm.h"
>> +#include "kvmclock.h"
>>  #include "sysemu.h"
>>  #include "sysbus.h"
>>  #include "arch_init.h"
>> @@ -66,7 +67,8 @@ static void pc_init1(ram_addr_t ram_size,
>>                       const char *kernel_cmdline,
>>                       const char *initrd_filename,
>>                       const char *cpu_model,
>> -                     int pci_enabled)
>> +                     int pci_enabled,
>> +                     int kvmclock_enabled)
>>  
> What exactly is your motivation to that ? I think mid/long-term
> we should be making machine initialization more common among
> architectures, not introducing more arch specific, or even worse, kvm
> specific parameters here.
> 
> I'd like to understand what do we gain from that, since opting kvmclock
> in our out is done by cpuid anyway - no need for a specific machine.

Is that really the case? I thought we were already shipping versions
where that CPU feature was enabled by default. If not, I'll happily drop
that admittedly clumsy approach above.

Jan
Glauber Costa - Feb. 7, 2011, 1:40 p.m.
On Mon, 2011-02-07 at 13:36 +0100, Jan Kiszka wrote:
> On 2011-02-07 13:27, Glauber Costa wrote:
> > On Mon, 2011-02-07 at 12:19 +0100, Jan Kiszka wrote:
> >> If kvmclock is used, which implies the kernel supports it, register a
> >> kvmclock device with the sysbus. Its main purpose is to save and restore
> >> the kernel state on migration, but this will also allow to visualize it
> >> one day.
> >>
> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >> CC: Glauber Costa <glommer@redhat.com>
> >> ---
> >>  Makefile.target |    4 +-
> >>  hw/kvmclock.c   |  125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  hw/kvmclock.h   |   14 ++++++
> >>  hw/pc_piix.c    |   31 +++++++++++---
> >>  4 files changed, 165 insertions(+), 9 deletions(-)
> >>  create mode 100644 hw/kvmclock.c
> >>  create mode 100644 hw/kvmclock.h
> >>
> >> diff --git a/Makefile.target b/Makefile.target
> >> index b0ba95f..30232fa 100644
> >> --- a/Makefile.target
> >> +++ b/Makefile.target
> >> @@ -37,7 +37,7 @@ ifndef CONFIG_HAIKU
> >>  LIBS+=-lm
> >>  endif
> >>  
> >> -kvm.o kvm-all.o vhost.o vhost_net.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
> >> +kvm.o kvm-all.o vhost.o vhost_net.o kvmclock.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
> >>  
> >>  config-target.h: config-target.h-timestamp
> >>  config-target.h-timestamp: config-target.mak
> >> @@ -218,7 +218,7 @@ obj-i386-y += cirrus_vga.o apic.o ioapic.o piix_pci.o
> >>  obj-i386-y += vmmouse.o vmport.o hpet.o applesmc.o
> >>  obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
> >>  obj-i386-y += debugcon.o multiboot.o
> >> -obj-i386-y += pc_piix.o
> >> +obj-i386-y += pc_piix.o kvmclock.o
> >>  obj-i386-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
> >>  
> >>  # shared objects
> >> diff --git a/hw/kvmclock.c b/hw/kvmclock.c
> >> new file mode 100644
> >> index 0000000..b6ceddf
> >> --- /dev/null
> >> +++ b/hw/kvmclock.c
> >> @@ -0,0 +1,125 @@
> >> +/*
> >> + * QEMU KVM support, paravirtual clock device
> >> + *
> >> + * Copyright (C) 2011 Siemens AG
> >> + *
> >> + * Authors:
> >> + *  Jan Kiszka        <jan.kiszka@siemens.com>
> >> + *
> >> + * This work is licensed under the terms of the GNU GPL version 2.
> >> + * See the COPYING file in the top-level directory.
> >> + *
> >> + */
> >> +
> >> +#include "qemu-common.h"
> >> +#include "sysemu.h"
> >> +#include "sysbus.h"
> >> +#include "kvm.h"
> >> +#include "kvmclock.h"
> >> +
> >> +#if defined(CONFIG_KVM_PARA) && defined(KVM_CAP_ADJUST_CLOCK)
> >> +
> >> +#include <linux/kvm.h>
> >> +#include <linux/kvm_para.h>
> >> +
> >> +typedef struct KVMClockState {
> >> +    SysBusDevice busdev;
> >> +    uint64_t clock;
> >> +    bool clock_valid;
> >> +} KVMClockState;
> >> +
> >> +static void kvmclock_pre_save(void *opaque)
> >> +{
> >> +    KVMClockState *s = opaque;
> >> +    struct kvm_clock_data data;
> >> +    int ret;
> >> +
> >> +    if (s->clock_valid) {
> >> +        return;
> >> +    }
> >> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> >> +    if (ret < 0) {
> >> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> >> +        data.clock = 0;
> >> +    }
> >> +    s->clock = data.clock;
> >> +    /*
> >> +     * If the VM is stopped, declare the clock state valid to avoid re-reading
> >> +     * it on next vmsave (which would return a different value). Will be reset
> >> +     * when the VM is continued.
> >> +     */
> >> +    s->clock_valid = !vm_running;
> >> +}
> >> +
> >> +static int kvmclock_post_load(void *opaque, int version_id)
> >> +{
> >> +    KVMClockState *s = opaque;
> >> +    struct kvm_clock_data data;
> >> +
> >> +    data.clock = s->clock;
> >> +    data.flags = 0;
> >> +    return kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >> +}
> >> +
> >> +static void kvmclock_vm_state_change(void *opaque, int running, int reason)
> >> +{
> >> +    KVMClockState *s = opaque;
> >> +
> >> +    if (running) {
> >> +        s->clock_valid = false;
> >> +    }
> >> +}
> >> +
> >> +static int kvmclock_init(SysBusDevice *dev)
> >> +{
> >> +    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
> >> +
> >> +    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >> +    return 0;
> >> +}
> >> +
> >> +static const VMStateDescription kvmclock_vmsd = {
> >> +    .name = "kvmclock",
> >> +    .version_id = 1,
> >> +    .minimum_version_id = 1,
> >> +    .minimum_version_id_old = 1,
> >> +    .pre_save = kvmclock_pre_save,
> >> +    .post_load = kvmclock_post_load,
> >> +    .fields = (VMStateField[]) {
> >> +        VMSTATE_UINT64(clock, KVMClockState),
> >> +        VMSTATE_END_OF_LIST()
> >> +    }
> >> +};
> >> +
> >> +static SysBusDeviceInfo kvmclock_info = {
> >> +    .qdev.name = "kvmclock",
> >> +    .qdev.size = sizeof(KVMClockState),
> >> +    .qdev.vmsd = &kvmclock_vmsd,
> >> +    .qdev.no_user = 1,
> >> +    .init = kvmclock_init,
> >> +};
> >> +
> >> +/* Note: Must be called after VCPU initialization. */
> >> +void kvmclock_create(void)
> >> +{
> >> +    if (kvm_enabled() &&
> >> +        first_cpu->cpuid_kvm_features & (1ULL << KVM_FEATURE_CLOCKSOURCE)) {
> >> +        sysbus_create_simple("kvmclock", -1, NULL);
> >> +    }
> >> +}
> >> +
> >> +static void kvmclock_register_device(void)
> >> +{
> >> +    if (kvm_enabled()) {
> >> +        sysbus_register_withprop(&kvmclock_info);
> >> +    }
> >> +}
> >> +
> >> +device_init(kvmclock_register_device);
> >> +
> >> +#else /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
> >> +
> >> +void kvmclock_create(void)
> >> +{
> >> +}
> >> +#endif /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
> >> diff --git a/hw/kvmclock.h b/hw/kvmclock.h
> >> new file mode 100644
> >> index 0000000..7a83cbe
> >> --- /dev/null
> >> +++ b/hw/kvmclock.h
> >> @@ -0,0 +1,14 @@
> >> +/*
> >> + * QEMU KVM support, paravirtual clock device
> >> + *
> >> + * Copyright (C) 2011 Siemens AG
> >> + *
> >> + * Authors:
> >> + *  Jan Kiszka        <jan.kiszka@siemens.com>
> >> + *
> >> + * This work is licensed under the terms of the GNU GPL version 2.
> >> + * See the COPYING file in the top-level directory.
> >> + *
> >> + */
> >> +
> >> +void kvmclock_create(void);
> >> diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> >> index 7b74473..9bc4659 100644
> >> --- a/hw/pc_piix.c
> >> +++ b/hw/pc_piix.c
> >> @@ -32,6 +32,7 @@
> >>  #include "boards.h"
> >>  #include "ide.h"
> >>  #include "kvm.h"
> >> +#include "kvmclock.h"
> >>  #include "sysemu.h"
> >>  #include "sysbus.h"
> >>  #include "arch_init.h"
> >> @@ -66,7 +67,8 @@ static void pc_init1(ram_addr_t ram_size,
> >>                       const char *kernel_cmdline,
> >>                       const char *initrd_filename,
> >>                       const char *cpu_model,
> >> -                     int pci_enabled)
> >> +                     int pci_enabled,
> >> +                     int kvmclock_enabled)
> >>  
> > What exactly is your motivation to that ? I think mid/long-term
> > we should be making machine initialization more common among
> > architectures, not introducing more arch specific, or even worse, kvm
> > specific parameters here.
> > 
> > I'd like to understand what do we gain from that, since opting kvmclock
> > in our out is done by cpuid anyway - no need for a specific machine.
> 
> Is that really the case? I thought we were already shipping versions
> where that CPU feature was enabled by default. If not, I'll happily drop
> that admittedly clumsy approach above.

Yes, AFAIK kvmclock is enabled by default and is disabled through its
cpuid feature flag, as in:

    -cpu kvm64,-kvmclock

So your test for the cpuid bit before starting kvmclock should already
cover it.
Jan Kiszka - Feb. 7, 2011, 2:03 p.m.
On 2011-02-07 14:40, Glauber Costa wrote:
> On Mon, 2011-02-07 at 13:36 +0100, Jan Kiszka wrote:
>> On 2011-02-07 13:27, Glauber Costa wrote:
>>> On Mon, 2011-02-07 at 12:19 +0100, Jan Kiszka wrote:
>>>> If kvmclock is used, which implies the kernel supports it, register a
>>>> kvmclock device with the sysbus. Its main purpose is to save and restore
>>>> the kernel state on migration, but this will also allow to visualize it
>>>> one day.
>>>>
>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>> CC: Glauber Costa <glommer@redhat.com>
>>>> ---
>>>>  Makefile.target |    4 +-
>>>>  hw/kvmclock.c   |  125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  hw/kvmclock.h   |   14 ++++++
>>>>  hw/pc_piix.c    |   31 +++++++++++---
>>>>  4 files changed, 165 insertions(+), 9 deletions(-)
>>>>  create mode 100644 hw/kvmclock.c
>>>>  create mode 100644 hw/kvmclock.h
>>>>
>>>> diff --git a/Makefile.target b/Makefile.target
>>>> index b0ba95f..30232fa 100644
>>>> --- a/Makefile.target
>>>> +++ b/Makefile.target
>>>> @@ -37,7 +37,7 @@ ifndef CONFIG_HAIKU
>>>>  LIBS+=-lm
>>>>  endif
>>>>  
>>>> -kvm.o kvm-all.o vhost.o vhost_net.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
>>>> +kvm.o kvm-all.o vhost.o vhost_net.o kvmclock.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
>>>>  
>>>>  config-target.h: config-target.h-timestamp
>>>>  config-target.h-timestamp: config-target.mak
>>>> @@ -218,7 +218,7 @@ obj-i386-y += cirrus_vga.o apic.o ioapic.o piix_pci.o
>>>>  obj-i386-y += vmmouse.o vmport.o hpet.o applesmc.o
>>>>  obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
>>>>  obj-i386-y += debugcon.o multiboot.o
>>>> -obj-i386-y += pc_piix.o
>>>> +obj-i386-y += pc_piix.o kvmclock.o
>>>>  obj-i386-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
>>>>  
>>>>  # shared objects
>>>> diff --git a/hw/kvmclock.c b/hw/kvmclock.c
>>>> new file mode 100644
>>>> index 0000000..b6ceddf
>>>> --- /dev/null
>>>> +++ b/hw/kvmclock.c
>>>> @@ -0,0 +1,125 @@
>>>> +/*
>>>> + * QEMU KVM support, paravirtual clock device
>>>> + *
>>>> + * Copyright (C) 2011 Siemens AG
>>>> + *
>>>> + * Authors:
>>>> + *  Jan Kiszka        <jan.kiszka@siemens.com>
>>>> + *
>>>> + * This work is licensed under the terms of the GNU GPL version 2.
>>>> + * See the COPYING file in the top-level directory.
>>>> + *
>>>> + */
>>>> +
>>>> +#include "qemu-common.h"
>>>> +#include "sysemu.h"
>>>> +#include "sysbus.h"
>>>> +#include "kvm.h"
>>>> +#include "kvmclock.h"
>>>> +
>>>> +#if defined(CONFIG_KVM_PARA) && defined(KVM_CAP_ADJUST_CLOCK)
>>>> +
>>>> +#include <linux/kvm.h>
>>>> +#include <linux/kvm_para.h>
>>>> +
>>>> +typedef struct KVMClockState {
>>>> +    SysBusDevice busdev;
>>>> +    uint64_t clock;
>>>> +    bool clock_valid;
>>>> +} KVMClockState;
>>>> +
>>>> +static void kvmclock_pre_save(void *opaque)
>>>> +{
>>>> +    KVMClockState *s = opaque;
>>>> +    struct kvm_clock_data data;
>>>> +    int ret;
>>>> +
>>>> +    if (s->clock_valid) {
>>>> +        return;
>>>> +    }
>>>> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
>>>> +    if (ret < 0) {
>>>> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>>>> +        data.clock = 0;
>>>> +    }
>>>> +    s->clock = data.clock;
>>>> +    /*
>>>> +     * If the VM is stopped, declare the clock state valid to avoid re-reading
>>>> +     * it on next vmsave (which would return a different value). Will be reset
>>>> +     * when the VM is continued.
>>>> +     */
>>>> +    s->clock_valid = !vm_running;
>>>> +}
>>>> +
>>>> +static int kvmclock_post_load(void *opaque, int version_id)
>>>> +{
>>>> +    KVMClockState *s = opaque;
>>>> +    struct kvm_clock_data data;
>>>> +
>>>> +    data.clock = s->clock;
>>>> +    data.flags = 0;
>>>> +    return kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>>>> +}
>>>> +
>>>> +static void kvmclock_vm_state_change(void *opaque, int running, int reason)
>>>> +{
>>>> +    KVMClockState *s = opaque;
>>>> +
>>>> +    if (running) {
>>>> +        s->clock_valid = false;
>>>> +    }
>>>> +}
>>>> +
>>>> +static int kvmclock_init(SysBusDevice *dev)
>>>> +{
>>>> +    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
>>>> +
>>>> +    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static const VMStateDescription kvmclock_vmsd = {
>>>> +    .name = "kvmclock",
>>>> +    .version_id = 1,
>>>> +    .minimum_version_id = 1,
>>>> +    .minimum_version_id_old = 1,
>>>> +    .pre_save = kvmclock_pre_save,
>>>> +    .post_load = kvmclock_post_load,
>>>> +    .fields = (VMStateField[]) {
>>>> +        VMSTATE_UINT64(clock, KVMClockState),
>>>> +        VMSTATE_END_OF_LIST()
>>>> +    }
>>>> +};
>>>> +
>>>> +static SysBusDeviceInfo kvmclock_info = {
>>>> +    .qdev.name = "kvmclock",
>>>> +    .qdev.size = sizeof(KVMClockState),
>>>> +    .qdev.vmsd = &kvmclock_vmsd,
>>>> +    .qdev.no_user = 1,
>>>> +    .init = kvmclock_init,
>>>> +};
>>>> +
>>>> +/* Note: Must be called after VCPU initialization. */
>>>> +void kvmclock_create(void)
>>>> +{
>>>> +    if (kvm_enabled() &&
>>>> +        first_cpu->cpuid_kvm_features & (1ULL << KVM_FEATURE_CLOCKSOURCE)) {
>>>> +        sysbus_create_simple("kvmclock", -1, NULL);
>>>> +    }
>>>> +}
>>>> +
>>>> +static void kvmclock_register_device(void)
>>>> +{
>>>> +    if (kvm_enabled()) {
>>>> +        sysbus_register_withprop(&kvmclock_info);
>>>> +    }
>>>> +}
>>>> +
>>>> +device_init(kvmclock_register_device);
>>>> +
>>>> +#else /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
>>>> +
>>>> +void kvmclock_create(void)
>>>> +{
>>>> +}
>>>> +#endif /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
>>>> diff --git a/hw/kvmclock.h b/hw/kvmclock.h
>>>> new file mode 100644
>>>> index 0000000..7a83cbe
>>>> --- /dev/null
>>>> +++ b/hw/kvmclock.h
>>>> @@ -0,0 +1,14 @@
>>>> +/*
>>>> + * QEMU KVM support, paravirtual clock device
>>>> + *
>>>> + * Copyright (C) 2011 Siemens AG
>>>> + *
>>>> + * Authors:
>>>> + *  Jan Kiszka        <jan.kiszka@siemens.com>
>>>> + *
>>>> + * This work is licensed under the terms of the GNU GPL version 2.
>>>> + * See the COPYING file in the top-level directory.
>>>> + *
>>>> + */
>>>> +
>>>> +void kvmclock_create(void);
>>>> diff --git a/hw/pc_piix.c b/hw/pc_piix.c
>>>> index 7b74473..9bc4659 100644
>>>> --- a/hw/pc_piix.c
>>>> +++ b/hw/pc_piix.c
>>>> @@ -32,6 +32,7 @@
>>>>  #include "boards.h"
>>>>  #include "ide.h"
>>>>  #include "kvm.h"
>>>> +#include "kvmclock.h"
>>>>  #include "sysemu.h"
>>>>  #include "sysbus.h"
>>>>  #include "arch_init.h"
>>>> @@ -66,7 +67,8 @@ static void pc_init1(ram_addr_t ram_size,
>>>>                       const char *kernel_cmdline,
>>>>                       const char *initrd_filename,
>>>>                       const char *cpu_model,
>>>> -                     int pci_enabled)
>>>> +                     int pci_enabled,
>>>> +                     int kvmclock_enabled)
>>>>  
>>> What exactly is your motivation to that ? I think mid/long-term
>>> we should be making machine initialization more common among
>>> architectures, not introducing more arch specific, or even worse, kvm
>>> specific parameters here.
>>>
>>> I'd like to understand what do we gain from that, since opting kvmclock
>>> in our out is done by cpuid anyway - no need for a specific machine.
>>
>> Is that really the case? I thought we were already shipping versions
>> where that CPU feature was enabled by default. If not, I'll happily drop
>> that admittedly clumsy approach above.
> 
> Yes, AFAIK, kvmclock is enabled by default, disabled by cpuid-leaf, as
> in
> -cpu kvm64,-kvmclock
> 
> So your test for cpuid bit before starting kvmclock should already cover
> it.
> 

No, quite the contrary: as kvmclock was always enabled in older versions
and the compat machines also expose it, we cannot rely on the flag to
enable this new (and therefore 0.15-only) vmstate.

Jan
Glauber Costa - Feb. 7, 2011, 6:04 p.m.
On Mon, 2011-02-07 at 15:03 +0100, Jan Kiszka wrote:
> On 2011-02-07 14:40, Glauber Costa wrote:
> > On Mon, 2011-02-07 at 13:36 +0100, Jan Kiszka wrote:
> >> On 2011-02-07 13:27, Glauber Costa wrote:
> >>> On Mon, 2011-02-07 at 12:19 +0100, Jan Kiszka wrote:
> >>>> If kvmclock is used, which implies the kernel supports it, register a
> >>>> kvmclock device with the sysbus. Its main purpose is to save and restore
> >>>> the kernel state on migration, but this will also allow to visualize it
> >>>> one day.
> >>>>
> >>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >>>> CC: Glauber Costa <glommer@redhat.com>
> >>>> ---
> >>>>  Makefile.target |    4 +-
> >>>>  hw/kvmclock.c   |  125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>  hw/kvmclock.h   |   14 ++++++
> >>>>  hw/pc_piix.c    |   31 +++++++++++---
> >>>>  4 files changed, 165 insertions(+), 9 deletions(-)
> >>>>  create mode 100644 hw/kvmclock.c
> >>>>  create mode 100644 hw/kvmclock.h
> >>>>
> >>>> diff --git a/Makefile.target b/Makefile.target
> >>>> index b0ba95f..30232fa 100644
> >>>> --- a/Makefile.target
> >>>> +++ b/Makefile.target
> >>>> @@ -37,7 +37,7 @@ ifndef CONFIG_HAIKU
> >>>>  LIBS+=-lm
> >>>>  endif
> >>>>  
> >>>> -kvm.o kvm-all.o vhost.o vhost_net.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
> >>>> +kvm.o kvm-all.o vhost.o vhost_net.o kvmclock.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
> >>>>  
> >>>>  config-target.h: config-target.h-timestamp
> >>>>  config-target.h-timestamp: config-target.mak
> >>>> @@ -218,7 +218,7 @@ obj-i386-y += cirrus_vga.o apic.o ioapic.o piix_pci.o
> >>>>  obj-i386-y += vmmouse.o vmport.o hpet.o applesmc.o
> >>>>  obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
> >>>>  obj-i386-y += debugcon.o multiboot.o
> >>>> -obj-i386-y += pc_piix.o
> >>>> +obj-i386-y += pc_piix.o kvmclock.o
> >>>>  obj-i386-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
> >>>>  
> >>>>  # shared objects
> >>>> diff --git a/hw/kvmclock.c b/hw/kvmclock.c
> >>>> new file mode 100644
> >>>> index 0000000..b6ceddf
> >>>> --- /dev/null
> >>>> +++ b/hw/kvmclock.c
> >>>> @@ -0,0 +1,125 @@
> >>>> +/*
> >>>> + * QEMU KVM support, paravirtual clock device
> >>>> + *
> >>>> + * Copyright (C) 2011 Siemens AG
> >>>> + *
> >>>> + * Authors:
> >>>> + *  Jan Kiszka        <jan.kiszka@siemens.com>
> >>>> + *
> >>>> + * This work is licensed under the terms of the GNU GPL version 2.
> >>>> + * See the COPYING file in the top-level directory.
> >>>> + *
> >>>> + */
> >>>> +
> >>>> +#include "qemu-common.h"
> >>>> +#include "sysemu.h"
> >>>> +#include "sysbus.h"
> >>>> +#include "kvm.h"
> >>>> +#include "kvmclock.h"
> >>>> +
> >>>> +#if defined(CONFIG_KVM_PARA) && defined(KVM_CAP_ADJUST_CLOCK)
> >>>> +
> >>>> +#include <linux/kvm.h>
> >>>> +#include <linux/kvm_para.h>
> >>>> +
> >>>> +typedef struct KVMClockState {
> >>>> +    SysBusDevice busdev;
> >>>> +    uint64_t clock;
> >>>> +    bool clock_valid;
> >>>> +} KVMClockState;
> >>>> +
> >>>> +static void kvmclock_pre_save(void *opaque)
> >>>> +{
> >>>> +    KVMClockState *s = opaque;
> >>>> +    struct kvm_clock_data data;
> >>>> +    int ret;
> >>>> +
> >>>> +    if (s->clock_valid) {
> >>>> +        return;
> >>>> +    }
> >>>> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> >>>> +    if (ret < 0) {
> >>>> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> >>>> +        data.clock = 0;
> >>>> +    }
> >>>> +    s->clock = data.clock;
> >>>> +    /*
> >>>> +     * If the VM is stopped, declare the clock state valid to avoid re-reading
> >>>> +     * it on next vmsave (which would return a different value). Will be reset
> >>>> +     * when the VM is continued.
> >>>> +     */
> >>>> +    s->clock_valid = !vm_running;
> >>>> +}
> >>>> +
> >>>> +static int kvmclock_post_load(void *opaque, int version_id)
> >>>> +{
> >>>> +    KVMClockState *s = opaque;
> >>>> +    struct kvm_clock_data data;
> >>>> +
> >>>> +    data.clock = s->clock;
> >>>> +    data.flags = 0;
> >>>> +    return kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >>>> +}
> >>>> +
> >>>> +static void kvmclock_vm_state_change(void *opaque, int running, int reason)
> >>>> +{
> >>>> +    KVMClockState *s = opaque;
> >>>> +
> >>>> +    if (running) {
> >>>> +        s->clock_valid = false;
> >>>> +    }
> >>>> +}
> >>>> +
> >>>> +static int kvmclock_init(SysBusDevice *dev)
> >>>> +{
> >>>> +    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
> >>>> +
> >>>> +    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >>>> +    return 0;
> >>>> +}
> >>>> +
> >>>> +static const VMStateDescription kvmclock_vmsd = {
> >>>> +    .name = "kvmclock",
> >>>> +    .version_id = 1,
> >>>> +    .minimum_version_id = 1,
> >>>> +    .minimum_version_id_old = 1,
> >>>> +    .pre_save = kvmclock_pre_save,
> >>>> +    .post_load = kvmclock_post_load,
> >>>> +    .fields = (VMStateField[]) {
> >>>> +        VMSTATE_UINT64(clock, KVMClockState),
> >>>> +        VMSTATE_END_OF_LIST()
> >>>> +    }
> >>>> +};
> >>>> +
> >>>> +static SysBusDeviceInfo kvmclock_info = {
> >>>> +    .qdev.name = "kvmclock",
> >>>> +    .qdev.size = sizeof(KVMClockState),
> >>>> +    .qdev.vmsd = &kvmclock_vmsd,
> >>>> +    .qdev.no_user = 1,
> >>>> +    .init = kvmclock_init,
> >>>> +};
> >>>> +
> >>>> +/* Note: Must be called after VCPU initialization. */
> >>>> +void kvmclock_create(void)
> >>>> +{
> >>>> +    if (kvm_enabled() &&
> >>>> +        first_cpu->cpuid_kvm_features & (1ULL << KVM_FEATURE_CLOCKSOURCE)) {
> >>>> +        sysbus_create_simple("kvmclock", -1, NULL);
> >>>> +    }
> >>>> +}
> >>>> +
> >>>> +static void kvmclock_register_device(void)
> >>>> +{
> >>>> +    if (kvm_enabled()) {
> >>>> +        sysbus_register_withprop(&kvmclock_info);
> >>>> +    }
> >>>> +}
> >>>> +
> >>>> +device_init(kvmclock_register_device);
> >>>> +
> >>>> +#else /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
> >>>> +
> >>>> +void kvmclock_create(void)
> >>>> +{
> >>>> +}
> >>>> +#endif /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
> >>>> diff --git a/hw/kvmclock.h b/hw/kvmclock.h
> >>>> new file mode 100644
> >>>> index 0000000..7a83cbe
> >>>> --- /dev/null
> >>>> +++ b/hw/kvmclock.h
> >>>> @@ -0,0 +1,14 @@
> >>>> +/*
> >>>> + * QEMU KVM support, paravirtual clock device
> >>>> + *
> >>>> + * Copyright (C) 2011 Siemens AG
> >>>> + *
> >>>> + * Authors:
> >>>> + *  Jan Kiszka        <jan.kiszka@siemens.com>
> >>>> + *
> >>>> + * This work is licensed under the terms of the GNU GPL version 2.
> >>>> + * See the COPYING file in the top-level directory.
> >>>> + *
> >>>> + */
> >>>> +
> >>>> +void kvmclock_create(void);
> >>>> diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> >>>> index 7b74473..9bc4659 100644
> >>>> --- a/hw/pc_piix.c
> >>>> +++ b/hw/pc_piix.c
> >>>> @@ -32,6 +32,7 @@
> >>>>  #include "boards.h"
> >>>>  #include "ide.h"
> >>>>  #include "kvm.h"
> >>>> +#include "kvmclock.h"
> >>>>  #include "sysemu.h"
> >>>>  #include "sysbus.h"
> >>>>  #include "arch_init.h"
> >>>> @@ -66,7 +67,8 @@ static void pc_init1(ram_addr_t ram_size,
> >>>>                       const char *kernel_cmdline,
> >>>>                       const char *initrd_filename,
> >>>>                       const char *cpu_model,
> >>>> -                     int pci_enabled)
> >>>> +                     int pci_enabled,
> >>>> +                     int kvmclock_enabled)
> >>>>  
> >>> What exactly is your motivation to that ? I think mid/long-term
> >>> we should be making machine initialization more common among
> >>> architectures, not introducing more arch specific, or even worse, kvm
> >>> specific parameters here.
> >>>
> >>> I'd like to understand what do we gain from that, since opting kvmclock
> >>> in our out is done by cpuid anyway - no need for a specific machine.
> >>
> >> Is that really the case? I thought we were already shipping versions
> >> where that CPU feature was enabled by default. If not, I'll happily drop
> >> that admittedly clumsy approach above.
> > 
> > Yes, AFAIK, kvmclock is enabled by default, disabled by cpuid-leaf, as
> > in
> > -cpu kvm64,-kvmclock
> > 
> > So your test for cpuid bit before starting kvmclock should already cover
> > it.
> > 
> 
> No, just the contrary: As kvmclock was always enabled in older versions
> and the compat machines also expose it, we cannot rely on the flag to
> enable this new (and therefore 0.15-only) vmstate.

I see. You're not enabling kvmclock functionality, but rather, kvmclock
device.

It makes sense then, but I still would prefer it encoded somewhere else
(but not a hard constraint anymore). Can't qdev encode it, say, in the
value field of the device?
Jan Kiszka - Feb. 7, 2011, 6:12 p.m.
On 2011-02-07 19:04, Glauber Costa wrote:
> On Mon, 2011-02-07 at 15:03 +0100, Jan Kiszka wrote:
>> On 2011-02-07 14:40, Glauber Costa wrote:
>>> On Mon, 2011-02-07 at 13:36 +0100, Jan Kiszka wrote:
>>>> On 2011-02-07 13:27, Glauber Costa wrote:
>>>>> On Mon, 2011-02-07 at 12:19 +0100, Jan Kiszka wrote:
>>>>>> If kvmclock is used, which implies the kernel supports it, register a
>>>>>> kvmclock device with the sysbus. Its main purpose is to save and restore
>>>>>> the kernel state on migration, but this will also allow to visualize it
>>>>>> one day.
>>>>>>
>>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>>>>> CC: Glauber Costa <glommer@redhat.com>
>>>>>> ---
>>>>>>  Makefile.target |    4 +-
>>>>>>  hw/kvmclock.c   |  125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  hw/kvmclock.h   |   14 ++++++
>>>>>>  hw/pc_piix.c    |   31 +++++++++++---
>>>>>>  4 files changed, 165 insertions(+), 9 deletions(-)
>>>>>>  create mode 100644 hw/kvmclock.c
>>>>>>  create mode 100644 hw/kvmclock.h
>>>>>>
>>>>>> diff --git a/Makefile.target b/Makefile.target
>>>>>> index b0ba95f..30232fa 100644
>>>>>> --- a/Makefile.target
>>>>>> +++ b/Makefile.target
>>>>>> @@ -37,7 +37,7 @@ ifndef CONFIG_HAIKU
>>>>>>  LIBS+=-lm
>>>>>>  endif
>>>>>>  
>>>>>> -kvm.o kvm-all.o vhost.o vhost_net.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
>>>>>> +kvm.o kvm-all.o vhost.o vhost_net.o kvmclock.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
>>>>>>  
>>>>>>  config-target.h: config-target.h-timestamp
>>>>>>  config-target.h-timestamp: config-target.mak
>>>>>> @@ -218,7 +218,7 @@ obj-i386-y += cirrus_vga.o apic.o ioapic.o piix_pci.o
>>>>>>  obj-i386-y += vmmouse.o vmport.o hpet.o applesmc.o
>>>>>>  obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
>>>>>>  obj-i386-y += debugcon.o multiboot.o
>>>>>> -obj-i386-y += pc_piix.o
>>>>>> +obj-i386-y += pc_piix.o kvmclock.o
>>>>>>  obj-i386-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
>>>>>>  
>>>>>>  # shared objects
>>>>>> diff --git a/hw/kvmclock.c b/hw/kvmclock.c
>>>>>> new file mode 100644
>>>>>> index 0000000..b6ceddf
>>>>>> --- /dev/null
>>>>>> +++ b/hw/kvmclock.c
>>>>>> @@ -0,0 +1,125 @@
>>>>>> +/*
>>>>>> + * QEMU KVM support, paravirtual clock device
>>>>>> + *
>>>>>> + * Copyright (C) 2011 Siemens AG
>>>>>> + *
>>>>>> + * Authors:
>>>>>> + *  Jan Kiszka        <jan.kiszka@siemens.com>
>>>>>> + *
>>>>>> + * This work is licensed under the terms of the GNU GPL version 2.
>>>>>> + * See the COPYING file in the top-level directory.
>>>>>> + *
>>>>>> + */
>>>>>> +
>>>>>> +#include "qemu-common.h"
>>>>>> +#include "sysemu.h"
>>>>>> +#include "sysbus.h"
>>>>>> +#include "kvm.h"
>>>>>> +#include "kvmclock.h"
>>>>>> +
>>>>>> +#if defined(CONFIG_KVM_PARA) && defined(KVM_CAP_ADJUST_CLOCK)
>>>>>> +
>>>>>> +#include <linux/kvm.h>
>>>>>> +#include <linux/kvm_para.h>
>>>>>> +
>>>>>> +typedef struct KVMClockState {
>>>>>> +    SysBusDevice busdev;
>>>>>> +    uint64_t clock;
>>>>>> +    bool clock_valid;
>>>>>> +} KVMClockState;
>>>>>> +
>>>>>> +static void kvmclock_pre_save(void *opaque)
>>>>>> +{
>>>>>> +    KVMClockState *s = opaque;
>>>>>> +    struct kvm_clock_data data;
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    if (s->clock_valid) {
>>>>>> +        return;
>>>>>> +    }
>>>>>> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
>>>>>> +    if (ret < 0) {
>>>>>> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>>>>>> +        data.clock = 0;
>>>>>> +    }
>>>>>> +    s->clock = data.clock;
>>>>>> +    /*
>>>>>> +     * If the VM is stopped, declare the clock state valid to avoid re-reading
>>>>>> +     * it on next vmsave (which would return a different value). Will be reset
>>>>>> +     * when the VM is continued.
>>>>>> +     */
>>>>>> +    s->clock_valid = !vm_running;
>>>>>> +}
>>>>>> +
>>>>>> +static int kvmclock_post_load(void *opaque, int version_id)
>>>>>> +{
>>>>>> +    KVMClockState *s = opaque;
>>>>>> +    struct kvm_clock_data data;
>>>>>> +
>>>>>> +    data.clock = s->clock;
>>>>>> +    data.flags = 0;
>>>>>> +    return kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>>>>>> +}
>>>>>> +
>>>>>> +static void kvmclock_vm_state_change(void *opaque, int running, int reason)
>>>>>> +{
>>>>>> +    KVMClockState *s = opaque;
>>>>>> +
>>>>>> +    if (running) {
>>>>>> +        s->clock_valid = false;
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +static int kvmclock_init(SysBusDevice *dev)
>>>>>> +{
>>>>>> +    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
>>>>>> +
>>>>>> +    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static const VMStateDescription kvmclock_vmsd = {
>>>>>> +    .name = "kvmclock",
>>>>>> +    .version_id = 1,
>>>>>> +    .minimum_version_id = 1,
>>>>>> +    .minimum_version_id_old = 1,
>>>>>> +    .pre_save = kvmclock_pre_save,
>>>>>> +    .post_load = kvmclock_post_load,
>>>>>> +    .fields = (VMStateField[]) {
>>>>>> +        VMSTATE_UINT64(clock, KVMClockState),
>>>>>> +        VMSTATE_END_OF_LIST()
>>>>>> +    }
>>>>>> +};
>>>>>> +
>>>>>> +static SysBusDeviceInfo kvmclock_info = {
>>>>>> +    .qdev.name = "kvmclock",
>>>>>> +    .qdev.size = sizeof(KVMClockState),
>>>>>> +    .qdev.vmsd = &kvmclock_vmsd,
>>>>>> +    .qdev.no_user = 1,
>>>>>> +    .init = kvmclock_init,
>>>>>> +};
>>>>>> +
>>>>>> +/* Note: Must be called after VCPU initialization. */
>>>>>> +void kvmclock_create(void)
>>>>>> +{
>>>>>> +    if (kvm_enabled() &&
>>>>>> +        first_cpu->cpuid_kvm_features & (1ULL << KVM_FEATURE_CLOCKSOURCE)) {
>>>>>> +        sysbus_create_simple("kvmclock", -1, NULL);
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +static void kvmclock_register_device(void)
>>>>>> +{
>>>>>> +    if (kvm_enabled()) {
>>>>>> +        sysbus_register_withprop(&kvmclock_info);
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +device_init(kvmclock_register_device);
>>>>>> +
>>>>>> +#else /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
>>>>>> +
>>>>>> +void kvmclock_create(void)
>>>>>> +{
>>>>>> +}
>>>>>> +#endif /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
>>>>>> diff --git a/hw/kvmclock.h b/hw/kvmclock.h
>>>>>> new file mode 100644
>>>>>> index 0000000..7a83cbe
>>>>>> --- /dev/null
>>>>>> +++ b/hw/kvmclock.h
>>>>>> @@ -0,0 +1,14 @@
>>>>>> +/*
>>>>>> + * QEMU KVM support, paravirtual clock device
>>>>>> + *
>>>>>> + * Copyright (C) 2011 Siemens AG
>>>>>> + *
>>>>>> + * Authors:
>>>>>> + *  Jan Kiszka        <jan.kiszka@siemens.com>
>>>>>> + *
>>>>>> + * This work is licensed under the terms of the GNU GPL version 2.
>>>>>> + * See the COPYING file in the top-level directory.
>>>>>> + *
>>>>>> + */
>>>>>> +
>>>>>> +void kvmclock_create(void);
>>>>>> diff --git a/hw/pc_piix.c b/hw/pc_piix.c
>>>>>> index 7b74473..9bc4659 100644
>>>>>> --- a/hw/pc_piix.c
>>>>>> +++ b/hw/pc_piix.c
>>>>>> @@ -32,6 +32,7 @@
>>>>>>  #include "boards.h"
>>>>>>  #include "ide.h"
>>>>>>  #include "kvm.h"
>>>>>> +#include "kvmclock.h"
>>>>>>  #include "sysemu.h"
>>>>>>  #include "sysbus.h"
>>>>>>  #include "arch_init.h"
>>>>>> @@ -66,7 +67,8 @@ static void pc_init1(ram_addr_t ram_size,
>>>>>>                       const char *kernel_cmdline,
>>>>>>                       const char *initrd_filename,
>>>>>>                       const char *cpu_model,
>>>>>> -                     int pci_enabled)
>>>>>> +                     int pci_enabled,
>>>>>> +                     int kvmclock_enabled)
>>>>>>  
>>>>> What exactly is your motivation to that ? I think mid/long-term
>>>>> we should be making machine initialization more common among
>>>>> architectures, not introducing more arch specific, or even worse, kvm
>>>>> specific parameters here.
>>>>>
>>>>> I'd like to understand what do we gain from that, since opting kvmclock
>>>>> in our out is done by cpuid anyway - no need for a specific machine.
>>>>
>>>> Is that really the case? I thought we were already shipping versions
>>>> where that CPU feature was enabled by default. If not, I'll happily drop
>>>> that admittedly clumsy approach above.
>>>
>>> Yes, AFAIK, kvmclock is enabled by default, disabled by cpuid-leaf, as
>>> in
>>> -cpu kvm64,-kvmclock
>>>
>>> So your test for cpuid bit before starting kvmclock should already cover
>>> it.
>>>
>>
>> No, just the contrary: As kvmclock was always enabled in older versions
>> and the compat machines also expose it, we cannot rely on the flag to
>> enable this new (and therefore 0.15-only) vmstate.
> 
> I see. You're not enabling kvmclock functionality, but rather, kvmclock
> device.
> 
> It makes sense then, but I still would prefer it encoded somewhere else
> (but not a hard constraint anymore). Can't qdev encode it, say, in the
> value field of the device? 

It would take some machine properties that could be pre-set according to
compat requirements. I have such machine-specific properties in mind
anyway in order to implement -machine in a more flexible way than
proposed so far.

I'll surely clean up the small ugliness this patch introduces to
pc_piix.c once there is a better way, I just don't want to let kvmclock
wait for that perfect infrastructure.

Jan
Glauber Costa - Feb. 7, 2011, 6:26 p.m.
On Mon, 2011-02-07 at 19:12 +0100, Jan Kiszka wrote:
> On 2011-02-07 19:04, Glauber Costa wrote:
> > On Mon, 2011-02-07 at 15:03 +0100, Jan Kiszka wrote:
> >> On 2011-02-07 14:40, Glauber Costa wrote:
> >>> On Mon, 2011-02-07 at 13:36 +0100, Jan Kiszka wrote:
> >>>> On 2011-02-07 13:27, Glauber Costa wrote:
> >>>>> On Mon, 2011-02-07 at 12:19 +0100, Jan Kiszka wrote:
> >>>>>> If kvmclock is used, which implies the kernel supports it, register a
> >>>>>> kvmclock device with the sysbus. Its main purpose is to save and restore
> >>>>>> the kernel state on migration, but this will also allow to visualize it
> >>>>>> one day.
> >>>>>>
> >>>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >>>>>> CC: Glauber Costa <glommer@redhat.com>
> >>>>>> ---
> >>>>>>  Makefile.target |    4 +-
> >>>>>>  hw/kvmclock.c   |  125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>>>  hw/kvmclock.h   |   14 ++++++
> >>>>>>  hw/pc_piix.c    |   31 +++++++++++---
> >>>>>>  4 files changed, 165 insertions(+), 9 deletions(-)
> >>>>>>  create mode 100644 hw/kvmclock.c
> >>>>>>  create mode 100644 hw/kvmclock.h
> >>>>>>
> >>>>>> diff --git a/Makefile.target b/Makefile.target
> >>>>>> index b0ba95f..30232fa 100644
> >>>>>> --- a/Makefile.target
> >>>>>> +++ b/Makefile.target
> >>>>>> @@ -37,7 +37,7 @@ ifndef CONFIG_HAIKU
> >>>>>>  LIBS+=-lm
> >>>>>>  endif
> >>>>>>  
> >>>>>> -kvm.o kvm-all.o vhost.o vhost_net.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
> >>>>>> +kvm.o kvm-all.o vhost.o vhost_net.o kvmclock.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
> >>>>>>  
> >>>>>>  config-target.h: config-target.h-timestamp
> >>>>>>  config-target.h-timestamp: config-target.mak
> >>>>>> @@ -218,7 +218,7 @@ obj-i386-y += cirrus_vga.o apic.o ioapic.o piix_pci.o
> >>>>>>  obj-i386-y += vmmouse.o vmport.o hpet.o applesmc.o
> >>>>>>  obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
> >>>>>>  obj-i386-y += debugcon.o multiboot.o
> >>>>>> -obj-i386-y += pc_piix.o
> >>>>>> +obj-i386-y += pc_piix.o kvmclock.o
> >>>>>>  obj-i386-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
> >>>>>>  
> >>>>>>  # shared objects
> >>>>>> diff --git a/hw/kvmclock.c b/hw/kvmclock.c
> >>>>>> new file mode 100644
> >>>>>> index 0000000..b6ceddf
> >>>>>> --- /dev/null
> >>>>>> +++ b/hw/kvmclock.c
> >>>>>> @@ -0,0 +1,125 @@
> >>>>>> +/*
> >>>>>> + * QEMU KVM support, paravirtual clock device
> >>>>>> + *
> >>>>>> + * Copyright (C) 2011 Siemens AG
> >>>>>> + *
> >>>>>> + * Authors:
> >>>>>> + *  Jan Kiszka        <jan.kiszka@siemens.com>
> >>>>>> + *
> >>>>>> + * This work is licensed under the terms of the GNU GPL version 2.
> >>>>>> + * See the COPYING file in the top-level directory.
> >>>>>> + *
> >>>>>> + */
> >>>>>> +
> >>>>>> +#include "qemu-common.h"
> >>>>>> +#include "sysemu.h"
> >>>>>> +#include "sysbus.h"
> >>>>>> +#include "kvm.h"
> >>>>>> +#include "kvmclock.h"
> >>>>>> +
> >>>>>> +#if defined(CONFIG_KVM_PARA) && defined(KVM_CAP_ADJUST_CLOCK)
> >>>>>> +
> >>>>>> +#include <linux/kvm.h>
> >>>>>> +#include <linux/kvm_para.h>
> >>>>>> +
> >>>>>> +typedef struct KVMClockState {
> >>>>>> +    SysBusDevice busdev;
> >>>>>> +    uint64_t clock;
> >>>>>> +    bool clock_valid;
> >>>>>> +} KVMClockState;
> >>>>>> +
> >>>>>> +static void kvmclock_pre_save(void *opaque)
> >>>>>> +{
> >>>>>> +    KVMClockState *s = opaque;
> >>>>>> +    struct kvm_clock_data data;
> >>>>>> +    int ret;
> >>>>>> +
> >>>>>> +    if (s->clock_valid) {
> >>>>>> +        return;
> >>>>>> +    }
> >>>>>> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> >>>>>> +    if (ret < 0) {
> >>>>>> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> >>>>>> +        data.clock = 0;
> >>>>>> +    }
> >>>>>> +    s->clock = data.clock;
> >>>>>> +    /*
> >>>>>> +     * If the VM is stopped, declare the clock state valid to avoid re-reading
> >>>>>> +     * it on next vmsave (which would return a different value). Will be reset
> >>>>>> +     * when the VM is continued.
> >>>>>> +     */
> >>>>>> +    s->clock_valid = !vm_running;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static int kvmclock_post_load(void *opaque, int version_id)
> >>>>>> +{
> >>>>>> +    KVMClockState *s = opaque;
> >>>>>> +    struct kvm_clock_data data;
> >>>>>> +
> >>>>>> +    data.clock = s->clock;
> >>>>>> +    data.flags = 0;
> >>>>>> +    return kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void kvmclock_vm_state_change(void *opaque, int running, int reason)
> >>>>>> +{
> >>>>>> +    KVMClockState *s = opaque;
> >>>>>> +
> >>>>>> +    if (running) {
> >>>>>> +        s->clock_valid = false;
> >>>>>> +    }
> >>>>>> +}
> >>>>>> +
> >>>>>> +static int kvmclock_init(SysBusDevice *dev)
> >>>>>> +{
> >>>>>> +    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
> >>>>>> +
> >>>>>> +    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >>>>>> +    return 0;
> >>>>>> +}
> >>>>>> +
> >>>>>> +static const VMStateDescription kvmclock_vmsd = {
> >>>>>> +    .name = "kvmclock",
> >>>>>> +    .version_id = 1,
> >>>>>> +    .minimum_version_id = 1,
> >>>>>> +    .minimum_version_id_old = 1,
> >>>>>> +    .pre_save = kvmclock_pre_save,
> >>>>>> +    .post_load = kvmclock_post_load,
> >>>>>> +    .fields = (VMStateField[]) {
> >>>>>> +        VMSTATE_UINT64(clock, KVMClockState),
> >>>>>> +        VMSTATE_END_OF_LIST()
> >>>>>> +    }
> >>>>>> +};
> >>>>>> +
> >>>>>> +static SysBusDeviceInfo kvmclock_info = {
> >>>>>> +    .qdev.name = "kvmclock",
> >>>>>> +    .qdev.size = sizeof(KVMClockState),
> >>>>>> +    .qdev.vmsd = &kvmclock_vmsd,
> >>>>>> +    .qdev.no_user = 1,
> >>>>>> +    .init = kvmclock_init,
> >>>>>> +};
> >>>>>> +
> >>>>>> +/* Note: Must be called after VCPU initialization. */
> >>>>>> +void kvmclock_create(void)
> >>>>>> +{
> >>>>>> +    if (kvm_enabled() &&
> >>>>>> +        first_cpu->cpuid_kvm_features & (1ULL << KVM_FEATURE_CLOCKSOURCE)) {
> >>>>>> +        sysbus_create_simple("kvmclock", -1, NULL);
> >>>>>> +    }
> >>>>>> +}
> >>>>>> +
> >>>>>> +static void kvmclock_register_device(void)
> >>>>>> +{
> >>>>>> +    if (kvm_enabled()) {
> >>>>>> +        sysbus_register_withprop(&kvmclock_info);
> >>>>>> +    }
> >>>>>> +}
> >>>>>> +
> >>>>>> +device_init(kvmclock_register_device);
> >>>>>> +
> >>>>>> +#else /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
> >>>>>> +
> >>>>>> +void kvmclock_create(void)
> >>>>>> +{
> >>>>>> +}
> >>>>>> +#endif /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
> >>>>>> diff --git a/hw/kvmclock.h b/hw/kvmclock.h
> >>>>>> new file mode 100644
> >>>>>> index 0000000..7a83cbe
> >>>>>> --- /dev/null
> >>>>>> +++ b/hw/kvmclock.h
> >>>>>> @@ -0,0 +1,14 @@
> >>>>>> +/*
> >>>>>> + * QEMU KVM support, paravirtual clock device
> >>>>>> + *
> >>>>>> + * Copyright (C) 2011 Siemens AG
> >>>>>> + *
> >>>>>> + * Authors:
> >>>>>> + *  Jan Kiszka        <jan.kiszka@siemens.com>
> >>>>>> + *
> >>>>>> + * This work is licensed under the terms of the GNU GPL version 2.
> >>>>>> + * See the COPYING file in the top-level directory.
> >>>>>> + *
> >>>>>> + */
> >>>>>> +
> >>>>>> +void kvmclock_create(void);
> >>>>>> diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> >>>>>> index 7b74473..9bc4659 100644
> >>>>>> --- a/hw/pc_piix.c
> >>>>>> +++ b/hw/pc_piix.c
> >>>>>> @@ -32,6 +32,7 @@
> >>>>>>  #include "boards.h"
> >>>>>>  #include "ide.h"
> >>>>>>  #include "kvm.h"
> >>>>>> +#include "kvmclock.h"
> >>>>>>  #include "sysemu.h"
> >>>>>>  #include "sysbus.h"
> >>>>>>  #include "arch_init.h"
> >>>>>> @@ -66,7 +67,8 @@ static void pc_init1(ram_addr_t ram_size,
> >>>>>>                       const char *kernel_cmdline,
> >>>>>>                       const char *initrd_filename,
> >>>>>>                       const char *cpu_model,
> >>>>>> -                     int pci_enabled)
> >>>>>> +                     int pci_enabled,
> >>>>>> +                     int kvmclock_enabled)
> >>>>>>  
> >>>>> What exactly is your motivation to that ? I think mid/long-term
> >>>>> we should be making machine initialization more common among
> >>>>> architectures, not introducing more arch specific, or even worse, kvm
> >>>>> specific parameters here.
> >>>>>
> >>>>> I'd like to understand what do we gain from that, since opting kvmclock
> >>>>> in our out is done by cpuid anyway - no need for a specific machine.
> >>>>
> >>>> Is that really the case? I thought we were already shipping versions
> >>>> where that CPU feature was enabled by default. If not, I'll happily drop
> >>>> that admittedly clumsy approach above.
> >>>
> >>> Yes, AFAIK, kvmclock is enabled by default, disabled by cpuid-leaf, as
> >>> in
> >>> -cpu kvm64,-kvmclock
> >>>
> >>> So your test for cpuid bit before starting kvmclock should already cover
> >>> it.
> >>>
> >>
> >> No, just the contrary: As kvmclock was always enabled in older versions
> >> and the compat machines also expose it, we cannot rely on the flag to
> >> enable this new (and therefore 0.15-only) vmstate.
> > 
> > I see. You're not enabling kvmclock functionality, but rather, kvmclock
> > device.
> > 
> > It makes sense then, but I still would prefer it encoded somewhere else
> > (but not a hard constraint anymore). Can't qdev encode it, say, in the
> > value field of the device? 
> 
> It would take some machine properties that could be pre-set according to
> compat requirements. I have such machine-specific properties in mind
> anyway in order to implement -machine in a more flexible way than
> proposed so far.
> 
> I'll surely clean up the small ugliness this patch introduces to
> pc_piix.c once there is a better way, I just don't want to let kvmclock
> wait for that perfect infrastructure.

Perfect is the enemy of good; I'm all for it.
Jan Kiszka - Feb. 7, 2011, 9:48 p.m.
On 2011-02-07 20:39, Blue Swirl wrote:
> On Mon, Feb 7, 2011 at 1:19 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> If kvmclock is used, which implies the kernel supports it, register a
>> kvmclock device with the sysbus. Its main purpose is to save and restore
>> the kernel state on migration, but this will also allow to visualize it
>> one day.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> CC: Glauber Costa <glommer@redhat.com>
>> ---
>>  Makefile.target |    4 +-
>>  hw/kvmclock.c   |  125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  hw/kvmclock.h   |   14 ++++++
>>  hw/pc_piix.c    |   31 +++++++++++---
>>  4 files changed, 165 insertions(+), 9 deletions(-)
>>  create mode 100644 hw/kvmclock.c
>>  create mode 100644 hw/kvmclock.h
>>
>> diff --git a/Makefile.target b/Makefile.target
>> index b0ba95f..30232fa 100644
>> --- a/Makefile.target
>> +++ b/Makefile.target
>> @@ -37,7 +37,7 @@ ifndef CONFIG_HAIKU
>>  LIBS+=-lm
>>  endif
>>
>> -kvm.o kvm-all.o vhost.o vhost_net.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
>> +kvm.o kvm-all.o vhost.o vhost_net.o kvmclock.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
>>
>>  config-target.h: config-target.h-timestamp
>>  config-target.h-timestamp: config-target.mak
>> @@ -218,7 +218,7 @@ obj-i386-y += cirrus_vga.o apic.o ioapic.o piix_pci.o
>>  obj-i386-y += vmmouse.o vmport.o hpet.o applesmc.o
>>  obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
>>  obj-i386-y += debugcon.o multiboot.o
>> -obj-i386-y += pc_piix.o
>> +obj-i386-y += pc_piix.o kvmclock.o
> 
> Please build kvmclock.o conditionally to CONFIG_something...
> 
>>  obj-i386-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
>>
>>  # shared objects
>> diff --git a/hw/kvmclock.c b/hw/kvmclock.c
>> new file mode 100644
>> index 0000000..b6ceddf
>> --- /dev/null
>> +++ b/hw/kvmclock.c
>> @@ -0,0 +1,125 @@
>> +/*
>> + * QEMU KVM support, paravirtual clock device
>> + *
>> + * Copyright (C) 2011 Siemens AG
>> + *
>> + * Authors:
>> + *  Jan Kiszka        <jan.kiszka@siemens.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL version 2.
>> + * See the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "qemu-common.h"
>> +#include "sysemu.h"
>> +#include "sysbus.h"
>> +#include "kvm.h"
>> +#include "kvmclock.h"
>> +
>> +#if defined(CONFIG_KVM_PARA) && defined(KVM_CAP_ADJUST_CLOCK)
>> +
>> +#include <linux/kvm.h>
>> +#include <linux/kvm_para.h>
>> +
>> +typedef struct KVMClockState {
>> +    SysBusDevice busdev;
>> +    uint64_t clock;
>> +    bool clock_valid;
>> +} KVMClockState;
>> +
>> +static void kvmclock_pre_save(void *opaque)
>> +{
>> +    KVMClockState *s = opaque;
>> +    struct kvm_clock_data data;
>> +    int ret;
>> +
>> +    if (s->clock_valid) {
>> +        return;
>> +    }
>> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
>> +    if (ret < 0) {
>> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>> +        data.clock = 0;
>> +    }
>> +    s->clock = data.clock;
>> +    /*
>> +     * If the VM is stopped, declare the clock state valid to avoid re-reading
>> +     * it on next vmsave (which would return a different value). Will be reset
>> +     * when the VM is continued.
>> +     */
>> +    s->clock_valid = !vm_running;
>> +}
>> +
>> +static int kvmclock_post_load(void *opaque, int version_id)
>> +{
>> +    KVMClockState *s = opaque;
>> +    struct kvm_clock_data data;
>> +
>> +    data.clock = s->clock;
>> +    data.flags = 0;
>> +    return kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>> +}
>> +
>> +static void kvmclock_vm_state_change(void *opaque, int running, int reason)
>> +{
>> +    KVMClockState *s = opaque;
>> +
>> +    if (running) {
>> +        s->clock_valid = false;
>> +    }
>> +}
>> +
>> +static int kvmclock_init(SysBusDevice *dev)
>> +{
>> +    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
>> +
>> +    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>> +    return 0;
>> +}
>> +
>> +static const VMStateDescription kvmclock_vmsd = {
>> +    .name = "kvmclock",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .minimum_version_id_old = 1,
>> +    .pre_save = kvmclock_pre_save,
>> +    .post_load = kvmclock_post_load,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_UINT64(clock, KVMClockState),
>> +        VMSTATE_END_OF_LIST()
>> +    }
>> +};
>> +
>> +static SysBusDeviceInfo kvmclock_info = {
>> +    .qdev.name = "kvmclock",
>> +    .qdev.size = sizeof(KVMClockState),
>> +    .qdev.vmsd = &kvmclock_vmsd,
>> +    .qdev.no_user = 1,
>> +    .init = kvmclock_init,
>> +};
>> +
>> +/* Note: Must be called after VCPU initialization. */
>> +void kvmclock_create(void)
>> +{
>> +    if (kvm_enabled() &&
>> +        first_cpu->cpuid_kvm_features & (1ULL << KVM_FEATURE_CLOCKSOURCE)) {
>> +        sysbus_create_simple("kvmclock", -1, NULL);
>> +    }
>> +}
> 
> ... and with this moved to a header as a static inline function, it
> should be possible to use sysbus_try_create() (coming soon) to try to
> create the device. Then it's not fatal if the device can't be created,
> that just means that the capability was not available at build time.

I played with this, and while it is generally a nice thing, it doesn't
help us here. We would just push the logic around, from kvmclock.c to
the header or even to configure (KVM_FEATURE_CLOCKSOURCE is not
unconditionally available).

I rather hope we finally agree on merging the required kvm headers into
qemu so that all these usually broken #ifdef KVM_CAP_* checks can be removed.

Jan

Patch

diff --git a/Makefile.target b/Makefile.target
index b0ba95f..30232fa 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -37,7 +37,7 @@  ifndef CONFIG_HAIKU
 LIBS+=-lm
 endif
 
-kvm.o kvm-all.o vhost.o vhost_net.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
+kvm.o kvm-all.o vhost.o vhost_net.o kvmclock.o: QEMU_CFLAGS+=$(KVM_CFLAGS)
 
 config-target.h: config-target.h-timestamp
 config-target.h-timestamp: config-target.mak
@@ -218,7 +218,7 @@  obj-i386-y += cirrus_vga.o apic.o ioapic.o piix_pci.o
 obj-i386-y += vmmouse.o vmport.o hpet.o applesmc.o
 obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o
 obj-i386-y += debugcon.o multiboot.o
-obj-i386-y += pc_piix.o
+obj-i386-y += pc_piix.o kvmclock.o
 obj-i386-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
 
 # shared objects
diff --git a/hw/kvmclock.c b/hw/kvmclock.c
new file mode 100644
index 0000000..b6ceddf
--- /dev/null
+++ b/hw/kvmclock.c
@@ -0,0 +1,125 @@ 
+/*
+ * QEMU KVM support, paravirtual clock device
+ *
+ * Copyright (C) 2011 Siemens AG
+ *
+ * Authors:
+ *  Jan Kiszka        <jan.kiszka@siemens.com>
+ *
+ * This work is licensed under the terms of the GNU GPL version 2.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "sysemu.h"
+#include "sysbus.h"
+#include "kvm.h"
+#include "kvmclock.h"
+
+#if defined(CONFIG_KVM_PARA) && defined(KVM_CAP_ADJUST_CLOCK)
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+
+typedef struct KVMClockState {
+    SysBusDevice busdev;
+    uint64_t clock;
+    bool clock_valid;
+} KVMClockState;
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+    struct kvm_clock_data data;
+    int ret;
+
+    if (s->clock_valid) {
+        return;
+    }
+    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
+    if (ret < 0) {
+        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
+        data.clock = 0;
+    }
+    s->clock = data.clock;
+    /*
+     * If the VM is stopped, declare the clock state valid to avoid re-reading
+     * it on next vmsave (which would return a different value). Will be reset
+     * when the VM is continued.
+     */
+    s->clock_valid = !vm_running;
+}
+
+static int kvmclock_post_load(void *opaque, int version_id)
+{
+    KVMClockState *s = opaque;
+    struct kvm_clock_data data;
+
+    data.clock = s->clock;
+    data.flags = 0;
+    return kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
+}
+
+static void kvmclock_vm_state_change(void *opaque, int running, int reason)
+{
+    KVMClockState *s = opaque;
+
+    if (running) {
+        s->clock_valid = false;
+    }
+}
+
+static int kvmclock_init(SysBusDevice *dev)
+{
+    KVMClockState *s = FROM_SYSBUS(KVMClockState, dev);
+
+    qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
+    return 0;
+}
+
+static const VMStateDescription kvmclock_vmsd = {
+    .name = "kvmclock",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .pre_save = kvmclock_pre_save,
+    .post_load = kvmclock_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(clock, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static SysBusDeviceInfo kvmclock_info = {
+    .qdev.name = "kvmclock",
+    .qdev.size = sizeof(KVMClockState),
+    .qdev.vmsd = &kvmclock_vmsd,
+    .qdev.no_user = 1,
+    .init = kvmclock_init,
+};
+
+/* Note: Must be called after VCPU initialization. */
+void kvmclock_create(void)
+{
+    if (kvm_enabled() &&
+        first_cpu->cpuid_kvm_features & (1ULL << KVM_FEATURE_CLOCKSOURCE)) {
+        sysbus_create_simple("kvmclock", -1, NULL);
+    }
+}
+
+static void kvmclock_register_device(void)
+{
+    if (kvm_enabled()) {
+        sysbus_register_withprop(&kvmclock_info);
+    }
+}
+
+device_init(kvmclock_register_device);
+
+#else /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
+
+void kvmclock_create(void)
+{
+}
+#endif /* !(CONFIG_KVM_PARA && KVM_CAP_ADJUST_CLOCK) */
diff --git a/hw/kvmclock.h b/hw/kvmclock.h
new file mode 100644
index 0000000..7a83cbe
--- /dev/null
+++ b/hw/kvmclock.h
@@ -0,0 +1,14 @@ 
+/*
+ * QEMU KVM support, paravirtual clock device
+ *
+ * Copyright (C) 2011 Siemens AG
+ *
+ * Authors:
+ *  Jan Kiszka        <jan.kiszka@siemens.com>
+ *
+ * This work is licensed under the terms of the GNU GPL version 2.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+void kvmclock_create(void);
diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index 7b74473..9bc4659 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -32,6 +32,7 @@ 
 #include "boards.h"
 #include "ide.h"
 #include "kvm.h"
+#include "kvmclock.h"
 #include "sysemu.h"
 #include "sysbus.h"
 #include "arch_init.h"
@@ -66,7 +67,8 @@  static void pc_init1(ram_addr_t ram_size,
                      const char *kernel_cmdline,
                      const char *initrd_filename,
                      const char *cpu_model,
-                     int pci_enabled)
+                     int pci_enabled,
+                     int kvmclock_enabled)
 {
     int i;
     ram_addr_t below_4g_mem_size, above_4g_mem_size;
@@ -87,6 +89,9 @@  static void pc_init1(ram_addr_t ram_size,
     pc_cpus_init(cpu_model);
 
     vmport_init();
+    if (kvmclock_enabled) {
+        kvmclock_create();
+    }
 
     /* allocate ram and load rom/bios */
     pc_memory_init(ram_size, kernel_filename, kernel_cmdline, initrd_filename,
@@ -195,7 +200,19 @@  static void pc_init_pci(ram_addr_t ram_size,
 {
     pc_init1(ram_size, boot_device,
              kernel_filename, kernel_cmdline,
-             initrd_filename, cpu_model, 1);
+             initrd_filename, cpu_model, 1, 1);
+}
+
+static void pc_init_pci_no_kvmclock(ram_addr_t ram_size,
+                                    const char *boot_device,
+                                    const char *kernel_filename,
+                                    const char *kernel_cmdline,
+                                    const char *initrd_filename,
+                                    const char *cpu_model)
+{
+    pc_init1(ram_size, boot_device,
+             kernel_filename, kernel_cmdline,
+             initrd_filename, cpu_model, 1, 0);
 }
 
 static void pc_init_isa(ram_addr_t ram_size,
@@ -209,7 +226,7 @@  static void pc_init_isa(ram_addr_t ram_size,
         cpu_model = "486";
     pc_init1(ram_size, boot_device,
              kernel_filename, kernel_cmdline,
-             initrd_filename, cpu_model, 0);
+             initrd_filename, cpu_model, 0, 1);
 }
 
 static QEMUMachine pc_machine = {
@@ -224,7 +241,7 @@  static QEMUMachine pc_machine = {
 static QEMUMachine pc_machine_v0_13 = {
     .name = "pc-0.13",
     .desc = "Standard PC",
-    .init = pc_init_pci,
+    .init = pc_init_pci_no_kvmclock,
     .max_cpus = 255,
     .compat_props = (GlobalProperty[]) {
         {
@@ -251,7 +268,7 @@  static QEMUMachine pc_machine_v0_13 = {
 static QEMUMachine pc_machine_v0_12 = {
     .name = "pc-0.12",
     .desc = "Standard PC",
-    .init = pc_init_pci,
+    .init = pc_init_pci_no_kvmclock,
     .max_cpus = 255,
     .compat_props = (GlobalProperty[]) {
         {
@@ -282,7 +299,7 @@  static QEMUMachine pc_machine_v0_12 = {
 static QEMUMachine pc_machine_v0_11 = {
     .name = "pc-0.11",
     .desc = "Standard PC, qemu 0.11",
-    .init = pc_init_pci,
+    .init = pc_init_pci_no_kvmclock,
     .max_cpus = 255,
     .compat_props = (GlobalProperty[]) {
         {
@@ -321,7 +338,7 @@  static QEMUMachine pc_machine_v0_11 = {
 static QEMUMachine pc_machine_v0_10 = {
     .name = "pc-0.10",
     .desc = "Standard PC, qemu 0.10",
-    .init = pc_init_pci,
+    .init = pc_init_pci_no_kvmclock,
     .max_cpus = 255,
     .compat_props = (GlobalProperty[]) {
         {