diff mbox

[v5] spapr: Add support for time base offset migration

Message ID 1397317963-23823-1-git-send-email-aik@ozlabs.ru
State New
Headers show

Commit Message

Alexey Kardashevskiy April 12, 2014, 3:52 p.m. UTC
This allows guests to have a different timebase origin from the host.

This is needed for migration, where a guest can migrate from one host
to another and the two hosts might have a different timebase origin.
However, the timebase seen by the guest must not go backwards, and
should go forwards only by a small amount corresponding to the time
taken for the migration.

This is only supported for recent POWER hardware which has the TBU40
(timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
970.

This adds kvm_access_one_reg() to access a special register which is not
in env->spr.

The feature must be present in the host kernel.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v5:
* fixed multiple comments in cpu_ppc_get_adjusted_tb and merged it
into timebase_post_load()
* removed round_up(1<<24) as KVM is expected to do this anyway
* removed @freq from migration stream
* renamed PPCTimebaseOffset to PPCTimebase
* CLOCKS_PER_SEC is used as a constant which 1000000us/s (man clock)

v4:
* made it per machine timebase offser rather than per CPU

v3:
* kvm_access_one_reg moved out to a separate patch
* tb_offset and host_timebase were replaced with guest_timebase as
the destionation does not really care of offset on the source

v2:
* bumped the vmstate_ppc_cpu version
* defined version for the env.tb_env field
---
 hw/ppc/ppc.c           | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/ppc/spapr.c         |  4 +--
 include/hw/ppc/spapr.h |  1 +
 target-ppc/cpu-qom.h   | 15 ++++++++++
 target-ppc/kvm.c       |  5 ++++
 trace-events           |  3 ++
 6 files changed, 102 insertions(+), 2 deletions(-)

Comments

Alexander Graf April 14, 2014, 7:08 a.m. UTC | #1
On 12.04.14 17:52, Alexey Kardashevskiy wrote:
> This allows guests to have a different timebase origin from the host.
>
> This is needed for migration, where a guest can migrate from one host
> to another and the two hosts might have a different timebase origin.
> However, the timebase seen by the guest must not go backwards, and
> should go forwards only by a small amount corresponding to the time
> taken for the migration.
>
> This is only supported for recent POWER hardware which has the TBU40
> (timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
> 970.
>
> This adds kvm_access_one_reg() to access a special register which is not
> in env->spr.
>
> The feature must be present in the host kernel.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
> Changes:
> v5:
> * fixed multiple comments in cpu_ppc_get_adjusted_tb and merged it
> into timebase_post_load()
> * removed round_up(1<<24) as KVM is expected to do this anyway
> * removed @freq from migration stream
> * renamed PPCTimebaseOffset to PPCTimebase
> * CLOCKS_PER_SEC is used as a constant which 1000000us/s (man clock)
>
> v4:
> * made it per machine timebase offser rather than per CPU
>
> v3:
> * kvm_access_one_reg moved out to a separate patch
> * tb_offset and host_timebase were replaced with guest_timebase as
> the destionation does not really care of offset on the source
>
> v2:
> * bumped the vmstate_ppc_cpu version
> * defined version for the env.tb_env field
> ---
>   hw/ppc/ppc.c           | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++
>   hw/ppc/spapr.c         |  4 +--
>   include/hw/ppc/spapr.h |  1 +
>   target-ppc/cpu-qom.h   | 15 ++++++++++
>   target-ppc/kvm.c       |  5 ++++
>   trace-events           |  3 ++
>   6 files changed, 102 insertions(+), 2 deletions(-)
>
> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> index 71df471..bf61a0a 100644
> --- a/hw/ppc/ppc.c
> +++ b/hw/ppc/ppc.c
> @@ -29,9 +29,11 @@
>   #include "sysemu/cpus.h"
>   #include "hw/timer/m48t59.h"
>   #include "qemu/log.h"
> +#include "qemu/error-report.h"
>   #include "hw/loader.h"
>   #include "sysemu/kvm.h"
>   #include "kvm_ppc.h"
> +#include "trace.h"
>   
>   //#define PPC_DEBUG_IRQ
>   //#define PPC_DEBUG_TB
> @@ -829,6 +831,80 @@ static void cpu_ppc_set_tb_clk (void *opaque, uint32_t freq)
>       cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
>   }
>   
> +static void timebase_pre_save(void *opaque)
> +{
> +    PPCTimebase *tb = opaque;
> +    uint64_t ticks = cpu_get_real_ticks();
> +    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
> +
> +    if (!first_ppc_cpu->env.tb_env) {
> +        error_report("No timebase object");
> +        return;
> +    }
> +
> +    tb->time_of_the_day = get_clock_realtime() / 1000;

It'd be good to indicate in the field name that we're in us units. In 
fact it probably makes sense to use ns throughout and not convert :). 
Nanoseconds are QEMU's "native" internal time format.

> +    /*
> +     * tb_offset is only expected to be changed by migration so
> +     * there is no need to update it from KVM here
> +     */
> +    tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
> +}
> +
> +static int timebase_post_load(void *opaque, int version_id)
> +{
> +    PPCTimebase *tb = opaque;
> +    CPUState *cpu;
> +    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
> +    int64_t tb_off_adj, tb_off;
> +    int64_t migration_duration_us, migration_duration_tb, guest_tb, host_us;
> +    unsigned long freq;
> +
> +    if (!first_ppc_cpu->env.tb_env) {
> +        error_report("No timebase object");
> +        return -1;
> +    }
> +
> +    freq = first_ppc_cpu->env.tb_env->tb_freq;
> +    /*
> +     * Calculate timebase on the destination side of migration.
> +     * The destination timebase must be not less than the source timebase.
> +     * We try to adjust timebase by downtime if host clocks are not
> +     * too much out of sync (1 second for now).
> +     */
> +    host_us = get_clock_realtime() / 1000;
> +    migration_duration_us = MIN(CLOCKS_PER_SEC, host_us - tb->time_of_the_day);

CLOCKS_PER_SEC only make sense with the return value of clock(). I 
wouldn't be surprised if this calculation goes totally bonkers on BSD or 
Windows systems.

I still don't understand why we're doing a MIN() here. I would 
understand a MAX() to guard against unsynchronized times on 
source/destination.

> +    migration_duration_tb = migration_duration_us * (freq / CLOCKS_PER_SEC);

I think muldiv64() is safer here in case tb_freq is small and the clock 
rate is high, no?

> +    guest_tb = tb->guest_timebase + MIN(0, migration_duration_tb);
> +
> +    tb_off_adj = guest_tb - cpu_get_real_ticks();
> +
> +    tb_off = first_ppc_cpu->env.tb_env->tb_offset;

Before migration env->tb_env->tb_offset should always be 0, no?

> +    trace_ppc_tb_adjust(tb_off, tb_off_adj, tb_off_adj - tb_off,
> +                        (tb_off_adj - tb_off) / freq);
> +
> +    /* Set new offset to all CPUs */
> +    CPU_FOREACH(cpu) {
> +        PowerPCCPU *pcpu = POWERPC_CPU(cpu);
> +        pcpu->env.tb_env->tb_offset = tb_off_adj;
> +    }
> +
> +    return 0;
> +}
> +
> +const VMStateDescription vmstate_ppc_timebase = {
> +    .name = "timebase",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .minimum_version_id_old = 1,
> +    .pre_save = timebase_pre_save,
> +    .post_load = timebase_post_load,
> +    .fields      = (VMStateField []) {
> +        VMSTATE_UINT64(guest_timebase, PPCTimebase),
> +        VMSTATE_UINT64(time_of_the_day, PPCTimebase),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
>   /* Set up (once) timebase frequency (in Hz) */
>   clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
>   {
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 451c473..6d5412b 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -818,7 +818,7 @@ static int spapr_vga_init(PCIBus *pci_bus)
>   
>   static const VMStateDescription vmstate_spapr = {
>       .name = "spapr",
> -    .version_id = 1,
> +    .version_id = 2,

Ok, good, you're bumping the version because we become incompatible, but ...

>       .minimum_version_id = 1,
>       .minimum_version_id_old = 1,
>       .fields      = (VMStateField []) {
> @@ -826,7 +826,7 @@ static const VMStateDescription vmstate_spapr = {
>   
>           /* RTC offset */
>           VMSTATE_UINT64(rtc_offset, sPAPREnvironment),
> -
> +        VMSTATE_PPC_TIMEBASE(tb, sPAPREnvironment),

Don't we have to indicate somewhere how to load version 1 if we don't 
declare it the new minimum_version?


Alex
Alexey Kardashevskiy April 14, 2014, 7:33 a.m. UTC | #2
On 04/14/2014 05:08 PM, Alexander Graf wrote:
> 
> On 12.04.14 17:52, Alexey Kardashevskiy wrote:
>> This allows guests to have a different timebase origin from the host.
>>
>> This is needed for migration, where a guest can migrate from one host
>> to another and the two hosts might have a different timebase origin.
>> However, the timebase seen by the guest must not go backwards, and
>> should go forwards only by a small amount corresponding to the time
>> taken for the migration.
>>
>> This is only supported for recent POWER hardware which has the TBU40
>> (timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
>> 970.
>>
>> This adds kvm_access_one_reg() to access a special register which is not
>> in env->spr.
>>
>> The feature must be present in the host kernel.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>> Changes:
>> v5:
>> * fixed multiple comments in cpu_ppc_get_adjusted_tb and merged it
>> into timebase_post_load()
>> * removed round_up(1<<24) as KVM is expected to do this anyway
>> * removed @freq from migration stream
>> * renamed PPCTimebaseOffset to PPCTimebase
>> * CLOCKS_PER_SEC is used as a constant which 1000000us/s (man clock)
>>
>> v4:
>> * made it per machine timebase offser rather than per CPU
>>
>> v3:
>> * kvm_access_one_reg moved out to a separate patch
>> * tb_offset and host_timebase were replaced with guest_timebase as
>> the destionation does not really care of offset on the source
>>
>> v2:
>> * bumped the vmstate_ppc_cpu version
>> * defined version for the env.tb_env field
>> ---
>>   hw/ppc/ppc.c           | 76
>> ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   hw/ppc/spapr.c         |  4 +--
>>   include/hw/ppc/spapr.h |  1 +
>>   target-ppc/cpu-qom.h   | 15 ++++++++++
>>   target-ppc/kvm.c       |  5 ++++
>>   trace-events           |  3 ++
>>   6 files changed, 102 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
>> index 71df471..bf61a0a 100644
>> --- a/hw/ppc/ppc.c
>> +++ b/hw/ppc/ppc.c
>> @@ -29,9 +29,11 @@
>>   #include "sysemu/cpus.h"
>>   #include "hw/timer/m48t59.h"
>>   #include "qemu/log.h"
>> +#include "qemu/error-report.h"
>>   #include "hw/loader.h"
>>   #include "sysemu/kvm.h"
>>   #include "kvm_ppc.h"
>> +#include "trace.h"
>>     //#define PPC_DEBUG_IRQ
>>   //#define PPC_DEBUG_TB
>> @@ -829,6 +831,80 @@ static void cpu_ppc_set_tb_clk (void *opaque,
>> uint32_t freq)
>>       cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
>>   }
>>   +static void timebase_pre_save(void *opaque)
>> +{
>> +    PPCTimebase *tb = opaque;
>> +    uint64_t ticks = cpu_get_real_ticks();
>> +    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
>> +
>> +    if (!first_ppc_cpu->env.tb_env) {
>> +        error_report("No timebase object");
>> +        return;
>> +    }
>> +
>> +    tb->time_of_the_day = get_clock_realtime() / 1000;
> 
> It'd be good to indicate in the field name that we're in us units. In fact
> it probably makes sense to use ns throughout and not convert :).
> Nanoseconds are QEMU's "native" internal time format.

Ooookay, I'll make it "ns", then no indicator is needed, right?


>> +    /*
>> +     * tb_offset is only expected to be changed by migration so
>> +     * there is no need to update it from KVM here
>> +     */
>> +    tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
>> +}
>> +
>> +static int timebase_post_load(void *opaque, int version_id)
>> +{
>> +    PPCTimebase *tb = opaque;
>> +    CPUState *cpu;
>> +    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
>> +    int64_t tb_off_adj, tb_off;
>> +    int64_t migration_duration_us, migration_duration_tb, guest_tb,
>> host_us;
>> +    unsigned long freq;
>> +
>> +    if (!first_ppc_cpu->env.tb_env) {
>> +        error_report("No timebase object");
>> +        return -1;
>> +    }
>> +
>> +    freq = first_ppc_cpu->env.tb_env->tb_freq;
>> +    /*
>> +     * Calculate timebase on the destination side of migration.
>> +     * The destination timebase must be not less than the source timebase.
>> +     * We try to adjust timebase by downtime if host clocks are not
>> +     * too much out of sync (1 second for now).
>> +     */
>> +    host_us = get_clock_realtime() / 1000;
>> +    migration_duration_us = MIN(CLOCKS_PER_SEC, host_us -
>> tb->time_of_the_day);
> 
> CLOCKS_PER_SEC only make sense with the return value of clock(). I wouldn't
> be surprised if this calculation goes totally bonkers on BSD or Windows
> systems.


http://linux.die.net/man/3/clock
C89, C99, POSIX.1-2001. POSIX requires that CLOCKS_PER_SEC equals 1000000
independent of the actual resolution.

This should be valid for BSD too. Ok, this has no effect on Windows.
Replace definition with get_ticks_per_sec()? Defining USEC_PER_SEC (or
NSEC_xxx) does not make much sense.

> 
> I still don't understand why we're doing a MIN() here. I would understand a
> MAX() to guard against unsynchronized times on source/destination.

If I added time difference between out-of-sync hosts, the delta would be
huge and the destination guest would behave really weird until this time
difference is over. Guests can cope with accidental timebase jump of
seconds but not minutes.



>> +    migration_duration_tb = migration_duration_us * (freq /
>> CLOCKS_PER_SEC);
> 
> I think muldiv64() is safer here in case tb_freq is small and the clock
> rate is high, no?

Oh. Ok.



>> +    guest_tb = tb->guest_timebase + MIN(0, migration_duration_tb);
>> +
>> +    tb_off_adj = guest_tb - cpu_get_real_ticks();
>> +
>> +    tb_off = first_ppc_cpu->env.tb_env->tb_offset;
> 
> Before migration env->tb_env->tb_offset should always be 0, no?


Before the very first migration - yes. Before subsequent ones - no.


>> +    trace_ppc_tb_adjust(tb_off, tb_off_adj, tb_off_adj - tb_off,
>> +                        (tb_off_adj - tb_off) / freq);
>> +
>> +    /* Set new offset to all CPUs */
>> +    CPU_FOREACH(cpu) {
>> +        PowerPCCPU *pcpu = POWERPC_CPU(cpu);
>> +        pcpu->env.tb_env->tb_offset = tb_off_adj;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +const VMStateDescription vmstate_ppc_timebase = {
>> +    .name = "timebase",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .minimum_version_id_old = 1,
>> +    .pre_save = timebase_pre_save,
>> +    .post_load = timebase_post_load,
>> +    .fields      = (VMStateField []) {
>> +        VMSTATE_UINT64(guest_timebase, PPCTimebase),
>> +        VMSTATE_UINT64(time_of_the_day, PPCTimebase),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>>   /* Set up (once) timebase frequency (in Hz) */
>>   clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
>>   {
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 451c473..6d5412b 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -818,7 +818,7 @@ static int spapr_vga_init(PCIBus *pci_bus)
>>     static const VMStateDescription vmstate_spapr = {
>>       .name = "spapr",
>> -    .version_id = 1,
>> +    .version_id = 2,
> 
> Ok, good, you're bumping the version because we become incompatible, but ...
> 
>>       .minimum_version_id = 1,
>>       .minimum_version_id_old = 1,
>>       .fields      = (VMStateField []) {
>> @@ -826,7 +826,7 @@ static const VMStateDescription vmstate_spapr = {
>>             /* RTC offset */
>>           VMSTATE_UINT64(rtc_offset, sPAPREnvironment),
>> -
>> +        VMSTATE_PPC_TIMEBASE(tb, sPAPREnvironment),
> 
> Don't we have to indicate somewhere how to load version 1 if we don't
> declare it the new minimum_version?

I can fix VMSTATE_PPC_TIMEBASE to accept version and make is "2", yes.

I just do not see much point in this as I seem to be the only person who
does something about migration on PPC regularly and for me migration in the
upstream QEMU is useless without this patch :)
Alexander Graf April 14, 2014, 7:46 a.m. UTC | #3
On 14.04.14 09:33, Alexey Kardashevskiy wrote:
> On 04/14/2014 05:08 PM, Alexander Graf wrote:
>> On 12.04.14 17:52, Alexey Kardashevskiy wrote:
>>> This allows guests to have a different timebase origin from the host.
>>>
>>> This is needed for migration, where a guest can migrate from one host
>>> to another and the two hosts might have a different timebase origin.
>>> However, the timebase seen by the guest must not go backwards, and
>>> should go forwards only by a small amount corresponding to the time
>>> taken for the migration.
>>>
>>> This is only supported for recent POWER hardware which has the TBU40
>>> (timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
>>> 970.
>>>
>>> This adds kvm_access_one_reg() to access a special register which is not
>>> in env->spr.
>>>
>>> The feature must be present in the host kernel.
>>>
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>> ---
>>> Changes:
>>> v5:
>>> * fixed multiple comments in cpu_ppc_get_adjusted_tb and merged it
>>> into timebase_post_load()
>>> * removed round_up(1<<24) as KVM is expected to do this anyway
>>> * removed @freq from migration stream
>>> * renamed PPCTimebaseOffset to PPCTimebase
>>> * CLOCKS_PER_SEC is used as a constant which 1000000us/s (man clock)
>>>
>>> v4:
>>> * made it per machine timebase offser rather than per CPU
>>>
>>> v3:
>>> * kvm_access_one_reg moved out to a separate patch
>>> * tb_offset and host_timebase were replaced with guest_timebase as
>>> the destionation does not really care of offset on the source
>>>
>>> v2:
>>> * bumped the vmstate_ppc_cpu version
>>> * defined version for the env.tb_env field
>>> ---
>>>    hw/ppc/ppc.c           | 76
>>> ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>    hw/ppc/spapr.c         |  4 +--
>>>    include/hw/ppc/spapr.h |  1 +
>>>    target-ppc/cpu-qom.h   | 15 ++++++++++
>>>    target-ppc/kvm.c       |  5 ++++
>>>    trace-events           |  3 ++
>>>    6 files changed, 102 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
>>> index 71df471..bf61a0a 100644
>>> --- a/hw/ppc/ppc.c
>>> +++ b/hw/ppc/ppc.c
>>> @@ -29,9 +29,11 @@
>>>    #include "sysemu/cpus.h"
>>>    #include "hw/timer/m48t59.h"
>>>    #include "qemu/log.h"
>>> +#include "qemu/error-report.h"
>>>    #include "hw/loader.h"
>>>    #include "sysemu/kvm.h"
>>>    #include "kvm_ppc.h"
>>> +#include "trace.h"
>>>      //#define PPC_DEBUG_IRQ
>>>    //#define PPC_DEBUG_TB
>>> @@ -829,6 +831,80 @@ static void cpu_ppc_set_tb_clk (void *opaque,
>>> uint32_t freq)
>>>        cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
>>>    }
>>>    +static void timebase_pre_save(void *opaque)
>>> +{
>>> +    PPCTimebase *tb = opaque;
>>> +    uint64_t ticks = cpu_get_real_ticks();
>>> +    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
>>> +
>>> +    if (!first_ppc_cpu->env.tb_env) {
>>> +        error_report("No timebase object");
>>> +        return;
>>> +    }
>>> +
>>> +    tb->time_of_the_day = get_clock_realtime() / 1000;
>> It'd be good to indicate in the field name that we're in us units. In fact
>> it probably makes sense to use ns throughout and not convert :).
>> Nanoseconds are QEMU's "native" internal time format.
> Ooookay, I'll make it "ns", then no indicator is needed, right?

I still prefer to keep an indicator. Makes it more readable.

>
>
>>> +    /*
>>> +     * tb_offset is only expected to be changed by migration so
>>> +     * there is no need to update it from KVM here
>>> +     */
>>> +    tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
>>> +}
>>> +
>>> +static int timebase_post_load(void *opaque, int version_id)
>>> +{
>>> +    PPCTimebase *tb = opaque;
>>> +    CPUState *cpu;
>>> +    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
>>> +    int64_t tb_off_adj, tb_off;
>>> +    int64_t migration_duration_us, migration_duration_tb, guest_tb,
>>> host_us;
>>> +    unsigned long freq;
>>> +
>>> +    if (!first_ppc_cpu->env.tb_env) {
>>> +        error_report("No timebase object");
>>> +        return -1;
>>> +    }
>>> +
>>> +    freq = first_ppc_cpu->env.tb_env->tb_freq;
>>> +    /*
>>> +     * Calculate timebase on the destination side of migration.
>>> +     * The destination timebase must be not less than the source timebase.
>>> +     * We try to adjust timebase by downtime if host clocks are not
>>> +     * too much out of sync (1 second for now).
>>> +     */
>>> +    host_us = get_clock_realtime() / 1000;
>>> +    migration_duration_us = MIN(CLOCKS_PER_SEC, host_us -
>>> tb->time_of_the_day);
>> CLOCKS_PER_SEC only make sense with the return value of clock(). I wouldn't
>> be surprised if this calculation goes totally bonkers on BSD or Windows
>> systems.
>
> http://linux.die.net/man/3/clock
> C89, C99, POSIX.1-2001. POSIX requires that CLOCKS_PER_SEC equals 1000000
> independent of the actual resolution.
>
> This should be valid for BSD too. Ok, this has no effect on Windows.
> Replace definition with get_ticks_per_sec()? Defining USEC_PER_SEC (or
> NSEC_xxx) does not make much sense.

I'd just define NSEC_PER_SEC.

>
>> I still don't understand why we're doing a MIN() here. I would understand a
>> MAX() to guard against unsynchronized times on source/destination.
> If I added time difference between out-of-sync hosts, the delta would be
> huge and the destination guest would behave really weird until this time
> difference is over. Guests can cope with accidental timebase jump of
> seconds but not minutes.

Exactly. That's MAX(), right?

>
>
>
>>> +    migration_duration_tb = migration_duration_us * (freq /
>>> CLOCKS_PER_SEC);
>> I think muldiv64() is safer here in case tb_freq is small and the clock
>> rate is high, no?
> Oh. Ok.
>
>
>
>>> +    guest_tb = tb->guest_timebase + MIN(0, migration_duration_tb);
>>> +
>>> +    tb_off_adj = guest_tb - cpu_get_real_ticks();
>>> +
>>> +    tb_off = first_ppc_cpu->env.tb_env->tb_offset;
>> Before migration env->tb_env->tb_offset should always be 0, no?
>
> Before the very first migration - yes. Before subsequent ones - no.

Good point :).

>
>
>>> +    trace_ppc_tb_adjust(tb_off, tb_off_adj, tb_off_adj - tb_off,
>>> +                        (tb_off_adj - tb_off) / freq);
>>> +
>>> +    /* Set new offset to all CPUs */
>>> +    CPU_FOREACH(cpu) {
>>> +        PowerPCCPU *pcpu = POWERPC_CPU(cpu);
>>> +        pcpu->env.tb_env->tb_offset = tb_off_adj;
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +const VMStateDescription vmstate_ppc_timebase = {
>>> +    .name = "timebase",
>>> +    .version_id = 1,
>>> +    .minimum_version_id = 1,
>>> +    .minimum_version_id_old = 1,
>>> +    .pre_save = timebase_pre_save,
>>> +    .post_load = timebase_post_load,
>>> +    .fields      = (VMStateField []) {
>>> +        VMSTATE_UINT64(guest_timebase, PPCTimebase),
>>> +        VMSTATE_UINT64(time_of_the_day, PPCTimebase),
>>> +        VMSTATE_END_OF_LIST()
>>> +    },
>>> +};
>>> +
>>>    /* Set up (once) timebase frequency (in Hz) */
>>>    clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
>>>    {
>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>> index 451c473..6d5412b 100644
>>> --- a/hw/ppc/spapr.c
>>> +++ b/hw/ppc/spapr.c
>>> @@ -818,7 +818,7 @@ static int spapr_vga_init(PCIBus *pci_bus)
>>>      static const VMStateDescription vmstate_spapr = {
>>>        .name = "spapr",
>>> -    .version_id = 1,
>>> +    .version_id = 2,
>> Ok, good, you're bumping the version because we become incompatible, but ...
>>
>>>        .minimum_version_id = 1,
>>>        .minimum_version_id_old = 1,
>>>        .fields      = (VMStateField []) {
>>> @@ -826,7 +826,7 @@ static const VMStateDescription vmstate_spapr = {
>>>              /* RTC offset */
>>>            VMSTATE_UINT64(rtc_offset, sPAPREnvironment),
>>> -
>>> +        VMSTATE_PPC_TIMEBASE(tb, sPAPREnvironment),
>> Don't we have to indicate somewhere how to load version 1 if we don't
>> declare it the new minimum_version?
> I can fix VMSTATE_PPC_TIMEBASE to accept version and make is "2", yes.
>
> I just do not see much point in this as I seem to be the only person who
> does something about migration on PPC regularly and for me migration in the
> upstream QEMU is useless without this patch :)

I think it makes sense to learn how to keep migration backwards 
compatible ;).


Alex
Alexey Kardashevskiy April 14, 2014, 8:10 a.m. UTC | #4
On 04/14/2014 05:46 PM, Alexander Graf wrote:
> 
> On 14.04.14 09:33, Alexey Kardashevskiy wrote:
>> On 04/14/2014 05:08 PM, Alexander Graf wrote:
>>> On 12.04.14 17:52, Alexey Kardashevskiy wrote:
>>>> This allows guests to have a different timebase origin from the host.
>>>>
>>>> This is needed for migration, where a guest can migrate from one host
>>>> to another and the two hosts might have a different timebase origin.
>>>> However, the timebase seen by the guest must not go backwards, and
>>>> should go forwards only by a small amount corresponding to the time
>>>> taken for the migration.
>>>>
>>>> This is only supported for recent POWER hardware which has the TBU40
>>>> (timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
>>>> 970.
>>>>
>>>> This adds kvm_access_one_reg() to access a special register which is not
>>>> in env->spr.
>>>>
>>>> The feature must be present in the host kernel.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>> Changes:
>>>> v5:
>>>> * fixed multiple comments in cpu_ppc_get_adjusted_tb and merged it
>>>> into timebase_post_load()
>>>> * removed round_up(1<<24) as KVM is expected to do this anyway
>>>> * removed @freq from migration stream
>>>> * renamed PPCTimebaseOffset to PPCTimebase
>>>> * CLOCKS_PER_SEC is used as a constant which 1000000us/s (man clock)
>>>>
>>>> v4:
>>>> * made it per machine timebase offser rather than per CPU
>>>>
>>>> v3:
>>>> * kvm_access_one_reg moved out to a separate patch
>>>> * tb_offset and host_timebase were replaced with guest_timebase as
>>>> the destionation does not really care of offset on the source
>>>>
>>>> v2:
>>>> * bumped the vmstate_ppc_cpu version
>>>> * defined version for the env.tb_env field
>>>> ---
>>>>    hw/ppc/ppc.c           | 76
>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    hw/ppc/spapr.c         |  4 +--
>>>>    include/hw/ppc/spapr.h |  1 +
>>>>    target-ppc/cpu-qom.h   | 15 ++++++++++
>>>>    target-ppc/kvm.c       |  5 ++++
>>>>    trace-events           |  3 ++
>>>>    6 files changed, 102 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
>>>> index 71df471..bf61a0a 100644
>>>> --- a/hw/ppc/ppc.c
>>>> +++ b/hw/ppc/ppc.c
>>>> @@ -29,9 +29,11 @@
>>>>    #include "sysemu/cpus.h"
>>>>    #include "hw/timer/m48t59.h"
>>>>    #include "qemu/log.h"
>>>> +#include "qemu/error-report.h"
>>>>    #include "hw/loader.h"
>>>>    #include "sysemu/kvm.h"
>>>>    #include "kvm_ppc.h"
>>>> +#include "trace.h"
>>>>      //#define PPC_DEBUG_IRQ
>>>>    //#define PPC_DEBUG_TB
>>>> @@ -829,6 +831,80 @@ static void cpu_ppc_set_tb_clk (void *opaque,
>>>> uint32_t freq)
>>>>        cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
>>>>    }
>>>>    +static void timebase_pre_save(void *opaque)
>>>> +{
>>>> +    PPCTimebase *tb = opaque;
>>>> +    uint64_t ticks = cpu_get_real_ticks();
>>>> +    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
>>>> +
>>>> +    if (!first_ppc_cpu->env.tb_env) {
>>>> +        error_report("No timebase object");
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    tb->time_of_the_day = get_clock_realtime() / 1000;
>>> It'd be good to indicate in the field name that we're in us units. In fact
>>> it probably makes sense to use ns throughout and not convert :).
>>> Nanoseconds are QEMU's "native" internal time format.
>> Ooookay, I'll make it "ns", then no indicator is needed, right?
> 
> I still prefer to keep an indicator. Makes it more readable.
> 
>>
>>
>>>> +    /*
>>>> +     * tb_offset is only expected to be changed by migration so
>>>> +     * there is no need to update it from KVM here
>>>> +     */
>>>> +    tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
>>>> +}
>>>> +
>>>> +static int timebase_post_load(void *opaque, int version_id)
>>>> +{
>>>> +    PPCTimebase *tb = opaque;
>>>> +    CPUState *cpu;
>>>> +    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
>>>> +    int64_t tb_off_adj, tb_off;
>>>> +    int64_t migration_duration_us, migration_duration_tb, guest_tb,
>>>> host_us;
>>>> +    unsigned long freq;
>>>> +
>>>> +    if (!first_ppc_cpu->env.tb_env) {
>>>> +        error_report("No timebase object");
>>>> +        return -1;
>>>> +    }
>>>> +
>>>> +    freq = first_ppc_cpu->env.tb_env->tb_freq;
>>>> +    /*
>>>> +     * Calculate timebase on the destination side of migration.
>>>> +     * The destination timebase must be not less than the source
>>>> timebase.
>>>> +     * We try to adjust timebase by downtime if host clocks are not
>>>> +     * too much out of sync (1 second for now).
>>>> +     */
>>>> +    host_us = get_clock_realtime() / 1000;
>>>> +    migration_duration_us = MIN(CLOCKS_PER_SEC, host_us -
>>>> tb->time_of_the_day);
>>> CLOCKS_PER_SEC only make sense with the return value of clock(). I wouldn't
>>> be surprised if this calculation goes totally bonkers on BSD or Windows
>>> systems.
>>
>> http://linux.die.net/man/3/clock
>> C89, C99, POSIX.1-2001. POSIX requires that CLOCKS_PER_SEC equals 1000000
>> independent of the actual resolution.
>>
>> This should be valid for BSD too. Ok, this has no effect on Windows.
>> Replace definition with get_ticks_per_sec()? Defining USEC_PER_SEC (or
>> NSEC_xxx) does not make much sense.
> 
> I'd just define NSEC_PER_SEC.
> 
>>
>>> I still don't understand why we're doing a MIN() here. I would understand a
>>> MAX() to guard against unsynchronized times on source/destination.
>> If I added time difference between out-of-sync hosts, the delta would be
>> huge and the destination guest would behave really weird until this time
>> difference is over. Guests can cope with accidental timebase jump of
>> seconds but not minutes.
> 
> Exactly. That's MAX(), right?


Sorry, I am not following you here :(
MAX gives bigger number than MIN and bigger number is what I want to avoid.
I even tried both, MIN works, MAX does not.
Alexander Graf April 14, 2014, 8:16 a.m. UTC | #5
On 14.04.14 10:10, Alexey Kardashevskiy wrote:
> On 04/14/2014 05:46 PM, Alexander Graf wrote:
>> On 14.04.14 09:33, Alexey Kardashevskiy wrote:
>>> On 04/14/2014 05:08 PM, Alexander Graf wrote:
>>>> On 12.04.14 17:52, Alexey Kardashevskiy wrote:
>>>>> This allows guests to have a different timebase origin from the host.
>>>>>
>>>>> This is needed for migration, where a guest can migrate from one host
>>>>> to another and the two hosts might have a different timebase origin.
>>>>> However, the timebase seen by the guest must not go backwards, and
>>>>> should go forwards only by a small amount corresponding to the time
>>>>> taken for the migration.
>>>>>
>>>>> This is only supported for recent POWER hardware which has the TBU40
>>>>> (timebase upper 40 bits) register. That includes POWER6, 7, 8 but not
>>>>> 970.
>>>>>
>>>>> This adds kvm_access_one_reg() to access a special register which is not
>>>>> in env->spr.
>>>>>
>>>>> The feature must be present in the host kernel.
>>>>>
>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>> ---
>>>>> Changes:
>>>>> v5:
>>>>> * fixed multiple comments in cpu_ppc_get_adjusted_tb and merged it
>>>>> into timebase_post_load()
>>>>> * removed round_up(1<<24) as KVM is expected to do this anyway
>>>>> * removed @freq from migration stream
>>>>> * renamed PPCTimebaseOffset to PPCTimebase
>>>>> * CLOCKS_PER_SEC is used as a constant which 1000000us/s (man clock)
>>>>>
>>>>> v4:
>>>>> * made it per machine timebase offser rather than per CPU
>>>>>
>>>>> v3:
>>>>> * kvm_access_one_reg moved out to a separate patch
>>>>> * tb_offset and host_timebase were replaced with guest_timebase as
>>>>> the destionation does not really care of offset on the source
>>>>>
>>>>> v2:
>>>>> * bumped the vmstate_ppc_cpu version
>>>>> * defined version for the env.tb_env field
>>>>> ---
>>>>>     hw/ppc/ppc.c           | 76
>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>     hw/ppc/spapr.c         |  4 +--
>>>>>     include/hw/ppc/spapr.h |  1 +
>>>>>     target-ppc/cpu-qom.h   | 15 ++++++++++
>>>>>     target-ppc/kvm.c       |  5 ++++
>>>>>     trace-events           |  3 ++
>>>>>     6 files changed, 102 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
>>>>> index 71df471..bf61a0a 100644
>>>>> --- a/hw/ppc/ppc.c
>>>>> +++ b/hw/ppc/ppc.c
>>>>> @@ -29,9 +29,11 @@
>>>>>     #include "sysemu/cpus.h"
>>>>>     #include "hw/timer/m48t59.h"
>>>>>     #include "qemu/log.h"
>>>>> +#include "qemu/error-report.h"
>>>>>     #include "hw/loader.h"
>>>>>     #include "sysemu/kvm.h"
>>>>>     #include "kvm_ppc.h"
>>>>> +#include "trace.h"
>>>>>       //#define PPC_DEBUG_IRQ
>>>>>     //#define PPC_DEBUG_TB
>>>>> @@ -829,6 +831,80 @@ static void cpu_ppc_set_tb_clk (void *opaque,
>>>>> uint32_t freq)
>>>>>         cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
>>>>>     }
>>>>>     +static void timebase_pre_save(void *opaque)
>>>>> +{
>>>>> +    PPCTimebase *tb = opaque;
>>>>> +    uint64_t ticks = cpu_get_real_ticks();
>>>>> +    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
>>>>> +
>>>>> +    if (!first_ppc_cpu->env.tb_env) {
>>>>> +        error_report("No timebase object");
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    tb->time_of_the_day = get_clock_realtime() / 1000;
>>>> It'd be good to indicate in the field name that we're in us units. In fact
>>>> it probably makes sense to use ns throughout and not convert :).
>>>> Nanoseconds are QEMU's "native" internal time format.
>>> Ooookay, I'll make it "ns", then no indicator is needed, right?
>> I still prefer to keep an indicator. Makes it more readable.
>>
>>>
>>>>> +    /*
>>>>> +     * tb_offset is only expected to be changed by migration so
>>>>> +     * there is no need to update it from KVM here
>>>>> +     */
>>>>> +    tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
>>>>> +}
>>>>> +
>>>>> +static int timebase_post_load(void *opaque, int version_id)
>>>>> +{
>>>>> +    PPCTimebase *tb = opaque;
>>>>> +    CPUState *cpu;
>>>>> +    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
>>>>> +    int64_t tb_off_adj, tb_off;
>>>>> +    int64_t migration_duration_us, migration_duration_tb, guest_tb,
>>>>> host_us;
>>>>> +    unsigned long freq;
>>>>> +
>>>>> +    if (!first_ppc_cpu->env.tb_env) {
>>>>> +        error_report("No timebase object");
>>>>> +        return -1;
>>>>> +    }
>>>>> +
>>>>> +    freq = first_ppc_cpu->env.tb_env->tb_freq;
>>>>> +    /*
>>>>> +     * Calculate timebase on the destination side of migration.
>>>>> +     * The destination timebase must be not less than the source
>>>>> timebase.
>>>>> +     * We try to adjust timebase by downtime if host clocks are not
>>>>> +     * too much out of sync (1 second for now).
>>>>> +     */
>>>>> +    host_us = get_clock_realtime() / 1000;
>>>>> +    migration_duration_us = MIN(CLOCKS_PER_SEC, host_us -
>>>>> tb->time_of_the_day);
>>>> CLOCKS_PER_SEC only make sense with the return value of clock(). I wouldn't
>>>> be surprised if this calculation goes totally bonkers on BSD or Windows
>>>> systems.
>>> http://linux.die.net/man/3/clock
>>> C89, C99, POSIX.1-2001. POSIX requires that CLOCKS_PER_SEC equals 1000000
>>> independent of the actual resolution.
>>>
>>> This should be valid for BSD too. Ok, this has no effect on Windows.
>>> Replace definition with get_ticks_per_sec()? Defining USEC_PER_SEC (or
>>> NSEC_xxx) does not make much sense.
>> I'd just define NSEC_PER_SEC.
>>
>>>> I still don't understand why we're doing a MIN() here. I would understand a
>>>> MAX() to guard against unsynchronized times on source/destination.
>>> If I added time difference between out-of-sync hosts, the delta would be
>>> huge and the destination guest would behave really weird until this time
>>> difference is over. Guests can cope with accidental timebase jump of
>>> seconds but not minutes.
>> Exactly. That's MAX(), right?
>
> Sorry, I am not following you here :(
> MAX gives bigger number than MIN and bigger number is what I want to avoid.
> I even tried both, MIN works, MAX does not.

-EMESTUPID.

You're right :)


Alex
diff mbox

Patch

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index 71df471..bf61a0a 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -29,9 +29,11 @@ 
 #include "sysemu/cpus.h"
 #include "hw/timer/m48t59.h"
 #include "qemu/log.h"
+#include "qemu/error-report.h"
 #include "hw/loader.h"
 #include "sysemu/kvm.h"
 #include "kvm_ppc.h"
+#include "trace.h"
 
 //#define PPC_DEBUG_IRQ
 //#define PPC_DEBUG_TB
@@ -829,6 +831,80 @@  static void cpu_ppc_set_tb_clk (void *opaque, uint32_t freq)
     cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
 }
 
+static void timebase_pre_save(void *opaque)
+{
+    PPCTimebase *tb = opaque;
+    uint64_t ticks = cpu_get_real_ticks();
+    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
+
+    if (!first_ppc_cpu->env.tb_env) {
+        error_report("No timebase object");
+        return;
+    }
+
+    tb->time_of_the_day = get_clock_realtime() / 1000;
+    /*
+     * tb_offset is only expected to be changed by migration so
+     * there is no need to update it from KVM here
+     */
+    tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
+}
+
+static int timebase_post_load(void *opaque, int version_id)
+{
+    PPCTimebase *tb = opaque;
+    CPUState *cpu;
+    PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
+    int64_t tb_off_adj, tb_off;
+    int64_t migration_duration_us, migration_duration_tb, guest_tb, host_us;
+    unsigned long freq;
+
+    if (!first_ppc_cpu->env.tb_env) {
+        error_report("No timebase object");
+        return -1;
+    }
+
+    freq = first_ppc_cpu->env.tb_env->tb_freq;
+    /*
+     * Calculate timebase on the destination side of migration.
+     * The destination timebase must be not less than the source timebase.
+     * We try to adjust timebase by downtime if host clocks are not
+     * too much out of sync (1 second for now).
+     */
+    host_us = get_clock_realtime() / 1000;
+    migration_duration_us = MIN(CLOCKS_PER_SEC, host_us - tb->time_of_the_day);
+    migration_duration_tb = migration_duration_us * (freq / CLOCKS_PER_SEC);
+    guest_tb = tb->guest_timebase + MIN(0, migration_duration_tb);
+
+    tb_off_adj = guest_tb - cpu_get_real_ticks();
+
+    tb_off = first_ppc_cpu->env.tb_env->tb_offset;
+    trace_ppc_tb_adjust(tb_off, tb_off_adj, tb_off_adj - tb_off,
+                        (tb_off_adj - tb_off) / freq);
+
+    /* Set new offset to all CPUs */
+    CPU_FOREACH(cpu) {
+        PowerPCCPU *pcpu = POWERPC_CPU(cpu);
+        pcpu->env.tb_env->tb_offset = tb_off_adj;
+    }
+
+    return 0;
+}
+
+const VMStateDescription vmstate_ppc_timebase = {
+    .name = "timebase",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .pre_save = timebase_pre_save,
+    .post_load = timebase_post_load,
+    .fields      = (VMStateField []) {
+        VMSTATE_UINT64(guest_timebase, PPCTimebase),
+        VMSTATE_UINT64(time_of_the_day, PPCTimebase),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 /* Set up (once) timebase frequency (in Hz) */
 clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t freq)
 {
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 451c473..6d5412b 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -818,7 +818,7 @@  static int spapr_vga_init(PCIBus *pci_bus)
 
 static const VMStateDescription vmstate_spapr = {
     .name = "spapr",
-    .version_id = 1,
+    .version_id = 2,
     .minimum_version_id = 1,
     .minimum_version_id_old = 1,
     .fields      = (VMStateField []) {
@@ -826,7 +826,7 @@  static const VMStateDescription vmstate_spapr = {
 
         /* RTC offset */
         VMSTATE_UINT64(rtc_offset, sPAPREnvironment),
-
+        VMSTATE_PPC_TIMEBASE(tb, sPAPREnvironment),
         VMSTATE_END_OF_LIST()
     },
 };
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 5fdac1e..9f8bb89 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -29,6 +29,7 @@  typedef struct sPAPREnvironment {
     target_ulong entry_point;
     uint32_t next_irq;
     uint64_t rtc_offset;
+    struct PPCTimebase tb;
     bool has_graphics;
 
     uint32_t epow_irq;
diff --git a/target-ppc/cpu-qom.h b/target-ppc/cpu-qom.h
index 47dc8e6..e185850 100644
--- a/target-ppc/cpu-qom.h
+++ b/target-ppc/cpu-qom.h
@@ -120,6 +120,21 @@  int ppc64_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs,
                                int cpuid, void *opaque);
 #ifndef CONFIG_USER_ONLY
 extern const struct VMStateDescription vmstate_ppc_cpu;
+
+typedef struct PPCTimebase {
+    uint64_t guest_timebase;
+    uint64_t time_of_the_day;
+} PPCTimebase;
+
+extern const struct VMStateDescription vmstate_ppc_timebase;
+
+#define VMSTATE_PPC_TIMEBASE(_field, _state) {                        \
+    .name       = (stringify(_field)),                                \
+    .size       = sizeof(PPCTimebase),                                \
+    .vmsd       = &vmstate_ppc_timebase,                              \
+    .flags      = VMS_STRUCT,                                         \
+    .offset     = vmstate_offset_value(_state, _field, PPCTimebase),  \
+}
 #endif
 
 #endif
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 73dbb02..a8a1498 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -35,6 +35,7 @@ 
 #include "hw/sysbus.h"
 #include "hw/ppc/spapr.h"
 #include "hw/ppc/spapr_vio.h"
+#include "hw/ppc/ppc.h"
 #include "sysemu/watchdog.h"
 #include "trace.h"
 
@@ -890,6 +891,8 @@  int kvm_arch_put_registers(CPUState *cs, int level)
                 DPRINTF("Warning: Unable to set VPA information to KVM\n");
             }
         }
+
+        kvm_set_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &env->tb_env->tb_offset);
 #endif /* TARGET_PPC64 */
     }
 
@@ -1133,6 +1136,8 @@  int kvm_arch_get_registers(CPUState *cs)
                 DPRINTF("Warning: Unable to get VPA information from KVM\n");
             }
         }
+
+        kvm_get_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &env->tb_env->tb_offset);
 #endif
     }
 
diff --git a/trace-events b/trace-events
index 9303245..ce629c1 100644
--- a/trace-events
+++ b/trace-events
@@ -1161,6 +1161,9 @@  spapr_iommu_get(uint64_t liobn, uint64_t ioba, uint64_t ret, uint64_t tce) "liob
 spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u mask=%x"
 spapr_iommu_new_table(uint64_t liobn, void *tcet, void *table, int fd) "liobn=%"PRIx64" tcet=%p table=%p fd=%d"
 
+# hw/ppc/ppc.c
+ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)"
+
 # util/hbitmap.c
 hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
 hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64