diff mbox

[PULL,095/107] spapr: clock should count only if vm is running

Message ID 20170202051445.5735-96-david@gibson.dropbear.id.au
State New
Headers show

Commit Message

David Gibson Feb. 2, 2017, 5:14 a.m. UTC
From: Laurent Vivier <lvivier@redhat.com>

This is a port to ppc of the i386 commit:
    00f4d64 kvmclock: clock should count only if vm is running

We remove timebase_post_load function, and use the VM state
change handler to save and restore the guest_timebase (on stop
and continue).

We keep timebase_pre_save to reduce the clock difference on
migration like in:
    6053a86 kvmclock: reduce kvmclock difference on migration

Time base offset has originally been introduced by commit
    98a8b52 spapr: Add support for time base offset migration

So while VM is paused, the time is stopped. This allows to have
the same result with date (based on Time Base Register) and
hwclock (based on "get-time-of-day" RTAS call).

Moreover in TCG mode, the Time Base is always paused, so this
patch also adjust the behavior between TCG and KVM.

VM state field "time_of_the_day_ns" is now useless but we keep
it to be able to migrate to older version of the machine.

As vmstate_ppc_timebase structure (with timebase_pre_save() and
timebase_post_load() functions) was only used by vmstate_spapr,
we register the VM state change handler only in ppc_spapr_init().

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 hw/ppc/ppc.c         | 66 ++++++++++++++++++++++++++++++++++------------------
 hw/ppc/spapr.c       |  6 +++++
 target/ppc/cpu-qom.h |  3 +++
 3 files changed, 52 insertions(+), 23 deletions(-)

Comments

Mark Cave-Ayland Feb. 2, 2017, 8:37 a.m. UTC | #1
On 02/02/17 05:14, David Gibson wrote:

> From: Laurent Vivier <lvivier@redhat.com>
> 
> This is a port to ppc of the i386 commit:
>     00f4d64 kvmclock: clock should count only if vm is running
> 
> We remove timebase_post_load function, and use the VM state
> change handler to save and restore the guest_timebase (on stop
> and continue).
> 
> We keep timebase_pre_save to reduce the clock difference on
> migration like in:
>     6053a86 kvmclock: reduce kvmclock difference on migration
> 
> Time base offset has originally been introduced by commit
>     98a8b52 spapr: Add support for time base offset migration
> 
> So while VM is paused, the time is stopped. This allows to have
> the same result with date (based on Time Base Register) and
> hwclock (based on "get-time-of-day" RTAS call).
> 
> Moreover in TCG mode, the Time Base is always paused, so this
> patch also adjust the behavior between TCG and KVM.
> 
> VM state field "time_of_the_day_ns" is now useless but we keep
> it to be able to migrate to older version of the machine.
> 
> As vmstate_ppc_timebase structure (with timebase_pre_save() and
> timebase_post_load() functions) was only used by vmstate_spapr,
> we register the VM state change handler only in ppc_spapr_init().
> 
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  hw/ppc/ppc.c         | 66 ++++++++++++++++++++++++++++++++++------------------
>  hw/ppc/spapr.c       |  6 +++++
>  target/ppc/cpu-qom.h |  3 +++
>  3 files changed, 52 insertions(+), 23 deletions(-)
> 
> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> index f9a4b51..d171e60 100644
> --- a/hw/ppc/ppc.c
> +++ b/hw/ppc/ppc.c
> @@ -847,9 +847,8 @@ static void cpu_ppc_set_tb_clk (void *opaque, uint32_t freq)
>      cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
>  }
>  
> -static void timebase_pre_save(void *opaque)
> +static void timebase_save(PPCTimebase *tb)
>  {
> -    PPCTimebase *tb = opaque;
>      uint64_t ticks = cpu_get_host_ticks();
>      PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
>  
> @@ -858,43 +857,30 @@ static void timebase_pre_save(void *opaque)
>          return;
>      }
>  
> +    /* not used anymore, we keep it for compatibility */
>      tb->time_of_the_day_ns = qemu_clock_get_ns(QEMU_CLOCK_HOST);
>      /*
> -     * tb_offset is only expected to be changed by migration so
> +     * tb_offset is only expected to be changed by QEMU so
>       * there is no need to update it from KVM here
>       */
>      tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
>  }
>  
> -static int timebase_post_load(void *opaque, int version_id)
> +static void timebase_load(PPCTimebase *tb)
>  {
> -    PPCTimebase *tb_remote = opaque;
>      CPUState *cpu;
>      PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
> -    int64_t tb_off_adj, tb_off, ns_diff;
> -    int64_t migration_duration_ns, migration_duration_tb, guest_tb, host_ns;
> +    int64_t tb_off_adj, tb_off;
>      unsigned long freq;
>  
>      if (!first_ppc_cpu->env.tb_env) {
>          error_report("No timebase object");
> -        return -1;
> +        return;
>      }
>  
>      freq = first_ppc_cpu->env.tb_env->tb_freq;
> -    /*
> -     * Calculate timebase on the destination side of migration.
> -     * The destination timebase must be not less than the source timebase.
> -     * We try to adjust timebase by downtime if host clocks are not
> -     * too much out of sync (1 second for now).
> -     */
> -    host_ns = qemu_clock_get_ns(QEMU_CLOCK_HOST);
> -    ns_diff = MAX(0, host_ns - tb_remote->time_of_the_day_ns);
> -    migration_duration_ns = MIN(NANOSECONDS_PER_SECOND, ns_diff);
> -    migration_duration_tb = muldiv64(freq, migration_duration_ns,
> -                                     NANOSECONDS_PER_SECOND);
> -    guest_tb = tb_remote->guest_timebase + MIN(0, migration_duration_tb);
>  
> -    tb_off_adj = guest_tb - cpu_get_host_ticks();
> +    tb_off_adj = tb->guest_timebase - cpu_get_host_ticks();
>  
>      tb_off = first_ppc_cpu->env.tb_env->tb_offset;
>      trace_ppc_tb_adjust(tb_off, tb_off_adj, tb_off_adj - tb_off,
> @@ -904,9 +890,44 @@ static int timebase_post_load(void *opaque, int version_id)
>      CPU_FOREACH(cpu) {
>          PowerPCCPU *pcpu = POWERPC_CPU(cpu);
>          pcpu->env.tb_env->tb_offset = tb_off_adj;
> +#if defined(CONFIG_KVM)
> +        kvm_set_one_reg(cpu, KVM_REG_PPC_TB_OFFSET,
> +                        &pcpu->env.tb_env->tb_offset);
> +#endif
>      }
> +}
>  
> -    return 0;
> +void cpu_ppc_clock_vm_state_change(void *opaque, int running,
> +                                   RunState state)
> +{
> +    PPCTimebase *tb = opaque;
> +
> +    if (running) {
> +        timebase_load(tb);
> +    } else {
> +        timebase_save(tb);
> +    }
> +}
> +
> +/*
> + * When migrating, read the clock just before migration,
> + * so that the guest clock counts during the events
> + * between:
> + *
> + *  * vm_stop()
> + *  *
> + *  * pre_save()
> + *
> + *  This reduces clock difference on migration from 5s
> + *  to 0.1s (when max_downtime == 5s), because sending the
> + *  final pages of memory (which happens between vm_stop()
> + *  and pre_save()) takes max_downtime.
> + */
> +static void timebase_pre_save(void *opaque)
> +{
> +    PPCTimebase *tb = opaque;
> +
> +    timebase_save(tb);
>  }
>  
>  const VMStateDescription vmstate_ppc_timebase = {
> @@ -915,7 +936,6 @@ const VMStateDescription vmstate_ppc_timebase = {
>      .minimum_version_id = 1,
>      .minimum_version_id_old = 1,
>      .pre_save = timebase_pre_save,
> -    .post_load = timebase_post_load,
>      .fields      = (VMStateField []) {
>          VMSTATE_UINT64(guest_timebase, PPCTimebase),
>          VMSTATE_INT64(time_of_the_day_ns, PPCTimebase),
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index b71cd7a..9fc3fb9 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2129,6 +2129,12 @@ static void ppc_spapr_init(MachineState *machine)
>      qemu_register_reset(spapr_ccs_reset_hook, spapr);
>  
>      qemu_register_boot_set(spapr_boot_set, spapr);
> +
> +    /* to stop and start vmclock */
> +    if (kvm_enabled()) {
> +        qemu_add_vm_change_state_handler(cpu_ppc_clock_vm_state_change,
> +                                         &spapr->tb);
> +    }
>  }
>  
>  static int spapr_kvm_type(const char *vm_type)
> diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h
> index d46c31a..b7977ba 100644
> --- a/target/ppc/cpu-qom.h
> +++ b/target/ppc/cpu-qom.h
> @@ -214,6 +214,9 @@ extern const struct VMStateDescription vmstate_ppc_timebase;
>      .flags      = VMS_STRUCT,                                         \
>      .offset     = vmstate_offset_value(_state, _field, PPCTimebase),  \
>  }
> +
> +void cpu_ppc_clock_vm_state_change(void *opaque, int running,
> +                                   RunState state);
>  #endif
>  
>  #endif

Hi David/Laurent,

I just noticed this in your pull request today - this looks like it is
along similar lines to the prototype patch I proposed last year as part
of the decrementer migration thread discussion, i.e. use a
vm_change_state_handler() to sync the clock on pause/resume.

Am I right in thinking this now solves the timebase migration problem,
and so the only part required is to encode the decrementer relative to
the timebase during migration to ensure its value is also migrated
correctly?


ATB,

Mark.
Laurent Vivier Feb. 2, 2017, 9:13 a.m. UTC | #2
On 02/02/2017 09:37, Mark Cave-Ayland wrote:
> On 02/02/17 05:14, David Gibson wrote:
> 
...
> Hi David/Laurent,

Hi Mark,

> I just noticed this in your pull request today - this looks like it is
> along similar lines to the prototype patch I proposed last year as part
> of the decrementer migration thread discussion, i.e. use a
> vm_change_state_handler() to sync the clock on pause/resume.
> 
> Am I right in thinking this now solves the timebase migration problem,
> and so the only part required is to encode the decrementer relative to
> the timebase during migration to ensure its value is also migrated
> correctly?

Do you have a link to this thread discussion?

The main purpose of this patch was only to stop the clock (TBR) while
the machine is paused, so I'd like to know what is the problem you are
speaking about.

Thanks,
Laurent
Mark Cave-Ayland Feb. 2, 2017, 10:40 a.m. UTC | #3
On 02/02/17 09:13, Laurent Vivier wrote:

> On 02/02/2017 09:37, Mark Cave-Ayland wrote:
>> On 02/02/17 05:14, David Gibson wrote:
>>
> ...
>> Hi David/Laurent,
> 
> Hi Mark,
> 
>> I just noticed this in your pull request today - this looks like it is
>> along similar lines to the prototype patch I proposed last year as part
>> of the decrementer migration thread discussion, i.e. use a
>> vm_change_state_handler() to sync the clock on pause/resume.
>>
>> Am I right in thinking this now solves the timebase migration problem,
>> and so the only part required is to encode the decrementer relative to
>> the timebase during migration to ensure its value is also migrated
>> correctly?
> 
> Do you have a link to this thread discussion?
> 
> The main purpose of this patch was only to stop the clock (TBR) while
> the machine is paused, so I'd like to know what is the problem you are
> speaking about.

Hi Laurent,

Yes indeed. The discussion spanned a couple of threads last year, but
the start of it was my patch to migrate the decrementer to fix an issue
I was seeing when migrating Darwin images on the Mac machines under TCG:

https://lists.gnu.org/archive/html/qemu-devel/2016-01/msg00544.html

This then eventually became a separate thread here:

https://lists.gnu.org/archive/html/qemu-devel/2016-01/msg04622.html


ATB,

Mark.
Laurent Vivier Feb. 2, 2017, 2:20 p.m. UTC | #4
On 02/02/2017 11:40, Mark Cave-Ayland wrote:
> On 02/02/17 09:13, Laurent Vivier wrote:
> 
>> On 02/02/2017 09:37, Mark Cave-Ayland wrote:
>>> On 02/02/17 05:14, David Gibson wrote:
>>>
>> ...
>>> Hi David/Laurent,
>>
>> Hi Mark,
>>
>>> I just noticed this in your pull request today - this looks like it is
>>> along similar lines to the prototype patch I proposed last year as part
>>> of the decrementer migration thread discussion, i.e. use a
>>> vm_change_state_handler() to sync the clock on pause/resume.
>>>
>>> Am I right in thinking this now solves the timebase migration problem,
>>> and so the only part required is to encode the decrementer relative to
>>> the timebase during migration to ensure its value is also migrated
>>> correctly?
>>
>> Do you have a link to this thread discussion?
>>
>> The main purpose of this patch was only to stop the clock (TBR) while
>> the machine is paused, so I'd like to know what is the problem you are
>> speaking about.
> 
> Hi Laurent,
> 
> Yes indeed. The discussion spanned a couple of threads last year, but
> the start of it was my patch to migrate the decrementer to fix an issue
> I was seeing when migrating Darwin images on the Mac machines under TCG:
> 
> https://lists.gnu.org/archive/html/qemu-devel/2016-01/msg00544.html
> 
> This then eventually became a separate thread here:
> 
> https://lists.gnu.org/archive/html/qemu-devel/2016-01/msg04622.html

I think adding the the PPCTimebase field and the VMSTATE_PPC_TIMEBASE_V
macro to the PMac machines should fix your issue.

Do you have a test case I can try?

Laurent
Mark Cave-Ayland Feb. 2, 2017, 3:50 p.m. UTC | #5
On 02/02/17 14:20, Laurent Vivier wrote:

> I think adding the the PPCTimebase field and the VMSTATE_PPC_TIMEBASE_V
> macro to the PMac machines should fix your issue.
> 
> Do you have a test case I can try?
> 
> Laurent

Hi Laurent,

Yes I'd say that is required, although I still think you need to migrate
the decrementer value as per the comments on
https://lists.gnu.org/archive/html/qemu-devel/2016-01/msg00546.html.

Here's the reproducer from an off-list email I sent last year:

1) Download https://www.ilande.co.uk/tmp/darwin_empty.qcow2.xz and
decompress the image (it's a pre-partitioned empty Apple Partition Map disk)

2) Download https://opensource.apple.com/static/iso/darwinppc-602.cdr.gz
image, gunzip it and rename with .iso extension

3) Start QEMU using the attached "empty" disk like this:

./qemu-system-ppc -hda darwin_empty.qcow2 -cdrom darwinppc-602.iso -boot d

4) Start the installer in the guest and you'll see lots of files with
ASCII progress bars as the various files are copied to disk

Then to see the problem with the progress bar, repeat the following:

5) Pause the VM

6) Issue "savevm foo" in the monitor

7) Exit QEMU

8) Start QEMU again as below:

./qemu-system-ppc -hda darwin_empty.qcow2 -cdrom darwinppc-602.iso -boot
d -loadvm foo

If you do this enough times (maybe 10 or so?) you'll see the progress
bars stop working correctly and get out of sync, i.e. it will freeze for
long periods of time and then "jump" to catch-up but not all the way.

With my above patch applied to include the decrementer in the migration,
the bug was no longer visible in my tests.


HTH,

Mark.
Alexander Graf Dec. 13, 2017, 7:19 p.m. UTC | #6
On 02.02.17 06:14, David Gibson wrote:
> From: Laurent Vivier <lvivier@redhat.com>
> 
> This is a port to ppc of the i386 commit:
>     00f4d64 kvmclock: clock should count only if vm is running
> 
> We remove timebase_post_load function, and use the VM state
> change handler to save and restore the guest_timebase (on stop
> and continue).
> 
> We keep timebase_pre_save to reduce the clock difference on
> migration like in:
>     6053a86 kvmclock: reduce kvmclock difference on migration
> 
> Time base offset has originally been introduced by commit
>     98a8b52 spapr: Add support for time base offset migration
> 
> So while VM is paused, the time is stopped. This allows to have
> the same result with date (based on Time Base Register) and
> hwclock (based on "get-time-of-day" RTAS call).
> 
> Moreover in TCG mode, the Time Base is always paused, so this
> patch also adjust the behavior between TCG and KVM.
> 
> VM state field "time_of_the_day_ns" is now useless but we keep
> it to be able to migrate to older version of the machine.
> 
> As vmstate_ppc_timebase structure (with timebase_pre_save() and
> timebase_post_load() functions) was only used by vmstate_spapr,
> we register the VM state change handler only in ppc_spapr_init().
> 
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>

Just a small heads-up: I've been debugging an OpenQA regression lately
where our automated testing regressed with QEMU 2.9. With stock 2.9.1, I
get a failure rate of "weird" effects (probably TB divergence between
vcpus) of ~30%. With this patch reverted it's back to 0%.

I *think* something here causes the TB offset of multiple threads (I'm
running -smp 2,threads=2) to diverge.

I'll keep debugging things tomorrow, but I'll be happy to see anyone
else beat me to analyze what is going wrong ;).


Alex
Laurent Vivier Dec. 13, 2017, 7:29 p.m. UTC | #7
On 13/12/2017 20:19, Alexander Graf wrote:
> 
> 
> On 02.02.17 06:14, David Gibson wrote:
>> From: Laurent Vivier <lvivier@redhat.com>
>>
>> This is a port to ppc of the i386 commit:
>>     00f4d64 kvmclock: clock should count only if vm is running
>>
>> We remove timebase_post_load function, and use the VM state
>> change handler to save and restore the guest_timebase (on stop
>> and continue).
>>
>> We keep timebase_pre_save to reduce the clock difference on
>> migration like in:
>>     6053a86 kvmclock: reduce kvmclock difference on migration
>>
>> Time base offset has originally been introduced by commit
>>     98a8b52 spapr: Add support for time base offset migration
>>
>> So while VM is paused, the time is stopped. This allows to have
>> the same result with date (based on Time Base Register) and
>> hwclock (based on "get-time-of-day" RTAS call).
>>
>> Moreover in TCG mode, the Time Base is always paused, so this
>> patch also adjust the behavior between TCG and KVM.
>>
>> VM state field "time_of_the_day_ns" is now useless but we keep
>> it to be able to migrate to older version of the machine.
>>
>> As vmstate_ppc_timebase structure (with timebase_pre_save() and
>> timebase_post_load() functions) was only used by vmstate_spapr,
>> we register the VM state change handler only in ppc_spapr_init().
>>
>> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> 
> Just a small heads-up: I've been debugging an OpenQA regression lately
> where our automated testing regressed with QEMU 2.9. With stock 2.9.1, I
> get a failure rate of "weird" effects (probably TB divergence between
> vcpus) of ~30%. With this patch reverted it's back to 0%.
> 
> I *think* something here causes the TB offset of multiple threads (I'm
> running -smp 2,threads=2) to diverge.
> 
> I'll keep debugging things tomorrow, but I'll be happy to see anyone
> else beat me to analyze what is going wrong ;).

Don't know if it can be related, but for migration we need:

http://patchwork.ozlabs.org/patch/840170/

diff --git a/target/ppc/translate_init.c b/target/ppc/translate_init.c
index b9c49c22f2..4e11e6f489 100644
--- a/target/ppc/translate_init.c
+++ b/target/ppc/translate_init.c
@@ -8081,10 +8081,10 @@  static void gen_spr_power8_ebb(CPUPPCState *env)
 /* Virtual Time Base */
 static void gen_spr_vtb(CPUPPCState *env)
 {
-    spr_register(env, SPR_VTB, "VTB",
+    spr_register_kvm(env, SPR_VTB, "VTB",
                  SPR_NOACCESS, SPR_NOACCESS,
                  &spr_read_tbl, SPR_NOACCESS,
-                 0x00000000);
+                 KVM_REG_PPC_VTB, 0x00000000);
 }

 static void gen_spr_power8_fscr(CPUPPCState *env)

Thanks,
Laurent
Alexander Graf Dec. 13, 2017, 7:33 p.m. UTC | #8
On 13.12.17 20:29, Laurent Vivier wrote:
> On 13/12/2017 20:19, Alexander Graf wrote:
>>
>>
>> On 02.02.17 06:14, David Gibson wrote:
>>> From: Laurent Vivier <lvivier@redhat.com>
>>>
>>> This is a port to ppc of the i386 commit:
>>>     00f4d64 kvmclock: clock should count only if vm is running
>>>
>>> We remove timebase_post_load function, and use the VM state
>>> change handler to save and restore the guest_timebase (on stop
>>> and continue).
>>>
>>> We keep timebase_pre_save to reduce the clock difference on
>>> migration like in:
>>>     6053a86 kvmclock: reduce kvmclock difference on migration
>>>
>>> Time base offset has originally been introduced by commit
>>>     98a8b52 spapr: Add support for time base offset migration
>>>
>>> So while VM is paused, the time is stopped. This allows to have
>>> the same result with date (based on Time Base Register) and
>>> hwclock (based on "get-time-of-day" RTAS call).
>>>
>>> Moreover in TCG mode, the Time Base is always paused, so this
>>> patch also adjust the behavior between TCG and KVM.
>>>
>>> VM state field "time_of_the_day_ns" is now useless but we keep
>>> it to be able to migrate to older version of the machine.
>>>
>>> As vmstate_ppc_timebase structure (with timebase_pre_save() and
>>> timebase_post_load() functions) was only used by vmstate_spapr,
>>> we register the VM state change handler only in ppc_spapr_init().
>>>
>>> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
>>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>>
>> Just a small heads-up: I've been debugging an OpenQA regression lately
>> where our automated testing regressed with QEMU 2.9. With stock 2.9.1, I
>> get a failure rate of "weird" effects (probably TB divergence between
>> vcpus) of ~30%. With this patch reverted it's back to 0%.
>>
>> I *think* something here causes the TB offset of multiple threads (I'm
>> running -smp 2,threads=2) to diverge.
>>
>> I'll keep debugging things tomorrow, but I'll be happy to see anyone
>> else beat me to analyze what is going wrong ;).
> 
> Don't know if it can be related, but for migration we need:

I doubt that fixes it, but I'll give it a try and will let you know the
results :).


Alex
Alexander Graf Dec. 13, 2017, 7:59 p.m. UTC | #9
On 13.12.17 20:29, Laurent Vivier wrote:
> On 13/12/2017 20:19, Alexander Graf wrote:
>>
>>
>> On 02.02.17 06:14, David Gibson wrote:
>>> From: Laurent Vivier <lvivier@redhat.com>
>>>
>>> This is a port to ppc of the i386 commit:
>>>     00f4d64 kvmclock: clock should count only if vm is running
>>>
>>> We remove timebase_post_load function, and use the VM state
>>> change handler to save and restore the guest_timebase (on stop
>>> and continue).
>>>
>>> We keep timebase_pre_save to reduce the clock difference on
>>> migration like in:
>>>     6053a86 kvmclock: reduce kvmclock difference on migration
>>>
>>> Time base offset has originally been introduced by commit
>>>     98a8b52 spapr: Add support for time base offset migration
>>>
>>> So while VM is paused, the time is stopped. This allows to have
>>> the same result with date (based on Time Base Register) and
>>> hwclock (based on "get-time-of-day" RTAS call).
>>>
>>> Moreover in TCG mode, the Time Base is always paused, so this
>>> patch also adjust the behavior between TCG and KVM.
>>>
>>> VM state field "time_of_the_day_ns" is now useless but we keep
>>> it to be able to migrate to older version of the machine.
>>>
>>> As vmstate_ppc_timebase structure (with timebase_pre_save() and
>>> timebase_post_load() functions) was only used by vmstate_spapr,
>>> we register the VM state change handler only in ppc_spapr_init().
>>>
>>> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
>>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>>
>> Just a small heads-up: I've been debugging an OpenQA regression lately
>> where our automated testing regressed with QEMU 2.9. With stock 2.9.1, I
>> get a failure rate of "weird" effects (probably TB divergence between
>> vcpus) of ~30%. With this patch reverted it's back to 0%.
>>
>> I *think* something here causes the TB offset of multiple threads (I'm
>> running -smp 2,threads=2) to diverge.
>>
>> I'll keep debugging things tomorrow, but I'll be happy to see anyone
>> else beat me to analyze what is going wrong ;).
> 
> Don't know if it can be related, but for migration we need:


As expected, this did not fix it. I'll keep digging.

My hunch is that we now set VTB on different cores at different times,
introducing tiny VTB offsets which can lead to negative TB differences
inside the guest.


Alex
Laurent Vivier Dec. 14, 2017, 7:33 a.m. UTC | #10
On 13/12/2017 20:59, Alexander Graf wrote:
> 
> 
> On 13.12.17 20:29, Laurent Vivier wrote:
>> On 13/12/2017 20:19, Alexander Graf wrote:
>>>
>>>
>>> On 02.02.17 06:14, David Gibson wrote:
>>>> From: Laurent Vivier <lvivier@redhat.com>
>>>>
>>>> This is a port to ppc of the i386 commit:
>>>>     00f4d64 kvmclock: clock should count only if vm is running
>>>>
>>>> We remove timebase_post_load function, and use the VM state
>>>> change handler to save and restore the guest_timebase (on stop
>>>> and continue).
>>>>
>>>> We keep timebase_pre_save to reduce the clock difference on
>>>> migration like in:
>>>>     6053a86 kvmclock: reduce kvmclock difference on migration
>>>>
>>>> Time base offset has originally been introduced by commit
>>>>     98a8b52 spapr: Add support for time base offset migration
>>>>
>>>> So while VM is paused, the time is stopped. This allows to have
>>>> the same result with date (based on Time Base Register) and
>>>> hwclock (based on "get-time-of-day" RTAS call).
>>>>
>>>> Moreover in TCG mode, the Time Base is always paused, so this
>>>> patch also adjust the behavior between TCG and KVM.
>>>>
>>>> VM state field "time_of_the_day_ns" is now useless but we keep
>>>> it to be able to migrate to older version of the machine.
>>>>
>>>> As vmstate_ppc_timebase structure (with timebase_pre_save() and
>>>> timebase_post_load() functions) was only used by vmstate_spapr,
>>>> we register the VM state change handler only in ppc_spapr_init().
>>>>
>>>> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
>>>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>>>
>>> Just a small heads-up: I've been debugging an OpenQA regression lately
>>> where our automated testing regressed with QEMU 2.9. With stock 2.9.1, I
>>> get a failure rate of "weird" effects (probably TB divergence between
>>> vcpus) of ~30%. With this patch reverted it's back to 0%.
>>>
>>> I *think* something here causes the TB offset of multiple threads (I'm
>>> running -smp 2,threads=2) to diverge.
>>>
>>> I'll keep debugging things tomorrow, but I'll be happy to see anyone
>>> else beat me to analyze what is going wrong ;).
>>
>> Don't know if it can be related, but for migration we need:
> 
> 
> As expected, this did not fix it. I'll keep digging.
> 
> My hunch is that we now set VTB on different cores at different times,
> introducing tiny VTB offsets which can lead to negative TB differences
> inside the guest.
> 
> 
> Alex
> 

I agree.
I'm wondering if something like that can fix it:

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index 7ec35de5ae..48737cbe04 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -884,7 +884,6 @@ static void timebase_load(PPCTimebase *tb)
 {
     CPUState *cpu;
     PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
-    int64_t tb_off_adj, tb_off;
     unsigned long freq;

     if (!first_ppc_cpu->env.tb_env) {
@@ -894,16 +893,10 @@ static void timebase_load(PPCTimebase *tb)

     freq = first_ppc_cpu->env.tb_env->tb_freq;

-    tb_off_adj = tb->guest_timebase - cpu_get_host_ticks();
-
-    tb_off = first_ppc_cpu->env.tb_env->tb_offset;
-    trace_ppc_tb_adjust(tb_off, tb_off_adj, tb_off_adj - tb_off,
-                        (tb_off_adj - tb_off) / freq);
-
     /* Set new offset to all CPUs */
     CPU_FOREACH(cpu) {
         PowerPCCPU *pcpu = POWERPC_CPU(cpu);
-        pcpu->env.tb_env->tb_offset = tb_off_adj;
+        pcpu->env.tb_env->tb_offset = tb->guest_timebase -
cpu_get_host_ticks();
 #if defined(CONFIG_KVM)
         kvm_set_one_reg(cpu, KVM_REG_PPC_TB_OFFSET,
                         &pcpu->env.tb_env->tb_offset);
Laurent Vivier Dec. 18, 2017, 10:17 a.m. UTC | #11
On 13/12/2017 20:59, Alexander Graf wrote:
> 
> 
> On 13.12.17 20:29, Laurent Vivier wrote:
>> On 13/12/2017 20:19, Alexander Graf wrote:
>>>
>>>
>>> On 02.02.17 06:14, David Gibson wrote:
>>>> From: Laurent Vivier <lvivier@redhat.com>
>>>>
>>>> This is a port to ppc of the i386 commit:
>>>>     00f4d64 kvmclock: clock should count only if vm is running
>>>>
>>>> We remove timebase_post_load function, and use the VM state
>>>> change handler to save and restore the guest_timebase (on stop
>>>> and continue).
>>>>
>>>> We keep timebase_pre_save to reduce the clock difference on
>>>> migration like in:
>>>>     6053a86 kvmclock: reduce kvmclock difference on migration
>>>>
>>>> Time base offset has originally been introduced by commit
>>>>     98a8b52 spapr: Add support for time base offset migration
>>>>
>>>> So while VM is paused, the time is stopped. This allows to have
>>>> the same result with date (based on Time Base Register) and
>>>> hwclock (based on "get-time-of-day" RTAS call).
>>>>
>>>> Moreover in TCG mode, the Time Base is always paused, so this
>>>> patch also adjust the behavior between TCG and KVM.
>>>>
>>>> VM state field "time_of_the_day_ns" is now useless but we keep
>>>> it to be able to migrate to older version of the machine.
>>>>
>>>> As vmstate_ppc_timebase structure (with timebase_pre_save() and
>>>> timebase_post_load() functions) was only used by vmstate_spapr,
>>>> we register the VM state change handler only in ppc_spapr_init().
>>>>
>>>> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
>>>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>>>
>>> Just a small heads-up: I've been debugging an OpenQA regression lately
>>> where our automated testing regressed with QEMU 2.9. With stock 2.9.1, I
>>> get a failure rate of "weird" effects (probably TB divergence between
>>> vcpus) of ~30%. With this patch reverted it's back to 0%.
>>>
>>> I *think* something here causes the TB offset of multiple threads (I'm
>>> running -smp 2,threads=2) to diverge.
>>>
>>> I'll keep debugging things tomorrow, but I'll be happy to see anyone
>>> else beat me to analyze what is going wrong ;).
>>
>> Don't know if it can be related, but for migration we need:
> 
> 
> As expected, this did not fix it. I'll keep digging.
> 
> My hunch is that we now set VTB on different cores at different times,
> introducing tiny VTB offsets which can lead to negative TB differences
> inside the guest.

Did you find where is the problem?

Can I help?

Thanks,
Laurent
diff mbox

Patch

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index f9a4b51..d171e60 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -847,9 +847,8 @@  static void cpu_ppc_set_tb_clk (void *opaque, uint32_t freq)
     cpu_ppc_store_purr(cpu, 0x0000000000000000ULL);
 }
 
-static void timebase_pre_save(void *opaque)
+static void timebase_save(PPCTimebase *tb)
 {
-    PPCTimebase *tb = opaque;
     uint64_t ticks = cpu_get_host_ticks();
     PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
 
@@ -858,43 +857,30 @@  static void timebase_pre_save(void *opaque)
         return;
     }
 
+    /* not used anymore, we keep it for compatibility */
     tb->time_of_the_day_ns = qemu_clock_get_ns(QEMU_CLOCK_HOST);
     /*
-     * tb_offset is only expected to be changed by migration so
+     * tb_offset is only expected to be changed by QEMU so
      * there is no need to update it from KVM here
      */
     tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
 }
 
-static int timebase_post_load(void *opaque, int version_id)
+static void timebase_load(PPCTimebase *tb)
 {
-    PPCTimebase *tb_remote = opaque;
     CPUState *cpu;
     PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
-    int64_t tb_off_adj, tb_off, ns_diff;
-    int64_t migration_duration_ns, migration_duration_tb, guest_tb, host_ns;
+    int64_t tb_off_adj, tb_off;
     unsigned long freq;
 
     if (!first_ppc_cpu->env.tb_env) {
         error_report("No timebase object");
-        return -1;
+        return;
     }
 
     freq = first_ppc_cpu->env.tb_env->tb_freq;
-    /*
-     * Calculate timebase on the destination side of migration.
-     * The destination timebase must be not less than the source timebase.
-     * We try to adjust timebase by downtime if host clocks are not
-     * too much out of sync (1 second for now).
-     */
-    host_ns = qemu_clock_get_ns(QEMU_CLOCK_HOST);
-    ns_diff = MAX(0, host_ns - tb_remote->time_of_the_day_ns);
-    migration_duration_ns = MIN(NANOSECONDS_PER_SECOND, ns_diff);
-    migration_duration_tb = muldiv64(freq, migration_duration_ns,
-                                     NANOSECONDS_PER_SECOND);
-    guest_tb = tb_remote->guest_timebase + MIN(0, migration_duration_tb);
 
-    tb_off_adj = guest_tb - cpu_get_host_ticks();
+    tb_off_adj = tb->guest_timebase - cpu_get_host_ticks();
 
     tb_off = first_ppc_cpu->env.tb_env->tb_offset;
     trace_ppc_tb_adjust(tb_off, tb_off_adj, tb_off_adj - tb_off,
@@ -904,9 +890,44 @@  static int timebase_post_load(void *opaque, int version_id)
     CPU_FOREACH(cpu) {
         PowerPCCPU *pcpu = POWERPC_CPU(cpu);
         pcpu->env.tb_env->tb_offset = tb_off_adj;
+#if defined(CONFIG_KVM)
+        kvm_set_one_reg(cpu, KVM_REG_PPC_TB_OFFSET,
+                        &pcpu->env.tb_env->tb_offset);
+#endif
     }
+}
 
-    return 0;
+void cpu_ppc_clock_vm_state_change(void *opaque, int running,
+                                   RunState state)
+{
+    PPCTimebase *tb = opaque;
+
+    if (running) {
+        timebase_load(tb);
+    } else {
+        timebase_save(tb);
+    }
+}
+
+/*
+ * When migrating, read the clock just before migration,
+ * so that the guest clock counts during the events
+ * between:
+ *
+ *  * vm_stop()
+ *  *
+ *  * pre_save()
+ *
+ *  This reduces clock difference on migration from 5s
+ *  to 0.1s (when max_downtime == 5s), because sending the
+ *  final pages of memory (which happens between vm_stop()
+ *  and pre_save()) takes max_downtime.
+ */
+static void timebase_pre_save(void *opaque)
+{
+    PPCTimebase *tb = opaque;
+
+    timebase_save(tb);
 }
 
 const VMStateDescription vmstate_ppc_timebase = {
@@ -915,7 +936,6 @@  const VMStateDescription vmstate_ppc_timebase = {
     .minimum_version_id = 1,
     .minimum_version_id_old = 1,
     .pre_save = timebase_pre_save,
-    .post_load = timebase_post_load,
     .fields      = (VMStateField []) {
         VMSTATE_UINT64(guest_timebase, PPCTimebase),
         VMSTATE_INT64(time_of_the_day_ns, PPCTimebase),
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index b71cd7a..9fc3fb9 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2129,6 +2129,12 @@  static void ppc_spapr_init(MachineState *machine)
     qemu_register_reset(spapr_ccs_reset_hook, spapr);
 
     qemu_register_boot_set(spapr_boot_set, spapr);
+
+    /* to stop and start vmclock */
+    if (kvm_enabled()) {
+        qemu_add_vm_change_state_handler(cpu_ppc_clock_vm_state_change,
+                                         &spapr->tb);
+    }
 }
 
 static int spapr_kvm_type(const char *vm_type)
diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h
index d46c31a..b7977ba 100644
--- a/target/ppc/cpu-qom.h
+++ b/target/ppc/cpu-qom.h
@@ -214,6 +214,9 @@  extern const struct VMStateDescription vmstate_ppc_timebase;
     .flags      = VMS_STRUCT,                                         \
     .offset     = vmstate_offset_value(_state, _field, PPCTimebase),  \
 }
+
+void cpu_ppc_clock_vm_state_change(void *opaque, int running,
+                                   RunState state);
 #endif
 
 #endif